Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

2785 lines
98 KiB

  1. /**************************************************************************
  2. *
  3. * Copyright (c) 2000 Microsoft Corporation
  4. *
  5. * Module Name & Abstract
  6. *
  7. * Stretch. This module contains the code to do various stretching
  8. * by applying a kernel filter. The code correctly handles minification.
  9. *
  10. * Note:
  11. * This module is not compiled into an .obj file, rather it is included
  12. * directly into the header file stretch.hpp.
  13. * This is due to the use of template functions.
  14. *
  15. *
  16. * Notes:
  17. *
  18. * This code does not handle rotation or shear.
  19. *
  20. * Created:
  21. *
  22. * 04/17/2000 asecchia
  23. * Created it.
  24. *
  25. **************************************************************************/
  26. #define LAST_K_UNUSED ((INT)0x7fffffff)
  27. const INT BicubicKernelShift = 7;
  28. const INT BicubicKernelSize = 1 << BicubicKernelShift;
  29. const FIX16 BicubicKernel[BicubicKernelSize+1] =
  30. {
  31. 65536, 65496, 65379, 65186, 64920, 64583, 64177, 63705,
  32. 63168, 62569, 61911, 61195, 60424, 59600, 58725, 57802,
  33. 56832, 55818, 54763, 53668, 52536, 51369, 50169, 48939,
  34. 47680, 46395, 45087, 43757, 42408, 41042, 39661, 38268,
  35. 36864, 35452, 34035, 32614, 31192, 29771, 28353, 26941,
  36. 25536, 24141, 22759, 21391, 20040, 18708, 17397, 16110,
  37. 14848, 13614, 12411, 11240, 10104, 9005, 7945, 6927,
  38. 5952, 5023, 4143, 3313, 2536, 1814, 1149, 544,
  39. 0, -496, -961, -1395, -1800, -2176, -2523, -2843,
  40. -3136, -3403, -3645, -3862, -4056, -4227, -4375, -4502,
  41. -4608, -4694, -4761, -4809, -4840, -4854, -4851, -4833,
  42. -4800, -4753, -4693, -4620, -4536, -4441, -4335, -4220,
  43. -4096, -3964, -3825, -3679, -3528, -3372, -3211, -3047,
  44. -2880, -2711, -2541, -2370, -2200, -2031, -1863, -1698,
  45. -1536, -1378, -1225, -1077, -936, -802, -675, -557,
  46. -448, -349, -261, -184, -120, -69, -31, -8,
  47. 0
  48. };
  49. const FIX16 SymmetricBicubicKernel[BicubicKernelSize * 2 + 1] =
  50. {
  51. 0,
  52. -8, -31, -69, -120, -184, -261,-349, -448,
  53. -557, -675, -802, -936, -1077, -1225, -1378, -1536,
  54. -1698, -1863, -2031, -2200, -2370, -2541, -2711, -2880,
  55. -3047, -3211, -3372, -3528, -3679, -3825, -3964, -4096,
  56. -4220, -4335, -4441, -4536, -4620, -4693, -4753, -4800,
  57. -4833, -4851, -4854, -4840, -4809, -4761, -4694, -4608,
  58. -4502, -4375, -4227, -4056, -3862, -3645, -3403, -3136,
  59. -2843, -2523, -2176, -1800, -1395, -961, -496,
  60. 0,
  61. 544, 1149, 1814, 2536, 3313, 4143, 5023, 5952,
  62. 6927, 7945,9005, 10104, 11240, 12411, 13614, 14848,
  63. 16110, 17397, 18708, 20040, 21391, 22759, 24141, 25536,
  64. 26941, 28353, 29771, 31192, 32614, 34035, 35452, 36864,
  65. 38268, 39661, 41042, 42408, 43757, 45087, 46395, 47680,
  66. 48939, 50169, 51369, 52536, 53668, 54763, 55818, 56832,
  67. 57802, 58725, 59600, 60424, 61195, 61911, 62569, 63168,
  68. 63705, 64177, 64583, 64920, 65186, 65379, 65496,
  69. 65536,
  70. 65496, 65379, 65186, 64920, 64583, 64177, 63705,
  71. 63168, 62569, 61911, 61195, 60424, 59600, 58725, 57802,
  72. 56832, 55818, 54763, 53668, 52536, 51369, 50169, 48939,
  73. 47680, 46395, 45087, 43757, 42408, 41042, 39661, 38268,
  74. 36864, 35452, 34035, 32614, 31192, 29771, 28353, 26941,
  75. 25536, 24141, 22759, 21391, 20040, 18708, 17397, 16110,
  76. 14848, 13614, 12411, 11240, 10104, 9005, 7945, 6927,
  77. 5952, 5023, 4143, 3313, 2536, 1814, 1149, 544,
  78. 0,
  79. -496, -961, -1395, -1800, -2176, -2523, -2843,
  80. -3136, -3403, -3645, -3862, -4056, -4227, -4375, -4502,
  81. -4608, -4694, -4761, -4809, -4840, -4854, -4851, -4833,
  82. -4800, -4753, -4693, -4620, -4536, -4441, -4335, -4220,
  83. -4096, -3964, -3825, -3679, -3528, -3372, -3211, -3047,
  84. -2880, -2711, -2541, -2370, -2200, -2031, -1863, -1698,
  85. -1536, -1378, -1225, -1077, -936, -802, -675, -557,
  86. -448, -349, -261, -184, -120, -69, -31, -8,
  87. 0
  88. };
  89. /*
  90. // Higher precision bicubic kernel - more data.
  91. // Commented out in case we eventually need it.
  92. const FIX16 BK[512+1] =
  93. {
  94. 0,
  95. -2, -8, -18, -31, -48, -69, -93, -120,
  96. -151, -184, -221, -261, -304, -349, -397, -448,
  97. -501, -557, -615, -675, -737, -802, -868, -936,
  98. -1006, -1077, -1150, -1225, -1301, -1378, -1457, -1536,
  99. -1616, -1698, -1780, -1863, -1947, -2031, -2115, -2200,
  100. -2285, -2370, -2456, -2541, -2626, -2711, -2796, -2880,
  101. -2964, -3047, -3129, -3211, -3292, -3372, -3450, -3528,
  102. -3604, -3679, -3753, -3825, -3895, -3964, -4031, -4096,
  103. -4159, -4220, -4279, -4335, -4389, -4441, -4490, -4536,
  104. -4580, -4620, -4658, -4693, -4725, -4753, -4778, -4800,
  105. -4818, -4833, -4844, -4851, -4854, -4854, -4849, -4840,
  106. -4827, -4809, -4787, -4761, -4730, -4694, -4654, -4608,
  107. -4557, -4502, -4441, -4375, -4304, -4227, -4144, -4056,
  108. -3962, -3862, -3757, -3645, -3527, -3403, -3273, -3136,
  109. -2993, -2843, -2686, -2523, -2353, -2176, -1991, -1800,
  110. -1601, -1395, -1182, -961, -732, -496, -252,
  111. 0,
  112. 264, 544, 839, 1149, 1474, 1814, 2168, 2536,
  113. 2918, 3313, 3722, 4143, 4577, 5023, 5482, 5952,
  114. 6434, 6927, 7430, 7945, 8470, 9005, 9550, 10104,
  115. 10668, 11240, 11821, 12411, 13009, 13614, 14228, 14848,
  116. 15475, 16110, 16750, 17397, 18050, 18708, 19371, 20040,
  117. 20713, 21391, 22073, 22759, 23449, 24141, 24837, 25536,
  118. 26237, 26941, 27646, 28353, 29061, 29771, 30481, 31192,
  119. 31903, 32614, 33325, 34035, 34744, 35452, 36159, 36864,
  120. 37567, 38268, 38966, 39661, 40353, 41042, 41727, 42408,
  121. 43085, 43757, 44425, 45087, 45744, 46395, 47041, 47680,
  122. 48313, 48939, 49557, 50169, 50773, 51369, 51957, 52536,
  123. 53107, 53668, 54220, 54763, 55296, 55818, 56331, 56832,
  124. 57322, 57802, 58269, 58725, 59169, 59600, 60018, 60424,
  125. 60816, 61195, 61560, 61911, 62248, 62569, 62876, 63168,
  126. 63444, 63705, 63949, 64177, 64388, 64583, 64760, 64920,
  127. 65062, 65186, 65292, 65379, 65447, 65496, 65526,
  128. 65536,
  129. 65526, 65496, 65447, 65379, 65292, 65186, 65062, 64920,
  130. 64760, 64583, 64388, 64177, 63949, 63705, 63444, 63168,
  131. 62876, 62569, 62248, 61911, 61560, 61195, 60816, 60424,
  132. 60018, 59600, 59169, 58725, 58269, 57802, 57322, 56832,
  133. 56331, 55818, 55296, 54763, 54220, 53668, 53107, 52536,
  134. 51957, 51369, 50773, 50169, 49557, 48939, 48313, 47680,
  135. 47041, 46395, 45744, 45087, 44425, 43757, 43085, 42408,
  136. 41727, 41042, 40353, 39661, 38966, 38268, 37567, 36864,
  137. 36159, 35452, 34744, 34035, 33325, 32614, 31903, 31192,
  138. 30481, 29771, 29061, 28353, 27646, 26941, 26237, 25536,
  139. 24837, 24141, 23449, 22759, 22073, 21391, 20713, 20040,
  140. 19371, 18708, 18050, 17397, 16750, 16110, 15475, 14848,
  141. 14228, 13614, 13009, 12411, 11821, 11240, 10668, 10104,
  142. 9550, 9005, 8470, 7945, 7430, 6927, 6434, 5952,
  143. 5482, 5023, 4577, 4143, 3722, 3313, 2918, 2536,
  144. 2168, 1814, 1474, 1149, 839, 544, 264,
  145. 0,
  146. -252, -496, -732, -961, -1182, -1395, -1601, -1800,
  147. -1991, -2176, -2353, -2523, -2686, -2843, -2993, -3136,
  148. -3273, -3403, -3527, -3645, -3757, -3862, -3962, -4056,
  149. -4144, -4227, -4304, -4375, -4441, -4502, -4557, -4608,
  150. -4654, -4694, -4730, -4761, -4787, -4809, -4827, -4840,
  151. -4849, -4854, -4854, -4851, -4844, -4833, -4818, -4800,
  152. -4778, -4753, -4725, -4693, -4658, -4620, -4580, -4536,
  153. -4490, -4441, -4389, -4335, -4279, -4220, -4159, -4096,
  154. -4031, -3964, -3895, -3825, -3753, -3679, -3604, -3528,
  155. -3450, -3372, -3292, -3211, -3129, -3047, -2964, -2880,
  156. -2796, -2711, -2626, -2541, -2456, -2370, -2285, -2200,
  157. -2115, -2031, -1947, -1863, -1780, -1698, -1616, -1536,
  158. -1457, -1378, -1301, -1225, -1150, -1077, -1006, -936,
  159. -868, -802, -737, -675, -615, -557, -501, -448,
  160. -397, -349, -304, -261, -221, -184, -151, -120,
  161. -93, -69, -48, -31, -18, -8, -2,
  162. 0
  163. };
  164. // Bicubic kernel with the 'perceptual' coefficient tweaked
  165. // see Wolberg. Provides a slightly different experience.
  166. // Commented out in case we eventually need it.
  167. const FIX16 BK_V[512+1] =
  168. {
  169. 0,
  170. -4, -16, -35, -62, -96, -137, -185, -240,
  171. -301, -369, -442, -522, -607, -698, -795, -896,
  172. -1002, -1114, -1230, -1350, -1475, -1603, -1736, -1872,
  173. -2012, -2155, -2301, -2450, -2602, -2756, -2913, -3072,
  174. -3233, -3396, -3560, -3726, -3893, -4061, -4230, -4400,
  175. -4570, -4741, -4911, -5082, -5252, -5422, -5592, -5760,
  176. -5927, -6094, -6259, -6422, -6584, -6743, -6901, -7056,
  177. -7209, -7359, -7506, -7650, -7791, -7928, -8062, -8192,
  178. -8318, -8440, -8557, -8670, -8778, -8881, -8979, -9072,
  179. -9159, -9241, -9316, -9386, -9449, -9506, -9557, -9600,
  180. -9636, -9666, -9688, -9702, -9709, -9707, -9698, -9680,
  181. -9654, -9619, -9575, -9522, -9460, -9388, -9307, -9216,
  182. -9115, -9004, -8882, -8750, -8607, -8453, -8288, -8112,
  183. -7924, -7725, -7513, -7290, -7054, -6806, -6546, -6272,
  184. -5985, -5686, -5373, -5046, -4706, -4351, -3983, -3600,
  185. -3203, -2791, -2364, -1922, -1465, -992, -504,
  186. 0,
  187. 516, 1040, 1571, 2110, 2656, 3209, 3769, 4336,
  188. 4909, 5489, 6074, 6666, 7263, 7866, 8475, 9088,
  189. 9706, 10330, 10958, 11590, 12227, 12867, 13512, 14160,
  190. 14812, 15467, 16125, 16786, 17450, 18116, 18785, 19456,
  191. 20129, 20804, 21480, 22158, 22837, 23517, 24198, 24880,
  192. 25562, 26245, 26927, 27610, 28292, 28974, 29656, 30336,
  193. 31015, 31694, 32371, 33046, 33720, 34391, 35061, 35728,
  194. 36393, 37055, 37714, 38370, 39023, 39672, 40318, 40960,
  195. 41598, 42232, 42861, 43486, 44106, 44721, 45331, 45936,
  196. 46535, 47129, 47716, 48298, 48873, 49442, 50005, 50560,
  197. 51108, 51650, 52184, 52710, 53229, 53739, 54242, 54736,
  198. 55222, 55699, 56167, 56626, 57076, 57516, 57947, 58368,
  199. 58779, 59180, 59570, 59950, 60319, 60677, 61024, 61360,
  200. 61684, 61997, 62297, 62586, 62862, 63126, 63378, 63616,
  201. 63841, 64054, 64253, 64438, 64610, 64767, 64911, 65040,
  202. 65155, 65255, 65340, 65410, 65465, 65504, 65528,
  203. 65536,
  204. 65528, 65504, 65465, 65410, 65340, 65255, 65155, 65040,
  205. 64911, 64767, 64610, 64438, 64253, 64054, 63841, 63616,
  206. 63378, 63126, 62862, 62586, 62297, 61997, 61684, 61360,
  207. 61024, 60677, 60319, 59950, 59570, 59180, 58779, 58368,
  208. 57947, 57516, 57076, 56626, 56167, 55699, 55222, 54736,
  209. 54242, 53739, 53229, 52710, 52184, 51650, 51108, 50560,
  210. 50005, 49442, 48873, 48298, 47716, 47129, 46535, 45936,
  211. 45331, 44721, 44106, 43486, 42861, 42232, 41598, 40960,
  212. 40318, 39672, 39023, 38370, 37714, 37055, 36393, 35728,
  213. 35061, 34391, 33720, 33046, 32371, 31694, 31015, 30336,
  214. 29656, 28974, 28292, 27610, 26927, 26245, 25562, 24880,
  215. 24198, 23517, 22837, 22158, 21480, 20804, 20129, 19456,
  216. 18785, 18116, 17450, 16786, 16125, 15467, 14812, 14160,
  217. 13512, 12867, 12227, 11590, 10958, 10330, 9706, 9088,
  218. 8475, 7866, 7263, 6666, 6074, 5489, 4909, 4336,
  219. 3769, 3209, 2656, 2110, 1571, 1040, 516,
  220. 0,
  221. -504, -992, -1465, -1922, -2364, -2791, -3203, -3600,
  222. -3983, -4351, -4706, -5046, -5373, -5686, -5985, -6272,
  223. -6546, -6806, -7054, -7290, -7513, -7725, -7924, -8112,
  224. -8288, -8453, -8607, -8750, -8882, -9004, -9115, -9216,
  225. -9307, -9388, -9460, -9522, -9575, -9619, -9654, -9680,
  226. -9698, -9707, -9709, -9702, -9688, -9666, -9636, -9600,
  227. -9557, -9506, -9449, -9386, -9316, -9241, -9159, -9072,
  228. -8979, -8881, -8778, -8670, -8557, -8440, -8318, -8192,
  229. -8062, -7928, -7791, -7650, -7506, -7359, -7209, -7056,
  230. -6901, -6743, -6584, -6422, -6259, -6094, -5927, -5760,
  231. -5592, -5422, -5252, -5082, -4911, -4741, -4570, -4400,
  232. -4230, -4061, -3893, -3726, -3560, -3396, -3233, -3072,
  233. -2913, -2756, -2602, -2450, -2301, -2155, -2012, -1872,
  234. -1736, -1603, -1475, -1350, -1230, -1114, -1002, -896,
  235. -795, -698, -607, -522, -442, -369, -301,
  236. -240, -185, -137, -96, -62, -35, -16, -4,
  237. 0
  238. };
  239. */
  240. // This is the table of partial sums of the bilinear kernel.
  241. // Simply put, each point in the array represents the integral
  242. // from -infinity to position x in the kernel function.
  243. // We can subtract two table lookups to get the integral
  244. // of the kernel (area) between the two points.
  245. // The table is padded with zeros and ones at the beginning and end
  246. // so we can consistently address areas outside of the actual kernel
  247. // Currently we don't make use of the zeros at the beginning but
  248. // we definitely sample past the end by at least one half-width
  249. // of the kernel.
  250. const FIX16 BilinearPartialIntegral[512+1] =
  251. {
  252. 0, 0, 0, 0, 0, 0, 0, 0,
  253. 0, 0, 0, 0, 0, 0, 0, 0,
  254. 0, 0, 0, 0, 0, 0, 0, 0,
  255. 0, 0, 0, 0, 0, 0, 0, 0,
  256. 0, 0, 0, 0, 0, 0, 0, 0,
  257. 0, 0, 0, 0, 0, 0, 0, 0,
  258. 0, 0, 0, 0, 0, 0, 0, 0,
  259. 0, 0, 0, 0, 0, 0, 0, 0,
  260. 0, 0, 0, 0, 0, 0, 0, 0,
  261. 0, 0, 0, 0, 0, 0, 0, 0,
  262. 0, 0, 0, 0, 0, 0, 0, 0,
  263. 0, 0, 0, 0, 0, 0, 0, 0,
  264. 0, 0, 0, 0, 0, 0, 0, 0,
  265. 0, 0, 0, 0, 0, 0, 0, 0,
  266. 0, 0, 0, 0, 0, 0, 0, 0,
  267. 0, 0, 0, 0, 0, 0, 0, 0,
  268. 0,2, 8, 18, 32, 50, 72, 98,
  269. 128, 162, 200, 242, 288, 338, 392, 450,
  270. 512, 578, 648, 722, 800, 882, 968, 1058,
  271. 1152, 1250, 1352, 1458, 1568, 1682, 1800, 1922,
  272. 2048, 2178, 2312, 2450, 2592, 2738, 2888, 3042,
  273. 3200, 3362, 3528, 3698, 3872, 4050, 4232, 4418,
  274. 4608, 4802, 5000, 5202, 5408, 5618, 5832, 6050,
  275. 6272, 6498, 6728, 6962, 7200, 7442, 7688, 7938,
  276. 8192, 8450, 8712, 8978, 9248, 9522, 9800, 10082,
  277. 10368, 10658, 10952, 11250, 11552, 11858, 12168, 12482,
  278. 12800, 13122, 13448, 13778, 14112, 14450, 14792, 15138,
  279. 15488, 15842, 16200, 16562, 16928, 17298, 17672, 18050,
  280. 18432, 18818, 19208, 19602, 20000, 20402, 20808, 21218,
  281. 21632, 22050, 22472, 22898, 23328, 23762, 24200, 24642,
  282. 25088, 25538, 25992, 26450, 26912, 27378, 27848, 28322,
  283. 28800, 29282, 29768, 30258, 30752, 31250, 31752, 32258,
  284. 32768, // center of the kernel. Index 256
  285. 33278, 33784, 34286, 34784, 35278, 35768, 36254, 36736,
  286. 37214, 37688, 38158, 38624, 39086, 39544, 39998, 40448,
  287. 40894, 41336, 41774, 42208, 42638, 43064, 43486, 43904,
  288. 44318, 44728, 45134, 45536, 45934, 46328, 46718, 47104,
  289. 47486, 47864, 48238, 48608, 48974, 49336, 49694, 50048,
  290. 50398, 50744, 51086, 51424, 51758, 52088, 52414, 52736,
  291. 53054, 53368, 53678, 53984, 54286, 54584, 54878, 55168,
  292. 55454, 55736, 56014, 56288, 56558, 56824, 57086, 57344,
  293. 57598, 57848, 58094, 58336, 58574, 58808, 59038, 59264,
  294. 59486, 59704, 59918, 60128, 60334, 60536, 60734, 60928,
  295. 61118, 61304, 61486, 61664, 61838, 62008, 62174, 62336,
  296. 62494, 62648, 62798, 62944, 63086, 63224, 63358, 63488,
  297. 63614, 63736, 63854, 63968, 64078, 64184, 64286, 64384,
  298. 64478, 64568, 64654, 64736, 64814, 64888, 64958, 65024,
  299. 65086, 65144, 65198, 65248, 65294, 65336, 65374, 65408,
  300. 65438, 65464, 65486, 65504, 65518, 65528, 65534, 65536,
  301. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  302. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  303. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  304. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  305. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  306. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  307. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  308. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  309. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  310. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  311. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  312. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  313. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  314. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  315. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  316. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  317. };
  318. // This is the table of partial sums of the bicubic kernel.
  319. // Simply put, each point in the array represents the integral
  320. // from -infinity to position x in the kernel function.
  321. // We can subtract two table lookups to get the integral
  322. // of the kernel (area) between the two points.
  323. // The table is padded with zeros and ones at the beginning and end
  324. // so we can consistently address areas outside of the actual kernel
  325. // Currently we don't make use of the zeros at the beginning but
  326. // we definitely sample past the end by at least one half-width
  327. // of the kernel.
  328. const FIX16 BicubicPartialIntegral[1024+1] =
  329. {
  330. 0, 0, 0, 0, 0, 0, 0, 0,
  331. 0, 0, 0, 0, 0, 0, 0, 0,
  332. 0, 0, 0, 0, 0, 0, 0, 0,
  333. 0, 0, 0, 0, 0, 0, 0, 0,
  334. 0, 0, 0, 0, 0, 0, 0, 0,
  335. 0, 0, 0, 0, 0, 0, 0, 0,
  336. 0, 0, 0, 0, 0, 0, 0, 0,
  337. 0, 0, 0, 0, 0, 0, 0, 0,
  338. 0, 0, 0, 0, 0, 0, 0, 0,
  339. 0, 0, 0, 0, 0, 0, 0, 0,
  340. 0, 0, 0, 0, 0, 0, 0, 0,
  341. 0, 0, 0, 0, 0, 0, 0, 0,
  342. 0, 0, 0, 0, 0, 0, 0, 0,
  343. 0, 0, 0, 0, 0, 0, 0, 0,
  344. 0, 0, 0, 0, 0, 0, 0, 0,
  345. 0, 0, 0, 0, 0, 0, 0, 0,
  346. 0, 0, 0, 0, 0, 0, 0, 0,
  347. 0, 0, 0, 0, 0, 0, 0, 0,
  348. 0, 0, 0, 0, 0, 0, 0, 0,
  349. 0, 0, 0, 0, 0, 0, 0, 0,
  350. 0, 0, 0, 0, 0, 0, 0, 0,
  351. 0, 0, 0, 0, 0, 0, 0, 0,
  352. 0, 0, 0, 0, 0, 0, 0, 0,
  353. 0, 0, 0, 0, 0, 0, 0, 0,
  354. 0, 0, 0, 0, 0, 0, 0, 0,
  355. 0, 0, 0, 0, 0, 0, 0, 0,
  356. 0, 0, 0, 0, 0, 0, 0, 0,
  357. 0, 0, 0, 0, 0, 0, 0, 0,
  358. 0, 0, 0, 0, 0, 0, 0, 0,
  359. 0, 0, 0, 0, 0, 0, 0, 0,
  360. 0, 0, 0, 0, 0, 0, 0, 0,
  361. 0, 0, 0, 0, 0, 0, 0, 0,
  362. 0, 0, 0, 0, -1, -2, -3, -4,
  363. -6, -8, -11, -15, -19, -24, -29, -35,
  364. -42, -50, -59, -68, -79, -90, -103, -117,
  365. -131, -147, -164, -182, -201, -221, -243, -265,
  366. -289, -315, -341, -369, -398, -429, -460, -493,
  367. -528, -563, -600, -639, -679, -720, -762, -806,
  368. -851, -897, -945, -993, -1044, -1095, -1148, -1202,
  369. -1257, -1313, -1371, -1429, -1489, -1550, -1612, -1675,
  370. -1739, -1804, -1870, -1937, -2004, -2073, -2142, -2212,
  371. -2283, -2355, -2427, -2500, -2573, -2647, -2721, -2796,
  372. -2871, -2946, -3022, -3097, -3173, -3249, -3325, -3401,
  373. -3476, -3552, -3627, -3702, -3776, -3850, -3923, -3996,
  374. -4068, -4139, -4209, -4279, -4347, -4414, -4481, -4545,
  375. -4609, -4671, -4731, -4790, -4847, -4902, -4955, -5006,
  376. -5055, -5102, -5146, -5188, -5228, -5264, -5298, -5329,
  377. -5358, -5383, -5404, -5423, -5438, -5449, -5457, -5461,
  378. -5461, -5457, -5449, -5437, -5420, -5399, -5374, -5345,
  379. -5311, -5273, -5230, -5182, -5130, -5073, -5012, -4946,
  380. -4875, -4799, -4718, -4633, -4542, -4447, -4346, -4240,
  381. -4130, -4014, -3893, -3767, -3636, -3500, -3358, -3212,
  382. -3060, -2902, -2740, -2572, -2399, -2220, -2037, -1848,
  383. -1653, -1454, -1249, -1038, -822, -601, -375, -143,
  384. 94, 336, 584, 836, 1095, 1358, 1627, 1901,
  385. 2180, 2464, 2754, 3048, 3348, 3653, 3963, 4278,
  386. 4598, 4923, 5253, 5588, 5927, 6272, 6621, 6975,
  387. 7334, 7698, 8066, 8439, 8816, 9198, 9584, 9975,
  388. 10370, 10769, 11173, 11580, 11992, 12408, 12828, 13252,
  389. 13679, 14111, 14546, 14985, 15427, 15873, 16322, 16775,
  390. 17231, 17690, 18152, 18618, 19086, 19557, 20032, 20508,
  391. 20988, 21470, 21954, 22441, 22930, 23421, 23914, 24409,
  392. 24906, 25405, 25905, 26407, 26911, 27415, 27921, 28428,
  393. 28937, 29446, 29955, 30466, 30977, 31488, 32000, 32512,
  394. 33024, // center of the kernel. Index 512
  395. 33536, 34048, 34559, 35070, 35581, 36090, 36599, 37108,
  396. 37615, 38121, 38625, 39129, 39631, 40131, 40630, 41127,
  397. 41622, 42115, 42606, 43095, 43582, 44066, 44548, 45028,
  398. 45504, 45979, 46450, 46918, 47384, 47846, 48305, 48761,
  399. 49214, 49663, 50109, 50551, 50990, 51425, 51857, 52284,
  400. 52708, 53128, 53544, 53956, 54363, 54767, 55166, 55561,
  401. 55952, 56338, 56720, 57097, 57470, 57838, 58202, 58561,
  402. 58915, 59264, 59609, 59948, 60283, 60613, 60938, 61258,
  403. 61573, 61883, 62188, 62488, 62782, 63072, 63356, 63635,
  404. 63909, 64178, 64441, 64700, 64952, 65200, 65442, 65679,
  405. 65911, 66137, 66358, 66574, 66785, 66990, 67189, 67384,
  406. 67573, 67756, 67935, 68108, 68276, 68438, 68596, 68748,
  407. 68894, 69036, 69172, 69303, 69429, 69550, 69666, 69776,
  408. 69882, 69983, 70078, 70169, 70254, 70335, 70411, 70482,
  409. 70548, 70609, 70666, 70718, 70766, 70809, 70847, 70881,
  410. 70910, 70935, 70956, 70973, 70985, 70993, 70997, 70997,
  411. 70993, 70985, 70974, 70959, 70940, 70919, 70894, 70865,
  412. 70834, 70800, 70764, 70724, 70682, 70638, 70591, 70542,
  413. 70491, 70438, 70383, 70326, 70267, 70207, 70145, 70081,
  414. 70017, 69950, 69883, 69815, 69745, 69675, 69604, 69532,
  415. 69459, 69386, 69312, 69238, 69163, 69088, 69012, 68937,
  416. 68861, 68785, 68709, 68633, 68558, 68482, 68407, 68332,
  417. 68257, 68183, 68109, 68036, 67963, 67891, 67819, 67748,
  418. 67678, 67609, 67540, 67473, 67406, 67340, 67275, 67211,
  419. 67148, 67086, 67025, 66965, 66907, 66849, 66793, 66738,
  420. 66684, 66631, 66580, 66529, 66481, 66433, 66387, 66342,
  421. 66298, 66256, 66215, 66175, 66136, 66099, 66064, 66029,
  422. 65996, 65965, 65934, 65905, 65877, 65851, 65825, 65801,
  423. 65779, 65757, 65737, 65718, 65700, 65683, 65667, 65653,
  424. 65639, 65626, 65615, 65604, 65595, 65586, 65578, 65571,
  425. 65565, 65560, 65555, 65551, 65547, 65544, 65542, 65540,
  426. 65539, 65538, 65537, 65536, 65536, 65536, 65536, 65536,
  427. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  428. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  429. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  430. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  431. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  432. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  433. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  434. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  435. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  436. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  437. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  438. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  439. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  440. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  441. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  442. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  443. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  444. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  445. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  446. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  447. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  448. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  449. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  450. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  451. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  452. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  453. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  454. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  455. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  456. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  457. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  458. 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
  459. };
  460. // We use a biased pointer to the center of the array
  461. // so that we can look up the negative part of the kernel
  462. // without repositioning the index or using an absolute value
  463. // computation in the inner loop.
  464. // Linear Partial Integral Center.
  465. const FIX16 *LPIC = &BilinearPartialIntegral[256];
  466. // Cubic Partial Integral Center.
  467. const FIX16 *CPIC = &BicubicPartialIntegral[512];
  468. const FIX16 *SymmetricBicubicKernelCenter = &SymmetricBicubicKernel[128];
  469. const ULONGLONG FIX14_HALF_MMX = 0x0000200000002000;
  470. /**************************************************************************
  471. *
  472. * Function Description:
  473. *
  474. * Constructor for the DpOutputSpanStretch class.
  475. *
  476. * Return Value:
  477. *
  478. * NONE
  479. *
  480. * Created:
  481. *
  482. * 04/17/2000 asecchia
  483. * Created it.
  484. *
  485. **************************************************************************/
// Left-shift that promotes a FIX4 (x.4 fixed point) value to FIX16
// (16.16 fixed point): the difference between the two formats' shifts.
#define FIX4TOFIX16_SHIFT (FIX16_SHIFT - FIX4_SHIFT)
  487. template<FilterModeType FilterMode>
  488. void DpOutputSpanStretch<FilterMode>::InitializeClass(
  489. DpBitmap* bitmap,
  490. DpScanBuffer * scan,
  491. DpContext* /*context*/,
  492. DpImageAttributes imgAttributes,
  493. const GpRectF *dstRect,
  494. const GpRectF *srcRect
  495. )
  496. {
  497. isValid = true;
  498. // Make sure these get initialized up front before we can early out
  499. // otherwise we could end up freeing uninitialized pointers in our
  500. // destructor.
  501. ycoeff = NULL;
  502. xbuffer = NULL;
  503. Scan = scan;
  504. dBitmap = bitmap;
  505. QWrapMode = imgAttributes.wrapMode;
  506. ClampColor = imgAttributes.clampColor;
  507. ClampColorA = (BYTE)( (ClampColor >> 24) );
  508. ClampColorR = (BYTE)( (ClampColor >> 16) & 0xff);
  509. ClampColorG = (BYTE)( (ClampColor >> 8) & 0xff);
  510. ClampColorB = (BYTE)( ClampColor & 0xff);
  511. // Accleration for clamp mode with zero clamp color (transparent)
  512. WrapZeroClamp = FALSE;
  513. if((QWrapMode == WrapModeClamp) &&
  514. (imgAttributes.clampColor == 0))
  515. {
  516. WrapZeroClamp = TRUE;
  517. }
  518. ASSERT(dBitmap != NULL);
  519. ASSERT(dBitmap->IsValid());
  520. // on bad bitmap, we return with Valid = FALSE
  521. if (dBitmap == NULL ||
  522. !dBitmap->IsValid()
  523. )
  524. {
  525. dBitmap = NULL;
  526. isValid = false;
  527. return;
  528. } else {
  529. BmpData.Width = dBitmap->Width;
  530. BmpData.Height = dBitmap->Height;
  531. BmpData.PixelFormat = PIXFMT_32BPP_PARGB;
  532. BmpData.Stride = dBitmap->Delta;
  533. BmpData.Scan0 = dBitmap->Bits;
  534. }
  535. if(srcRect)
  536. SrcRect = *srcRect;
  537. else
  538. {
  539. SrcRect.X = 0.0f;
  540. SrcRect.Y = 0.0f;
  541. SrcRect.Width = (REAL)dBitmap->Width;
  542. SrcRect.Height = (REAL) dBitmap->Height;
  543. }
  544. // Set up the translation.
  545. if(dstRect)
  546. {
  547. DstRect = *dstRect;
  548. }
  549. else
  550. {
  551. DstRect.X = 0.0f;
  552. DstRect.Y = 0.0f;
  553. DstRect.Width = (REAL)SrcRect.Width;
  554. DstRect.Height = (REAL)SrcRect.Height;
  555. }
  556. if( !GpValidFixed16(SrcRect.X) ||
  557. !GpValidFixed16(SrcRect.Y) ||
  558. !GpValidFixed16(SrcRect.Width) ||
  559. !GpValidFixed16(SrcRect.Height) ||
  560. !GpValidFixed16(DstRect.X) ||
  561. !GpValidFixed16(DstRect.Y) ||
  562. !GpValidFixed16(DstRect.Width) ||
  563. !GpValidFixed16(DstRect.Height) )
  564. {
  565. // punt
  566. isValid = false;
  567. return;
  568. }
  569. // Initialize the state for the x-dimension scale.
  570. xscale = GpRealToFix16(SrcRect.Width/DstRect.Width);
  571. xscaleinv = GpRealToFix16(DstRect.Width/SrcRect.Width);
  572. // Initialize the state for the y-dimension scale.
  573. yscale = GpRealToFix16(SrcRect.Height/DstRect.Height);
  574. yscaleinv = GpRealToFix16(DstRect.Height/SrcRect.Height);
  575. // Compute the destination contribution.
  576. // Note: the actual pixels touched are the floor of
  577. // the top left to the ceiling of the bottom right.
  578. // (modulus the clipping)
  579. // Note: We want to be tracking our internal state in FIX16 so we have
  580. // the extra fractional precision, but when we compute our bounds for the
  581. // drawing, we use Ceiling and Floor on these FIX16 numbers below. We want
  582. // the rounding to match the rounding of the FIX4 numbers (i.e. we don't
  583. // want to track any extra fractional precision errors from the float
  584. // representation) because we use FIX4 in our DrawImage loop.
  585. // To accomplish this, we round to FIX4 dropping all error that is smaller
  586. // than the FIX4 precision and then upconvert to FIX16. Now when we use
  587. // Fix16Ceiling and Floor, we'll get the same results as Fix4Ceiling and
  588. // Floor.
  589. REAL xinv = DstRect.Width/SrcRect.Width;
  590. REAL yinv = DstRect.Height/SrcRect.Height;
  591. fixDLeft = GpRealToFix4(DstRect.X);
  592. fixDRight = GpRealToFix4(xinv * (SrcRect.Width) + DstRect.X);
  593. fixDTop = GpRealToFix4(DstRect.Y);
  594. fixDBottom = GpRealToFix4(yinv * (SrcRect.Height) + DstRect.Y);
  595. // Handle negative scale
  596. FIX16 fixTemp;
  597. if(fixDLeft > fixDRight)
  598. {
  599. // Swap the left and right x coordinates.
  600. fixTemp = fixDLeft;
  601. fixDLeft = fixDRight;
  602. fixDRight = fixTemp;
  603. }
  604. if(fixDTop > fixDBottom)
  605. {
  606. // Swap the top and bottom x coordinates.
  607. fixTemp = fixDTop;
  608. fixDTop = fixDBottom;
  609. fixDBottom = fixTemp;
  610. }
  611. // Compute the left edge using the rasterizer rounding rules. Used
  612. // for clipping in x.
  613. ixleft = GpFix4Ceiling(fixDLeft);
  614. // Convert up to FIX16.
  615. fixDLeft <<= FIX4TOFIX16_SHIFT;
  616. fixDRight <<= FIX4TOFIX16_SHIFT;
  617. fixDTop <<= FIX4TOFIX16_SHIFT;
  618. fixDBottom <<= FIX4TOFIX16_SHIFT;
  619. // Get the initial kernel center. This specifies the x-dimension
  620. // fractional pixel offset.
  621. if(xscale < 0)
  622. {
  623. xkci = GpRealToFix16(
  624. (((DstRect.X+DstRect.Width) - GpFix16Ceiling(fixDRight)) *
  625. (xscale)) / FIX16_ONE +
  626. SrcRect.X
  627. );
  628. }
  629. else
  630. {
  631. xkci = GpRealToFix16(
  632. ((DstRect.X - GpFix16Floor(fixDLeft)) *
  633. xscale) / FIX16_ONE +
  634. SrcRect.X
  635. );
  636. }
  637. // Get the width of the kernel.
  638. // Make sure to multiply by the actual width of the filter kernel in
  639. // normalized space (FilterWidth[i])
  640. xw = GpRealToFix16(
  641. (SrcRect.Width*FilterWidth[FilterMode]) /
  642. DstRect.Width
  643. ); // convert to FIX16
  644. // Handle the negative transform
  645. if(xscale < 0)
  646. {
  647. xw = -xw;
  648. }
  649. // the width of the kernel must be a positive quantity.
  650. ASSERT(xw >= 0);
  651. // if the width is less than one we're doing a stretch, not a shrink.
  652. // in this case we clamp the kernel size to one.
  653. if(xw < FIX16_ONE * FilterWidth[FilterMode])
  654. {
  655. xw = FIX16_ONE * FilterWidth[FilterMode];
  656. }
  657. // a is 1/w - used to work out the tent filter.
  658. xa = GpRealToFix16(65536.0f/xw);
  659. // Get the initial kernel center. This specifies the y-dimension
  660. // fractional pixel offset.
  661. if(yscale < 0)
  662. {
  663. ykci = GpRealToFix16(
  664. ((GpFix16Ceiling(fixDBottom) - (DstRect.Y+DstRect.Height)) *
  665. (-yscale)) / FIX16_ONE +
  666. SrcRect.Y
  667. );
  668. }
  669. else
  670. {
  671. ykci = GpRealToFix16(
  672. ((GpFix16Floor(fixDTop) - DstRect.Y) *
  673. yscale) / FIX16_ONE +
  674. SrcRect.Y
  675. );
  676. }
  677. // Get the width of the kernel.
  678. // Make sure to multiply by the actual width of the filter kernel in
  679. // normalized space (FilterWidth[i])
  680. yw = GpRealToFix16(
  681. (SrcRect.Height * FilterWidth[FilterMode]) /
  682. DstRect.Height
  683. ); // Convert to FIX16
  684. // Handle the negative transform
  685. if(yscale < 0)
  686. {
  687. yw = -yw;
  688. }
  689. // the width of the kernel must be a positive quantity.
  690. ASSERT(yw >= 0);
  691. // if the kernel width is less than one we're doing a stretch, not
  692. // a shrink. In this case we clamp the kernel size to one.
  693. if(yw < (FIX16_ONE * FilterWidth[FilterMode]))
  694. {
  695. yw = FIX16_ONE * FilterWidth[FilterMode];
  696. }
  697. // a is 1/w - used to work out the tent filter.
  698. ya = GpRealToFix16(65536.0f/yw);
  699. // !!! [asecchia] The rounding used here should match the rounding used to compute
  700. // the parameters to StretchBitsMainLoop.
  701. iytop = GpFix16Floor(fixDTop);
  702. // Compute the width of one scanline in the destination.
  703. xbuffer_width = GpFix16Ceiling(fixDRight) - GpFix16Floor(fixDLeft);
  704. ASSERT(xbuffer_width >= 0);
  705. xbuffer_height = GpFix16Ceiling(yw)*2+1;
  706. ASSERT(xbuffer_height >= 0);
  707. // set the rotational array to start at the first scanline.
  708. xbuffer_start_scanline = 0;
  709. // allocate the xbuffer.
  710. // !!! PERF [asecchia]. Ouch this is ugly.
  711. // we should at least try use a stack buffer for small images.
  712. // Maybe a lookaside list or something.
  713. xbuffer = (ARGB *)GpMalloc(xbuffer_height*xbuffer_width*sizeof(ARGB));
  714. // ycoeff needs to have 2 entries more than xbuffer_height because
  715. // it may be reused to store the MMX coefficients (see OutputSpan
  716. // routine for details).
  717. ycoeff = (FIX16 *)GpMalloc((xbuffer_height + 2) * sizeof(FIX16));
  718. if((NULL == ycoeff) || (NULL == xbuffer))
  719. {
  720. isValid = false;
  721. GpFree(xbuffer);
  722. GpFree(ycoeff);
  723. // Make sure these get initialized to NULL before we can early out
  724. // otherwise we could end up double freeing the pointers in our
  725. // destructor.
  726. xbuffer = NULL;
  727. ycoeff = NULL;
  728. return;
  729. }
  730. // set the initial value of last_k to maxint
  731. last_k = LAST_K_UNUSED;
  732. }
  733. /**************************************************************************\
  734. *
  735. * Function Description:
  736. *
  737. * This function performs a 1d stretch using the tent filter
  738. *
  739. * Arguments:
  740. *
  741. * dst - destination buffer
  742. * src - source pixels
  743. * dw - destination width in pixels
  744. * sw - source width in pixels
  745. * kci - the initial kernel centering position (for fractional translate)
  746. * scale - the scale of the filter - sw/dw
  747. * w - the width of the filter kernel - typically the ceiling of sw/dw
  748. * a - 1/w
  749. *
  750. * History:
  751. * 04/16/2000 asecchia created it.
  752. *
  753. \**************************************************************************/
  754. // !!! Perf [asecchia] For really complicated wrapmodes where many of the
  755. // pixels are outside of the source and hence need to be wrapped, it may
  756. // make more sense to copy the source into an extended buffer and pre-wrap
  757. // the end points (i.e. overallocate) for each scanline.
  758. // This could simplify the code for the complex wrap conditions.
  759. // However, for the simple codepath, this would give an extra copy per
  760. // pixel and might not be worth it.
// Ick. Why does the compiler do a better job of optimizing macros?
// These should really be inline function calls.

// ClampColors: round the FIX16 color accumulators (ta, tr, tg, tb) to
// integer channel values and clamp them into valid byte range.
//
// HighQualityBilinear: the tent kernel response is non-negative, so only
// an upper clamp to 255 is applied to each channel.
//
// HighQualityBicubic: the cubic kernel has negative lobes (see the
// kernel table at the top of this file), so ringing can push channels
// below 0 or above alpha. Each color channel is clamped to the alpha
// channel — which preserves the premultiplied-alpha invariant
// (channel <= alpha) — and all channels are clamped at 0 below.
//
// Note: the FilterMode comparisons are template-constant and compile away.
#define ClampColors() \
if(FilterMode == HighQualityBilinear) \
{ \
    ta = GpFix16Round(ta); \
    tr = GpFix16Round(tr); \
    tg = GpFix16Round(tg); \
    tb = GpFix16Round(tb); \
    if(ta>255) ta = 255; \
    if(tr>255) tr = 255; \
    if(tg>255) tg = 255; \
    if(tb>255) tb = 255; \
} \
if(FilterMode == HighQualityBicubic) \
{ \
    ta = GpFix16Round(ta); \
    tr = GpFix16Round(tr); \
    tg = GpFix16Round(tg); \
    tb = GpFix16Round(tb); \
    if(ta>255) ta = 255; \
    if(tr>ta) tr = ta; \
    if(tg>ta) tg = ta; \
    if(tb>ta) tb = ta; \
    if(ta<0) ta = 0; \
    if(tr<0) tr = 0; \
    if(tg<0) tg = 0; \
    if(tb<0) tb = 0; \
}
// Compute the kernel in the inner loop
// Note: the If statements are compiled away in the final code
// because they are template variable comparisons which can be
// done at compile time.
//
// This macro looks up the new kernel value in the partial-integral
// kernel table (LPIC for the linear/tent kernel, CPIC for the cubic
// kernel), subtracts the previous lookup (pa_old) to get the area of
// contribution for this pixel (pc), advances the kernel position (krn)
// by one source-pixel step (a), and remembers the current lookup for
// the next iteration.
//
// The differing shifts (krn >> 9 vs krn >> 8) convert the FIX16 kernel
// position into an index appropriate to each table's resolution —
// presumably the two tables have different sizes; confirm against the
// LPIC/CPIC definitions earlier in this file.
#define ComputeKernel(pc, a, pa, pa_old, krn) \
if(FilterMode == HighQualityBilinear) \
{ \
    pa = LPIC[krn >> 9]; \
    pc = pa-pa_old; \
    krn += (a); \
    pa_old = pa; \
} \
if(FilterMode == HighQualityBicubic) \
{ \
    pa = CPIC[krn >> 8]; \
    pc = pa-pa_old; \
    krn += (a); \
    pa_old = pa; \
}
// This block of code accumulates the individual channels from
// kptr into the accumulation buffers tb, tg, tr, and ta, each
// weighted by the kernel contribution pc.
// kptr is a BYTE* view of a 32bpp ARGB source pixel; given the
// accumulator names, byte order is [0]=blue, [1]=green, [2]=red,
// [3]=alpha (little-endian BGRA in memory).
#define AccumulateChannels(pc, kptr) \
{ \
    tb += pc * kptr[0]; \
    tg += pc * kptr[1]; \
    tr += pc * kptr[2]; \
    ta += pc * kptr[3]; \
}
  821. inline void Apply1DWrapModeX(INT WrapMode, INT &x, INT w)
  822. {
  823. INT xm;
  824. switch(WrapMode) {
  825. case WrapModeTileFlipY:
  826. case WrapModeTile:
  827. x = RemainderI(x, w);
  828. break;
  829. case WrapModeTileFlipX:
  830. case WrapModeTileFlipXY:
  831. xm = RemainderI(x, w);
  832. if(((x-xm)/w) & 1) {
  833. x = w-1-xm;
  834. }
  835. else
  836. {
  837. x = xm;
  838. }
  839. break;
  840. default:
  841. // Caller should correctly anticipate other wrap modes.
  842. ASSERT(FALSE);
  843. break;
  844. }
  845. }
  846. inline void Apply1DWrapModeY(INT WrapMode, INT &y, INT h)
  847. {
  848. INT ym;
  849. switch(WrapMode) {
  850. case WrapModeTile:
  851. case WrapModeTileFlipX:
  852. y = RemainderI(y, h);
  853. break;
  854. case WrapModeTileFlipY:
  855. case WrapModeTileFlipXY:
  856. ym = RemainderI(y, h);
  857. if(((y-ym)/h) & 1) {
  858. y = h-1-ym;
  859. }
  860. else
  861. {
  862. y = ym;
  863. }
  864. break;
  865. default:
  866. // Caller should correctly anticipate other wrap modes.
  867. ASSERT(FALSE);
  868. break;
  869. }
  870. }
  871. #undef RemainderI
  872. /**************************************************************************
  873. *
  874. * Function Description:
  875. *
  876. * Outputs the middle pixels in a 2:1 stretched scanline. Note that
  877. * this function doesn't need to handle wrap modes.
  878. *
  879. * Note: this function must not use floating point values, because it could be
  880. * called with an invalid floating point state (prior to the call to emms)
  881. *
  882. * Arguments:
  883. *
  884. * dst - The first pixel to be output
  885. * src - The first pixel in the source that will affect the destination
  886. * pixel in a bicubic 2:1 stretch
  887. * dw - The number of pixels in the destination
  888. * kci - The subpixel shift in the position of the destination pixels
  889. *
  890. **************************************************************************/
// The optimized 2:1 MMX middle-scanline path is implemented only for
// the HighQualityBicubic specialization (below). This bilinear
// specialization exists solely to satisfy the template interface and
// must never be reached at runtime — the MMX fast path is gated on
// FilterMode == HighQualityBicubic at the call site.
void DpOutputSpanStretch<HighQualityBilinear>::StretchMiddleScanline2_MMX(
    ARGB *dst,
    ARGB *src,
    INT dw,
    FIX16 kci
    )
{
    ASSERT(FALSE);
}
// Outputs the middle pixels of a bicubic 2:1 shrunk scanline using MMX.
// dst - first destination pixel to write
// src - first source pixel that contributes to the first output pixel
// dw  - number of destination pixels to emit
// kci - FIX16 subpixel offset of the destination pixel centers
//
// Each output pixel is a weighted sum of 8 consecutive source pixels
// (the bicubic kernel spans 4 units, at a 2:1 shrink that is 8 samples);
// consecutive outputs overlap by 6 source pixels, so partial sums are
// carried across iterations in three rotating accumulators.
// On non-x86 builds this function compiles to a no-op.
void DpOutputSpanStretch<HighQualityBicubic>::StretchMiddleScanline2_MMX(
    ARGB *dst,
    ARGB *src,
    INT dw,
    FIX16 kci
    )
{
#if defined(_X86_)

    //
    // In order to store the kernel multipliers in 16bit registers, we
    // will lose the bottom 3 precision bits (hence each k[i] must be
    // right shifted by three). The summation of the kernel multipliers
    // should come to 16K, hence KERNEL_SHIFT_AMOUNT is 14.
    //

    #define KERNEL_SHIFT_AMOUNT 14

    // Sample the symmetric bicubic kernel at 8 positions spaced 1/4 of
    // a kernel unit apart (2:1 shrink => half a source pixel per tap),
    // centered according to the subpixel offset kci.
    FIX16 k[8];
    FIX16 kernelIncrement = FIX16_ONE >> 2 ;
    FIX16 kCurrent = (kci >> 2) - FIX16_ONE;

    for (INT i = 0; i < 8; i++)
    {
        ASSERT(kCurrent >= -FIX16_ONE);
        ASSERT(kCurrent <= FIX16_ONE);
        k[i] = SymmetricBicubicKernelCenter[kCurrent >> (FIX16_SHIFT-BicubicKernelShift)];
        k[i] >>= 3;
        kCurrent += kernelIncrement;
    }

    //
    // Setup 64bit aligned workspace for the MMX code.
    // Byte offsets within the aligned buffer:
    //
    //  0 - zero (used to unpack bytes to words)
    //  8 - kernel multiplier 0
    // 16 - kernel multiplier 1
    // 24 - kernel multiplier 2
    // 32 - kernel multiplier 3
    // 40 - accumulator 3: g, b
    // 48 - accumulator 3: a, r
    // 56 - FIX14_HALF (rounding constant)
    //

    #define BUFFER_SIZE 16
    // Over-allocate by one INT so the pointer can be rounded up to the
    // next 8-byte boundary without running off the end.
    INT buffer[BUFFER_SIZE + 1];
    INT *buffer_64bitAligned = (INT *) ((((UINT_PTR) buffer) + 4) & ~0x7);

    buffer_64bitAligned[0] = 0; // zero
    buffer_64bitAligned[1] = 0;
    // Kernel taps are packed pairwise into 16-bit lanes, highest tap in
    // the high word, so that pmaddwd forms k[i]*p1 + k[i+1]*p2 per lane.
    buffer_64bitAligned[2] = (k[7] << 16) | (k[6] & 0xFFFF); // kernel multiplier 0
    buffer_64bitAligned[3] = buffer_64bitAligned[2];
    buffer_64bitAligned[4] = (k[5] << 16) | (k[4] & 0xFFFF); // kernel multiplier 1
    buffer_64bitAligned[5] = buffer_64bitAligned[4];
    buffer_64bitAligned[6] = (k[3] << 16) | (k[2] & 0xFFFF); // kernel multiplier 2
    buffer_64bitAligned[7] = buffer_64bitAligned[6];
    buffer_64bitAligned[8] = (k[1] << 16) | (k[0] & 0xFFFF); // kernel multiplier 3
    buffer_64bitAligned[9] = buffer_64bitAligned[8];
    buffer_64bitAligned[10] = 0; // Accumulator 3
    buffer_64bitAligned[11] = 0;
    buffer_64bitAligned[12] = 0;
    buffer_64bitAligned[13] = 0;
    buffer_64bitAligned[14] = (1 << (14 - 1)); // FIX14_HALF
    buffer_64bitAligned[15] = (1 << (14 - 1));

    //
    // Register assignments:
    //
    // eax - counter for the first loop
    // ebx - 0xffffffff
    // esi - source
    // edi - destination
    // ecx - counter
    // edx - 64-bit aligned workspace buffer
    //
    // mm6, mm7: accumulator 0
    // mm4, mm5: accumulator 1
    //

    _asm
    {
        mov ebx, 0xFFFFFFFF
        mov esi, src
        mov edi, dst
        mov ecx, dw
        mov edx, buffer_64bitAligned

        //
        // The first loop loads the initial values into the accumulators, but
        // doesn't write out any pixels. It executes exactly three times.
        //

        pxor mm4, mm4
        pxor mm5, mm5
        mov eax, 3

    loop1:

        //
        // Read expanded pixel values into mm0 and mm1
        //

        movd mm1, [esi]     ; mm1 = 00000000a1r1g1b1
        movd mm2, [esi + 4] ; mm2 = 00000000a2r2g2b2
        add esi, 8
        punpcklbw mm1, [edx] ; mm1 = 00a100r100g100b1
        punpcklbw mm2, [edx] ; mm2 = 00a200r200g200b2
        movq mm0, mm1        ; mm0 = 00a100r100g100b1
        punpckhwd mm1, mm2   ; mm1 = 00a200a100r200r1
        punpcklwd mm0, mm2   ; mm0 = 00g200g100b200b1

        //
        // Add the contribution to accumulator 1
        //

        movq mm6, [edx + 16] ; kernel multiplier 1
        movq mm7, mm6        ; kernel multiplier 1
        pmaddwd mm6, mm0
        pmaddwd mm7, mm1
        paddd mm6, mm4
        paddd mm7, mm5

        //
        // Add the contribution to accumulator 2
        //

        movq mm4, [edx + 24] ; kernel multiplier 2
        movq mm5, mm4        ; kernel multiplier 2
        pmaddwd mm4, mm0
        pmaddwd mm5, mm1
        paddd mm4, [edx + 40]
        paddd mm5, [edx + 48]

        //
        // Compute the new third accumulator
        //

        pmaddwd mm0, [edx + 32] ; multiply by kernel multiplier 3
        pmaddwd mm1, [edx + 32]
        movq [edx + 40], mm0
        movq [edx + 48], mm1

        dec eax
        jnz loop1

        //
        // The second loop continues to compute the accumulators, but
        // also writes out destination pixels.
        //

    loop2:

        //
        // Read expanded pixel values into mm0 and mm1
        //

        movd mm1, [esi]     ; mm1 = 00000000a1r1g1b1
        movd mm2, [esi + 4] ; mm2 = 00000000a2r2g2b2
        add esi, 8
        punpcklbw mm1, [edx] ; mm1 = 00a100r100g100b1
        punpcklbw mm2, [edx] ; mm2 = 00a200r200g200b2
        movq mm0, mm1        ; mm0 = 00a100r100g100b1
        punpckhwd mm1, mm2   ; mm1 = 00a200a100r200r1
        punpcklwd mm0, mm2   ; mm0 = 00g200g100b200b1

        //
        // Add the contribution to accumulator 0
        //

        movq mm2, [edx + 8] ; mm2 = kernel multiplier 0
        movq mm3, mm2       ; mm3 = kernel multiplier 0
        pmaddwd mm2, mm0    ; mm2 = 0000gggg0000bbbb
        pmaddwd mm3, mm1    ; mm3 = 0000aaaa0000rrrr
        paddd mm6, mm2      ; add contributions to accumulator 0
        paddd mm7, mm3

        //
        // Extract the pixel value from accumulator 0.
        //

        paddd mm6, [edx + 56] ; round
        psrad mm6, KERNEL_SHIFT_AMOUNT
        paddd mm7, [edx + 56]
        psrad mm7, KERNEL_SHIFT_AMOUNT
        packssdw mm6, mm7 ; mm6 = 00aa00rr00gg00bb
        packuswb mm6, mm6 ; mm6 = 00000000aarrggbb

        //
        // Clip all channels to alpha (preserves the premultiplied-alpha
        // invariant using saturating add/subtract).
        //

        movd mm2, ebx      ; mm2 = 00000000ffffffff
        movq mm7, mm6      ; mm7 = 00000000aarrggbb
        psrad mm7, 24      ; mm7 = 00000000000000aa
        punpcklbw mm7, mm7 ; mm7 = 000000000000aaaa
        punpcklbw mm7, mm7 ; mm7 = 00000000aaaaaaaa
        psubusb mm2, mm7
        paddusb mm6, mm2
        psubusb mm6, mm2

        movd [edi], mm6
        add edi, 4

        //
        // Add the contribution to accumulator 1
        //

        movq mm6, [edx + 16] ; kernel multiplier 1
        movq mm7, mm6        ; kernel multiplier 1
        pmaddwd mm6, mm0
        pmaddwd mm7, mm1
        paddd mm6, mm4
        paddd mm7, mm5

        //
        // Add the contribution to accumulator 2
        //

        movq mm4, [edx + 24] ; kernel multiplier 2
        movq mm5, mm4        ; kernel multiplier 2
        pmaddwd mm4, mm0
        pmaddwd mm5, mm1
        paddd mm4, [edx + 40]
        paddd mm5, [edx + 48]

        //
        // Compute the new third accumulator
        //

        pmaddwd mm0, [edx + 32] ; multiply by kernel multiplier 3
        pmaddwd mm1, [edx + 32]
        movq [edx + 40], mm0
        movq [edx + 48], mm1

        dec ecx
        jnz loop2

        emms
    }

    #undef KERNEL_SHIFT_AMOUNT

#endif // defined(_X86_)
}
  1100. /**************************************************************************\
  1101. *
  1102. * Function Description:
  1103. *
  1104. * DpOutputSpanStretch<FilterMode>::StretchScanline
  1105. * Stretches a single scanline (magnification or minification) using
  1106. * the reconstruction/interpolation mode specified by the template
  1107. * parameter. Currently this is used for bilinear and bicubic filters.
  1108. *
  1109. * Arguments:
  1110. *
  1111. * ARGB *dst, // destination pointer
  1112. * ARGB *src, // source pointer
  1113. * INT dw, // destination width (pixels)
  1114. * INT sw, // source width (pixels)
  1115. * FIX16 kci, // initial position of the kernel center
  1116. * FIX16 scale, // scale factor
  1117. * FIX16 w, // width from center of the kernel to the edge
  1118. * FIX16 a, // 1/w
  1119. *
  1120. * Notes:
  1121. *
  1122. The following description is based on the bilinear (tent) filter but it is
  1123. equally applicable to the bicubic - though the pictures and description would
  1124. be slightly more complicated.
  1125. The code below is significantly complicated by the fact that we want the inner
  1126. kernel loop to be quick and therefore not handle the wrap modes. In order to
  1127. make this work, we first compute the number of pixels on the left and right
  1128. of the scanline that need to consider the wrap mode. We process the left first
  1129. and then run the optimized loop for all the inner pixels (which ignores the
  1130. wrap conditions). After that we run the right edge.
  1131. Bilinear filter convolution kernel:
  1132. Note that each kernel has an intrinsic width - bilinear = 1 and bicubic = 2.
  1133. This width is scaled by the inverse of the stretch factor - i.e. a shrink
  1134. that results in 1/3 of the size being output requires a width (w) of 3 for the
  1135. bilinear and 6 for the bicubic. Also the height of the filter kernel is scaled
  1136. by the scale factor - i.e. the height of 1 (for all kernels) becomes 1/3 in
  1137. the above example.
  1138. --- | --- ^
  1139. --- . | . --- |
  1140. --- . | . .--- h
  1141. --- . . | . . --- |
  1142. --- . . . | . . . --- |
  1143. ---. . . . | . . . .--- v
  1144. -----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------
  1145. |
  1146. kb kc ke
  1147. <------------w----------->
  1148. The filter kernel is shifted so that kc is exactly at the position of the
  1149. required destination pixel transformed into the source pixel array by the
  1150. scale factor. This will in general fall somewhere between two pixel samples -
  1151. in the above picture, between pixels 4 and 5.
  1152. The goal is to get a color value for the position at kc and emit that into
  1153. the destination pixel stream. The standard evaluation method is to compute
  1154. the height of the filter kernel at each of the pixel samples under the filter
  1155. convolution corresponding to pixels 0, 1, ... 9. These heights are used to
  1156. weight each pixel sample and the result is summed giving the destination pixel
  1157. at kc.
  1158. The problem with the standard evaluation is that at non-integer shrinks
  1159. the mathematical evaluation of the kernel produces ripples in the output - i.e.
  1160. a solid field of pixels responds with a sine-wave-like ripple output. This is
  1161. a theoretical problem with the discrete evaluation of the kernel integral.
  1162. Our evaluation actually stores a table of partial integrals from -inf to x. We
  1163. use this table to compute the area around each pixel and the area is used as
  1164. the weight. This evaluation is guaranteed to respond with exactly one for any
  1165. position and scale factor of the kernel. This property gives a stable field
  1166. response allowing us to have non-ripple shrinks.
  1167. ---.: ---
  1168. ---.....: ---
  1169. --- :.....: ---
  1170. --- :.....: ---
  1171. --- :.....: ---
  1172. --- :.....: ---
  1173. -----0-----1-----2-----3-----4-----5-----6-----7-----8-----9------------
  1174. To evaluate this properly, we lookup the integral from -inf to 4.5 ( actually
  1175. we rescale so that the center of the kernel is at 0 ) and then subtract the
  1176. table lookup for the integral from -inf to 3.5. This gives us an exact
  1177. (within the error of the table) computation for the area from 3.5 to 4.5.
  1178. This is what we use for the weight of pixel 4. Note that contrary to the
  1179. standard evaluation pixel 9 does contribute even though 9 is outside of the
  1180. kernel. 8.5 is inside the kernel so the area under the kernel from 8.5 to 9.5
  1181. is a small triangular area and is not equal to zero. Not accounting for this is
  1182. the major source of error in the standard evaluation.
  1183. Note that the lookup for the end point integral for pixel 4 of -inf to 4.5 can
  1184. be reused as the start point for the next pixel (5). An important property of
  1185. this is that any error (e) in the lookup for -inf to 4.5 is added in pixel
  1186. 4's contribution and subtracted in pixel 5's contribution which results in
  1187. the total error for the filter response -- due to table discretization -- being
  1188. completely subtracted away --- the end points have an error of exactly zero
  1189. because we sample from beyond the left (area of exactly 0) to beyond the right
  1190. (area of exactly 1). This is not precisely true because the error is scaled
  1191. by the pixel values, but it does help.
  1192. Note that this integral method is equivalent to convolving the input pixels
  1193. (comb) with the box filter of width 1 pixel and then convolving the result
  1194. with the filter kernel. [analysis due to Jim Blinn - see documentation in
  1195. the Specs directory.]
  1196. Further documentation is available in the specs directory:
  1197. gdiplus\specs\filter\convolution.doc
  1198. * Note: this function must not use floating point values, because it could be
  1199. * called with an invalid floating point state (prior to the call to emms)
  1200. *
  1201. * History:
  1202. *
  1203. * 04/16/2000 asecchia created it
  1204. *
  1205. \**************************************************************************/
  1206. template<FilterModeType FilterMode>
  1207. void DpOutputSpanStretch<FilterMode>::StretchScanline(
  1208. ARGB *dst, // destination pointer
  1209. ARGB *src, // source pointer
  1210. INT dw, // destination width (pixels)
  1211. INT sw, // source width (pixels)
  1212. FIX16 kci, // initial position of the kernel center
  1213. FIX16 scale, // scale factor
  1214. FIX16 w, // width from center of the kernel to the edge
  1215. FIX16 a // 1/w
  1216. )
  1217. {
  1218. // Note: this is a template class so the value of FilterMode
  1219. // is defined at compile time. We're relying on the compiler
  1220. // to perform dead code removal for each template instantiation
  1221. // eliminating both the constant comparison and all the
  1222. // code branches corresponding to other FilterMode values.
  1223. // That way our inner loop is not impacted by extra code for
  1224. // filter modes we're not using and extraneous conditional
  1225. // statements.
  1226. // Center of the filter kernel.
  1227. // Shift over to the left by half because we want to center the area of
  1228. // contribution for each sample on the sample - rather than taking the
  1229. // area between two point samples as the contribution for the sample on
  1230. // the right.
  1231. FIX16 kc = kci - FIX16_HALF;
  1232. // Left and right extent of the kernel, intra-kernel position,
  1233. // and pixel contribution.
  1234. INT kb, ke;
  1235. INT kbt, ket;
  1236. FIX16 kp, pc, pa, pa_old;
  1237. // Loop variables
  1238. INT x, k;
  1239. // Incremental loop state, intermediate computation.
  1240. ARGB *d = dst;
  1241. FIX16 krn = 0;
  1242. // Color channel accumulators.
  1243. FIX16 ta, tr, tg, tb;
  1244. // Compute the first pixel along the destination scanline that doesn't
  1245. // have any wrap contribution and then the last pixel (l & r).
  1246. // Note that all the terms have a FIX16_ONE factor which cancel out.
  1247. // !!! Perf: [asecchia] This stuff is computed every scanline -
  1248. // and it's always the same. We could pass these coordinates to
  1249. // this routine and have them precomputed.
  1250. INT lWrapX;
  1251. INT rWrapX;
  1252. if(scale>=0)
  1253. {
  1254. // x==sw is considered outside of the source.
  1255. FIX16 fix_sw = (sw-1) << FIX16_SHIFT;
  1256. // add (scale-1) and use idiv to get a Ceiling()
  1257. lWrapX = (w-kc+(scale-1))/scale;
  1258. // idiv should give us Floor().
  1259. rWrapX = (fix_sw-w-kc)/scale;
  1260. }
  1261. else
  1262. {
  1263. // x==sw is considered outside of the source.
  1264. FIX16 fix_sw = (sw-1) << FIX16_SHIFT;
  1265. // note: in the -x scale transform, the sense of lWrapX and rWrapX
  1266. // can be confusing. The l&r apply to the destination left and right
  1267. // and are swapped here when we compute the initial position from
  1268. // the inverted left and right source points.
  1269. // As we traverse the destination from left to right we'll encounter
  1270. // lWrapX first and then rWrapX, but the kc (kernel center) will be
  1271. // moving through the source from right to left decrementing by
  1272. // scale each time.
  1273. // use idiv to get a Floor()
  1274. rWrapX = (w-kc)/scale;
  1275. // add scale+1 and use idiv for Ceiling().
  1276. lWrapX = (fix_sw-w-kc+(scale+1))/scale;
  1277. }
  1278. // Now clamp to the range of the destination we're going to draw.
  1279. lWrapX = max(0, lWrapX);
  1280. rWrapX = min(dw, rWrapX);
  1281. BYTE *kptr;
  1282. INT k_wrap;
  1283. // Do the left wrapmode pixels.
  1284. /*
  1285. --- | --- ^
  1286. --- | --- |
  1287. --- | --- h
  1288. --- | --- |
  1289. --- | --- |
  1290. --- | --- v
  1291. -----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------
  1292. kb kc <-----------w---------->ke
  1293. kbt ket
  1294. <----wrap----><---------texture------><----wrap----->
  1295. +ve transform -ve transform
  1296. or straddle case or straddle case
  1297. The following loop handles the case where the wrap happens on the left of the
  1298. kernel. There are three subloops - first to handle the pixels in the wrap
  1299. segment on the left, then to handle the pixels in the texture. Normally the
  1300. texture pixels will extend to the right edge of the kernel and we'll be done,
  1301. but two cases make the right wrap essential at this point. First if the
  1302. transform is negative, the sense is flipped and the texture extends from the
  1303. left edge to the middle point and the wrap extends the rest of the kernel to
  1304. the right edge. Also if the texture is sufficiently small and the shrink factor
  1305. sufficiently large, the filter kernel could overlap both the left and right edge
  1306. of the texture and require wrapping on both sides.
  1307. */
  1308. for(x=0; x<min(lWrapX, dw); x++)
  1309. {
  1310. ASSERT(x<dw);
  1311. // Compute the start and end of the filter kernel coverage
  1312. kb = GpFix16Ceiling(kc-w);
  1313. ke = GpFix16Ceiling(kc+w);
  1314. // Bound the pixels in the texture.
  1315. // kbt == kernel begin texture coordinate.
  1316. // ket == kernel end texture coordinate.
  1317. kbt = max(0,kb);
  1318. ket = min(ke, sw-1);
  1319. // Initialize the component accumulators. We accumulate the
  1320. // contribution of each color component scaled by the kernel
  1321. // response into these variables.
  1322. ta = tr = tg = tb = 0;
  1323. // These pixels are off the left of the texture.
  1324. pa_old = 0;
  1325. // Iterate over each pixel under the filter kernel.
  1326. // if ke==kb then there is one point.
  1327. krn = Int32x32Mod16(a, (kb << FIX16_SHIFT) - kc);
  1328. if(QWrapMode == WrapModeClamp)
  1329. {
  1330. // Clamp modes.
  1331. for(k=kb; k<min(kbt, ke+1); k++)
  1332. {
  1333. // these pixels are always off the left of the texture.
  1334. ASSERT(k<0);
  1335. ComputeKernel(pc, a, pa, pa_old, krn);
  1336. ta += pc * ClampColorA;
  1337. tr += pc * ClampColorR;
  1338. tg += pc * ClampColorG;
  1339. tb += pc * ClampColorB;
  1340. }
  1341. }
  1342. else
  1343. {
  1344. // Do the full wrap computation.
  1345. for(k=kb; k<min(kbt, ke+1); k++)
  1346. {
  1347. // these pixels are always off the left of the texture.
  1348. k_wrap = k;
  1349. ASSERT(k<0);
  1350. // !!! Perf: [asecchia] This is really slow.
  1351. // If we ever decide to make wrap modes propagate
  1352. // through the outcrop region and decide that wrap
  1353. // tile and flip x,y are important perf scenarios,
  1354. // we should come back and replace this divide with
  1355. // repeated subtraction - most times it can be avoided.
  1356. // However, right now this is only used for a few
  1357. // pixels on the edges and we don't really mind the
  1358. // perf hit for these modes.
  1359. Apply1DWrapModeX(QWrapMode, k_wrap, sw);
  1360. ComputeKernel(pc, a, pa, pa_old, krn);
  1361. kptr = (BYTE*)(src + k_wrap);
  1362. AccumulateChannels(pc, kptr);
  1363. }
  1364. }
  1365. // Initialize the color channel accessor pointer to the beginning
  1366. // of the source pixel array for this kernel.
  1367. kptr = (BYTE*)(src + kbt);
  1368. // HighQualityBicubic needs to initialize the krn value.
  1369. // It is used to do the kernel table lookup.
  1370. // HighQualityBilinear doesn't use this as it works out it's
  1371. // kernel by direct computation.
  1372. krn = Int32x32Mod16(a, (kbt << FIX16_SHIFT) - kc);
  1373. // These pixels hit the texture.
  1374. for(k=kbt; k<=ket; k++)
  1375. {
  1376. ComputeKernel(pc, a, pa, pa_old, krn);
  1377. // Accumulate the contribution of this source pixel to the pixel
  1378. // we're working on.
  1379. AccumulateChannels(pc, kptr);
  1380. kptr += 4;
  1381. }
  1382. // These pixels are off the right of the texture.
  1383. // This can happen if the kernel spans the entire source texture.
  1384. // Iterate over each pixel under the filter kernel.
  1385. // if ke==kb then there is one point.
  1386. krn = Int32x32Mod16(a, (max(ket+1, kb) << FIX16_SHIFT) - kc);
  1387. if(QWrapMode == WrapModeClamp)
  1388. {
  1389. // Clamp modes.
  1390. for(k=max(ket+1, kb); k<=ke; k++)
  1391. {
  1392. ComputeKernel(pc, a, pa, pa_old, krn);
  1393. ta += pc * ClampColorA;
  1394. tr += pc * ClampColorR;
  1395. tg += pc * ClampColorG;
  1396. tb += pc * ClampColorB;
  1397. }
  1398. }
  1399. else
  1400. {
  1401. // Do the full wrap computation.
  1402. for(k=max(ket+1, kb); k<=ke; k++)
  1403. {
  1404. k_wrap = k;
  1405. Apply1DWrapModeX(QWrapMode, k_wrap, sw);
  1406. ComputeKernel(pc, a, pa, pa_old, krn);
  1407. kptr = (BYTE*)(src + k_wrap);
  1408. AccumulateChannels(pc, kptr);
  1409. }
  1410. }
  1411. // Done with this pixel - store it in the destination buffer.
  1412. // clamp the results to byte range.
  1413. ClampColors();
  1414. // Combine the channels, set the destination pixel and increment
  1415. // to the next pixel
  1416. *d++ = (ta << 24) | (tr << 16) | (tg << 8) | tb;
  1417. kc += scale;
  1418. }
  1419. // For all points, x, in the destination compute the position of the
  1420. // kernel center in the source and sum the contribution under the filter.
  1421. const INT minCenterWidthMMX = 16;
  1422. INT dstCenterWidth = rWrapX - lWrapX;
  1423. INT srcFirst = GpFix16Ceiling(kc - w);
  1424. INT srcLast = GpFix16Floor(kc+w + (dstCenterWidth - 1) * scale);
  1425. // srcLast_2Stretch is the last pixel touched by the MMX routine.
  1426. // The number of pixels touched by the routine is equal to six
  1427. // (setup pixels) plus two times the width of the center strip
  1428. // in the destination. We subtract one in order the get the actual
  1429. // last pixel touched by StretchMiddleScanline2_MMX (so that we can
  1430. // compare it with srcLast).
  1431. INT srcLast_2Stretch = srcFirst + (dstCenterWidth + 3) * 2 - 1;
  1432. #if defined(_X86_)
  1433. if ((OSInfo::HasMMX) &&
  1434. (FilterMode == HighQualityBicubic))
  1435. {
  1436. // MMX and high quality bicubic
  1437. if ((dstCenterWidth >= minCenterWidthMMX) &&
  1438. ((srcLast_2Stretch == srcLast) || (srcLast_2Stretch == (srcLast - 1))))
  1439. {
  1440. ASSERT(srcFirst >= 0);
  1441. ASSERT(srcLast_2Stretch < sw);
  1442. // Stretch the middle pixels by a factor of two using optimized MMX
  1443. // code.
  1444. FIX16 kc_center = kc + FIX16_HALF;
  1445. StretchMiddleScanline2_MMX(d,
  1446. src + srcFirst,
  1447. dstCenterWidth,
  1448. kc_center - (GpFix16Floor(kc_center) * FIX16_ONE));
  1449. d += dstCenterWidth;
  1450. kc += scale * dstCenterWidth;
  1451. x += dstCenterWidth;
  1452. }
  1453. else
  1454. {
  1455. // This is the MMX version of the general purpose bicubic scaling
  1456. // code.
  1457. for(x=lWrapX; x<rWrapX; x++)
  1458. {
  1459. // Cannot go over dw because rWrap is < dw
  1460. ASSERT(x<dw);
  1461. // Compute the start and end of the filter kernel coverage
  1462. kb = GpFix16Ceiling(kc-w);
  1463. ke = GpFix16Ceiling(kc+w);
  1464. // Default loop assumes most pixels don't have to worry about
  1465. // wrap mode along the ends of the scanline.
  1466. ASSERT(kb>=0);
  1467. ASSERT(ke<sw);
  1468. // Initialize the color channel accessor pointer to the beginning
  1469. // of the source pixel array for this kernel.
  1470. kptr = (BYTE*)(src + kb);
  1471. // HighQualityBicubic needs to initialize the krn value.
  1472. // It is used to do the kernel table lookup.
  1473. // HighQualityBilinear doesn't use this as it works out its
  1474. // kernel by direct computation.
  1475. krn = Int32x32Mod16(a, (kb << FIX16_SHIFT) - kc);
  1476. // Iterate over each pixel under the filter kernel.
  1477. // if ke==kb then there is one point.
  1478. INT bcl_count = ke - kb + 1;
  1479. INT bcl_half_count = bcl_count >> 1;
  1480. bcl_count &= 0x1;
  1481. _asm
  1482. {
  1483. // eax - krn
  1484. // ebx - kptr
  1485. // esi - LPIC
  1486. // edi - a
  1487. //
  1488. // mm5 - pold
  1489. // mm6 - green ; blue
  1490. // mm7 - alpha ; red
  1491. mov eax, krn
  1492. mov ebx, kptr
  1493. mov esi, CPIC
  1494. mov edi, a
  1495. pxor mm5, mm5
  1496. movq mm6, FIX14_HALF_MMX
  1497. movq mm7, mm6
  1498. pxor mm0, mm0
  1499. dec bcl_half_count
  1500. jl bicubic_center_loop_last_pixel
  1501. bicubic_center_loop:
  1502. // Read the next two pixels into mm2 and mm1
  1503. movd mm2, [ebx] // mm2 = pixel1
  1504. movd mm1, [ebx + 4] // mm1 = pixel2
  1505. add ebx, 8
  1506. // Compute the kernel values for these two pixels
  1507. mov edx, eax
  1508. sar edx, 8
  1509. punpcklbw mm2, mm0
  1510. movd mm3, [esi + 4 * edx] // mm3 = p1
  1511. lea edx, [eax + edi]
  1512. sar edx, 8
  1513. punpcklbw mm1, mm0
  1514. movd mm4, [esi + 4 * edx] // mm4 = p2
  1515. punpckldq mm5, mm3 // mm5 = p1 | pold
  1516. lea eax, [eax + 2 * edi]
  1517. punpckldq mm3, mm4 // mm3 = p2 | p1
  1518. psrad mm5, 2
  1519. psrad mm3, 2
  1520. psubd mm3, mm5 // mm3 = kernel2 | kernel1
  1521. movq mm5, mm4 // mm5 = pold
  1522. packssdw mm3, mm3 // mm3 = kernel2 | kernel1 | kernel2 | kernel1
  1523. // At this point:
  1524. // mm3 = kernel2 | kernel1 | kernel2 | kernel1
  1525. // mm2, mm1 contain pixel1 and pixel2 respectively
  1526. movq mm4, mm2
  1527. punpcklwd mm2, mm1
  1528. pmaddwd mm2, mm3
  1529. punpckhwd mm4, mm1
  1530. paddd mm6, mm2
  1531. dec bcl_half_count
  1532. pmaddwd mm4, mm3
  1533. paddd mm7, mm4
  1534. jge bicubic_center_loop
  1535. bicubic_center_loop_last_pixel:
  1536. dec bcl_count
  1537. jl bicubic_center_loop_done
  1538. // Read the last pixel into mm2
  1539. movd mm2, [ebx]
  1540. punpcklbw mm2, mm0 // mm2 = a | r | g | b
  1541. movq mm3, mm2
  1542. punpcklwd mm2, mm0 // mm2 = 0 | g | 0 | b
  1543. punpckhwd mm3, mm0 // mm3 = 0 | a | 0 | r
  1544. // Compute the kernel value for this pixel
  1545. sar eax, 8
  1546. psrad mm5, 2
  1547. movd mm4, [esi + 4 * eax] // mm4 = p
  1548. psrad mm4, 2
  1549. psubd mm4, mm5
  1550. packssdw mm4, mm4
  1551. pmaddwd mm2, mm4
  1552. pmaddwd mm3, mm4
  1553. paddd mm6, mm2
  1554. paddd mm7, mm3
  1555. bicubic_center_loop_done:
  1556. // At this point, mm6 and mm7 contain the output channels
  1557. // for the pixel. We need to clamp the alpha and store it
  1558. // in the destination buffer.
  1559. psrad mm6, 14
  1560. psrad mm7, 14
  1561. packssdw mm6, mm7 // mm6 = a | r | g | b
  1562. packuswb mm6, mm6 // mm6 = 00000000aarrggbb
  1563. movq mm7, mm6 // mm7 = 00000000aarrggbb
  1564. psrad mm6, 24 // mm6 = xxxxxxxxxxxxxxaa
  1565. mov eax, 0xFFFFFFFF
  1566. punpcklbw mm6, mm6 // mm6 = xxxxxxxxxxxxaaaa
  1567. movd mm2, eax
  1568. punpcklbw mm6, mm6 // mm6 = xxxxxxxxaaaaaaaa
  1569. psubusb mm2, mm6
  1570. mov eax, d
  1571. paddusb mm7, mm2
  1572. psubusb mm7, mm2
  1573. movd [eax], mm7
  1574. add eax, 4
  1575. mov d, eax
  1576. }
  1577. kc += scale;
  1578. }
  1579. }
  1580. }
  1581. else
  1582. #endif // defined(_X86_)
  1583. /*
  1584. --- | --- ^
  1585. --- | --- |
  1586. --- | --- h
  1587. --- | --- |
  1588. --- | --- |
  1589. --- | --- v
  1590. -----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------
  1591. kb kc <-----------w---------->ke
  1592. <-----------------------texture--------------------->
  1593. The following loop is guaranteed to only hit texture for every pixel under
  1594. the kernel. This is the majority of the pixels in most normal stretch
  1595. cases. We can simplify this loop because of this assumption and therefore
  1596. get a performance win.
  1597. Many of the degenerate wrap cases will simply skip this loop.
  1598. */
  1599. {
  1600. // no MMX
  1601. for(x=lWrapX; x<rWrapX; x++)
  1602. {
  1603. // Cannot go over dw because rWrap is < dw
  1604. ASSERT(x<dw);
  1605. // Compute the start and end of the filter kernel coverage
  1606. kb = GpFix16Ceiling(kc-w);
  1607. ke = GpFix16Ceiling(kc+w);
  1608. // Default loop assumes most pixels don't have to worry about
  1609. // wrap mode along the ends of the scanline.
  1610. ASSERT(kb>=0);
  1611. ASSERT(ke<sw);
  1612. // Initialize the component accumulators. We accumulate the
  1613. // contribution of each color component scaled by the kernel
  1614. // response into these variables.
  1615. ta = tr = tg = tb = 0;
  1616. // Initialize the color channel accessor pointer to the beginning
  1617. // of the source pixel array for this kernel.
  1618. kptr = (BYTE*)(src + kb);
  1619. // HighQualityBicubic needs to initialize the krn value.
  1620. // It is used to do the kernel table lookup.
  1621. // HighQualityBilinear doesn't use this as it works out its
  1622. // kernel by direct computation.
  1623. krn = Int32x32Mod16(a, (kb << FIX16_SHIFT) - kc);
  1624. pa_old = 0;
  1625. // Iterate over each pixel under the filter kernel.
  1626. // if ke==kb then there is one point.
  1627. for(k=kb; k<=ke; k++)
  1628. {
  1629. ComputeKernel(pc, a, pa, pa_old, krn);
  1630. // Accumulate the contribution of this source pixel to the pixel
  1631. // we're working on.
  1632. AccumulateChannels(pc, kptr);
  1633. kptr += 4;
  1634. }
  1635. // Done with this pixel - store it in the destination buffer.
  1636. // clamp the results to byte range.
  1637. ClampColors();
  1638. ASSERT(tr<=ta);
  1639. ASSERT(tg<=ta);
  1640. ASSERT(tb<=ta);
  1641. ASSERT(ta>=0);
  1642. ASSERT(tr>=0);
  1643. ASSERT(tg>=0);
  1644. ASSERT(tb>=0);
  1645. // Combine the channels, set the destination pixel and increment
  1646. // to the next pixel
  1647. *d++ = (ta << 24) | (tr << 16) | (tg << 8) | tb;
  1648. kc += scale;
  1649. }
  1650. }
  1651. // Need to use max() here to handle the case where lWrapX > rWrapX
  1652. // which can happen if the filter spans both edges of the scanline.
  1653. // Do the right wrapmode pixels.
  1654. /*
  1655. --- | --- ^
  1656. --- | --- |
  1657. --- | --- h
  1658. --- | --- |
  1659. --- | --- |
  1660. --- | --- v
  1661. -----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------
  1662. kb kc <-----------w---------->ke
  1663. kbt ket
  1664. <----wrap----><---------texture------><----wrap----->
  1665. -ve transform +ve transform
  1666. case only case only
  1667. The following loop handles the case where the wrap happens on the right of the
  1668. kernel. There are three subloops - first to handle the pixels in the wrap
  1669. segment on the left - if any, then to handle the pixels in the texture. After
  1670. that handle the pixels in the right wrap. Normally the texture pixels will
  1671. extend to the left edge of the kernel and the first subloop will simply be
  1672. skipped, but the left wrap is essential if the transform is negative --- the
  1673. sense is flipped and the texture extends from the right edge to the middle
  1674. point and the wrap extends the rest of the kernel to the left edge.
  1675. Note it's not possible at this point to have wrapping at both edges of the
  1676. kernel the wrap is on the left iff the transform is negative. The wrap is on
  1677. the right iff the transform is positive. The case where both wrapmodes is
  1678. present has already been taken care of in the first loop.
  1679. */
  1680. for(x=max(x, rWrapX); x<dw; x++)
  1681. {
  1682. // Compute the start and end of the filter kernel coverage
  1683. kb = GpFix16Ceiling(kc-w);
  1684. ke = GpFix16Ceiling(kc+w);
  1685. // Bound the pixels in the texture.
  1686. // ket == kernel end texture coordinate (inclusive).
  1687. // kbt == kernel begin texture coordinate.
  1688. kbt = max(0,kb);
  1689. ket = min(ke, sw-1);
  1690. // Initialize the component accumulators. We accumulate the
  1691. // contribution of each color component scaled by the kernel
  1692. // response into these variables.
  1693. ta = tr = tg = tb = 0;
  1694. // Initialize the color channel accessor pointer to the beginning
  1695. // of the source pixel array for this kernel.
  1696. kptr = (BYTE*)(src + kb);
  1697. // HighQualityBicubic needs to initialize the krn value.
  1698. // It is used to do the kernel table lookup.
  1699. // HighQualityBilinear doesn't use this as it works out its
  1700. // kernel by direct computation.
  1701. pa_old = 0;
  1702. if(kb<kbt)
  1703. {
  1704. krn = Int32x32Mod16(a, (kb << FIX16_SHIFT) - kc);
  1705. }
  1706. // Iterate over each pixel under the filter kernel.
  1707. // if ke==kb then there is one point.
  1708. // These pixels are off the left of the texture.
  1709. // This is possible for negative transform cases.
  1710. if(QWrapMode == WrapModeClamp)
  1711. {
  1712. // Clamp modes.
  1713. for(k=kb; k<min(kbt, ke+1); k++)
  1714. {
  1715. // these pixels are always off the left of the texture.
  1716. ASSERT(k<0);
  1717. ComputeKernel(pc, a, pa, pa_old, krn);
  1718. ta += pc * ClampColorA;
  1719. tr += pc * ClampColorR;
  1720. tg += pc * ClampColorG;
  1721. tb += pc * ClampColorB;
  1722. }
  1723. }
  1724. else
  1725. {
  1726. // Do the full wrap computation.
  1727. for(k=kb; k<min(kbt, ke+1); k++)
  1728. {
  1729. // these pixels are always off the left of the texture.
  1730. k_wrap = k;
  1731. ASSERT(k<0);
  1732. // !!! Perf: [asecchia] This is really slow.
  1733. // If we ever decide to make wrap modes propagate
  1734. // through the outcrop region and decide that wrap
  1735. // tile and flip x,y are important perf scenarios,
  1736. // we should come back and replace this divide with
  1737. // repeated subtraction - most times it can be avoided.
  1738. // However, right now this is only used for a few
  1739. // pixels on the edges and we don't really mind the
  1740. // perf hit for these modes.
  1741. Apply1DWrapModeX(QWrapMode, k_wrap, sw);
  1742. ComputeKernel(pc, a, pa, pa_old, krn);
  1743. kptr = (BYTE*)(src + k_wrap);
  1744. AccumulateChannels(pc, kptr);
  1745. }
  1746. }
  1747. // Initialize the color channel accessor pointer to the beginning
  1748. // of the source pixel array for this kernel.
  1749. kptr = (BYTE*)(src + kbt);
  1750. // HighQualityBicubic needs to initialize the krn value.
  1751. // It is used to do the kernel table lookup.
  1752. // HighQualityBilinear doesn't use this as it works out its
  1753. // kernel by direct computation.
  1754. krn = Int32x32Mod16(a, (kbt << FIX16_SHIFT) - kc);
  1755. // These pixels hit the texture.
  1756. for(k=kbt; k<=ket; k++)
  1757. {
  1758. ComputeKernel(pc, a, pa, pa_old, krn);
  1759. // Accumulate the contribution of this source pixel to the pixel
  1760. // we're working on.
  1761. AccumulateChannels(pc, kptr);
  1762. kptr += 4;
  1763. }
  1764. // These pixels are off the right of the texture.
  1765. // Iterate over each pixel under the filter kernel.
  1766. // if ke==kb then there is one point.
  1767. krn = Int32x32Mod16(a, ((max(ket+1, kb)) << FIX16_SHIFT) - kc);
  1768. if(QWrapMode == WrapModeClamp)
  1769. {
  1770. // Clamp modes.
  1771. for(k=max(ket+1, kb); k<=ke; k++)
  1772. {
  1773. ComputeKernel(pc, a, pa, pa_old, krn);
  1774. ta += pc * ClampColorA;
  1775. tr += pc * ClampColorR;
  1776. tg += pc * ClampColorG;
  1777. tb += pc * ClampColorB;
  1778. }
  1779. }
  1780. else
  1781. {
  1782. // Do the full wrap computation.
  1783. for(k=max(ket+1, kb); k<=ke; k++)
  1784. {
  1785. // Apply the general pixel wrap
  1786. k_wrap = k;
  1787. Apply1DWrapModeX(QWrapMode, k_wrap, sw);
  1788. ComputeKernel(pc, a, pa, pa_old, krn);
  1789. kptr = (BYTE*)(src + k_wrap);
  1790. AccumulateChannels(pc, kptr);
  1791. }
  1792. }
  1793. // Done with this pixel - store it in the destination buffer.
  1794. // clamp the results to byte range.
  1795. ClampColors();
  1796. // Combine the channels, set the destination pixel and increment
  1797. // to the next pixel
  1798. *d++ = (ta << 24) | (tr << 16) | (tg << 8) | tb;
  1799. kc += scale;
  1800. }
  1801. #ifdef _X86_
  1802. if ((OSInfo::HasMMX) &&
  1803. (FilterMode == HighQualityBicubic))
  1804. {
  1805. _asm
  1806. {
  1807. emms
  1808. }
  1809. }
  1810. #endif // _X86_
  1811. }
  1812. /**************************************************************************
  1813. *
  1814. * Function Description:
  1815. *
  1816. * Outputs one scanline on the destination device
  1817. *
  1818. * Note: this function must not use floating point values because of
  1819. * potential conflicts with the MMX register values.
  1820. *
  1821. * Return Value:
  1822. *
  1823. * GpStatus. Always returns Ok.
  1824. * !!! [asecchia] are we going to remove this return value - these
  1825. * always return success.
  1826. *
  1827. * Created:
  1828. *
  1829. * 04/17/2000 asecchia
  1830. * Created it.
  1831. *
  1832. **************************************************************************/
template<FilterModeType FilterMode>
GpStatus DpOutputSpanStretch<FilterMode>::OutputSpan(
    INT y,
    INT xMin,
    INT xMax    // xMax is exclusive
)
{
    // Emits one destination scanline: y-filters the x-scaled scanlines held
    // in the rotational xbuffer cache and writes the result to the output
    // scan buffer. Scanlines x-scaled on previous calls are reused via the
    // last_k / xbuffer_start_scanline state carried across calls.
    //
    // Note: this function must not use floating point values because of
    // potential conflicts with the MMX register values.

    ASSERT(isValid);

    // This function assumes that it's called with a correctly ordered span.
    ASSERT((xMax-xMin)>=0);

    INT width = xMax-xMin;

    // We can't have someone draw outside our specified destination.
    // If this assert fires, we don't have enough buffer space to store the
    // destination xscale so we'd overrun the buffer. The caller set us up
    // with an incorrect destination rectangle or got their rounding wrong.
    ASSERT(width <= xbuffer_width);

    INT left = xMin;
    INT right = xMax;

    // If there's nothing to do, simply return.
    if(right < left)
    {
        return Ok;
    }
    ASSERT(right >= left);

    // Make sure the caller clipped correctly - we can't handle
    // being called to draw outside our destination rectangle.
    ASSERT(y >= iytop);

    // Compute the kernel center for this y coordinate relative to the first
    // y coordinate (y coordinate corresponding to DstRect.Y) and offset
    // by the source rectangle. The sign of yscale selects the stepping
    // direction through the source.
    FIX16 kc;
    if(yscale < 0)
    {
        kc = ykci - (y - iytop) * (-yscale);
    }
    else
    {
        kc = ykci + (y - iytop) * yscale;
    }

    // Center of the filter kernel.
    // Shift over to the left by half because we want to center the area of
    // contribution for each sample on the sample - rather than taking the
    // area between two point samples as the contribution for the sample on
    // the right.
    kc -= FIX16_HALF;

    // Compute the start and end of the filter kernel coverage (inclusive
    // source scanline indices under the kernel).
    // NOTE(review): kb/ke hold integer scanline indices; FIX16 here is
    // presumably a plain INT typedef - confirm against the FIX16 definition.
    FIX16 kb = GpFix16Ceiling(kc-yw);
    FIX16 ke = GpFix16Ceiling(kc+yw);

    // Get the source pointer.
    ARGB *srcPtr0 = static_cast<ARGB*> (BmpData.Scan0);
    INT stride = BmpData.Stride/sizeof(ARGB);

    ARGB *src;
    ARGB *dst;

    // pc      - per-scanline kernel coefficient.
    // pa/pa_old/krn - running state for the incremental ComputeKernel macro.
    // ta,tr,tg,tb   - per-channel accumulators for the non-MMX path.
    // NOTE(review): kp and pix appear unused in this function - possibly
    // retained for macro compatibility; confirm before removing.
    FIX16 pc, kp, pa, pa_old;
    FIX16 ta, tr, tg, tb;
    ARGB pix;
    INT k, x, kmod;
    FIX16 krn = 0;

    // If there was a last_k before this iteration (i.e. this is not the
    // first span), compute the new xbuffer_start_scanline so that already
    // x-scaled scanlines in the rotational buffer line up with kb.
    if(last_k != LAST_K_UNUSED)
    {
        // If there is no overlap in the rotational buffer from the
        // last time, initialize the rotational buffer to the start.
        if(yscale < 0)
        {
            // Negative y scale.
            if(ke-last_k < 0)
            {
                xbuffer_start_scanline = 0;
            }
            else
            {
                xbuffer_start_scanline -= last_k-kb;
                if(xbuffer_start_scanline < 0)
                {
                    // Wrap back into the circular buffer range.
                    xbuffer_start_scanline += xbuffer_height;
                }
            }
        }
        else
        {
            // Positive y scale.
            if(last_k-kb < 0)
            {
                xbuffer_start_scanline = 0;
            }
            else
            {
                // Figure out where to start in the xbuffer so that we
                // can reuse the already scaled scanlines.
                xbuffer_start_scanline -= (last_k-kb)+1;
                if(xbuffer_start_scanline < 0)
                {
                    xbuffer_start_scanline += xbuffer_height;
                }
            }
        }
    }
    else
    {
        // This should be the first time we're hitting this
        // routine. xbuffer_start_scanline should be properly
        // initialized.
        ASSERT(xbuffer_start_scanline == 0);
    }

    // Make sure we're going to access valid memory in the xbuffer.
    ASSERT(xbuffer_start_scanline >= 0);
    ASSERT(xbuffer_start_scanline < xbuffer_height);

    // !!! [asecchia] if we thought about it some, we could probably
    // import the code in StretchScanline into this loop
    // and merge this and the next loop significantly reducing the memory
    // requirements for the xbuffer.

    // The xbuffer_height should be == (ke-kb)+1 for all cases except when
    // the center (kc) is exactly on an integer in which case the first and
    // last entries under the kernel have a contribution of zero so it doesn't
    // matter if we drop one scanline in that case.

    // Start at the position we left off from the previous scanline. Use the
    // rotational buffer to remember the data from the previous scanline work.

    // HighQualityBicubic needs to initialize the krn value.
    // It is used to do the kernel table lookup.
    // HighQualityBilinear doesn't use this as it works out its
    // kernel by direct computation.
    // Note: this is a template class so the value of FilterMode
    // is defined at compile time. We're relying on the compiler
    // to perform dead code removal for each template instantiation
    // eliminating both the constant comparison and all the
    // code branches corresponding to other FilterMode values.
    // That way our inner loop is not impacted by extra code for
    // filter modes we're not using and extraneous conditional
    // statements.
    krn = Int32x32Mod16(ya, (kb << FIX16_SHIFT) - kc);
    pa_old = 0;

    // First pass: make sure every scanline under the y kernel is x-scaled
    // into the xbuffer, and compute its y-kernel coefficient into ycoeff.
    for(k=0; k<xbuffer_height; k++)
    {
        kmod = xbuffer_start_scanline + k;
        if(kmod >= xbuffer_height) kmod -= xbuffer_height;

        // We avoid using a mod (%) computation above because we
        // know that the xbuffer_start_scanline is always within
        // the range 0..xbuffer_height-1.
        // ASSERT that this assumption is true.
        ASSERT(kmod < xbuffer_height);
        ASSERT(kmod >= 0);

        // Compute the kernel response for this pixel based on the
        // positive value of kp
        if(kb+k>ke)
        {
            // The buffer could be larger than the actual kernel,
            // in that case, simply set the extra coefficients to
            // zero.
            ycoeff[kmod] = 0;
        }
        else
        {
            ComputeKernel(ycoeff[kmod], ya, pa, pa_old, krn);
        }

        // Compute the position in the destination buffer to draw to.
        dst = xbuffer + xbuffer_width * kmod;

        // This assert fires if the arithmetic for computing the size of the
        // xbuffer or the iteration over the kernel support has a bug. The
        // xbuffer_height should be the maximum width of the kernel support.
        ASSERT(k < xbuffer_height);
        ASSERT(kmod < xbuffer_height);

        INT k_wrap = kb+k;

        // NTRAID#NTBUG9-370168-2001/04/18-asecchia
        // This is an unsigned/signed comparison.
        // NOTE: the (INT) cast is the invalid one. BmpData.Height is UINT
        // and is always positive - casting it to int is irrelevant.
        // However, the k_wrap is signed and _can_ be negative. The unsigned
        // cast is by design - it allows us to figure out both sides of the
        // wrap using one comparison.
        // The unsigned comparison >= Height tells us if k_wrap does not fall
        // within the range 0..Height-1 and therefore needs wrapping because
        // negative numbers cast to huge positive numbers and succeed the
        // comparison too.
        // NOTE also that this kind of comparison limits the effective range
        // of Height to (max unsigned)/2 with the single caveat of k_wrap being
        // equal to -MAXINT.
        // For code that's executed once per scanline, this kind of subtlety
        // is probably not warranted.
        if((UINT)(k_wrap) >= (INT)BmpData.Height)
        {
            // Handle the wrap mode here.
            if(WrapZeroClamp)
            {
                // GpMemset(dst, 0, (right-left)*sizeof(ARGB));
                // If we're filling with zero, we may as well optimize the kernel
                // contribution.
                ycoeff[kmod] = 0;

                // done this scan - go on to the next
                continue;
            }
            else
            {
                if(QWrapMode == WrapModeClamp)
                {
                    // Fill this xbuffer scanline with the clamp color.
                    INT i = right-left;
                    ARGB *d = dst;
                    while(i--)
                    {
                        *d++ = ClampColor;
                    }

                    // done this scan - go on to the next
                    continue;
                }
                else
                {
                    // Apply the general wrap code.
                    Apply1DWrapModeY(QWrapMode, k_wrap, (INT)BmpData.Height);
                    src = srcPtr0 + stride*k_wrap;
                    // Not done yet - fall through and call StretchScanline.
                }
            }
        }
        else
        {
            // If the x2 and x1 are out of order, we failed to correctly
            // compute the span in the above logic.
            // Seek to the start of the scanline.
            // Note: whatever X coordinate we add to the src pointer
            // we need to subtract from the width passed to the
            // StretchScanline routine below.
            src = srcPtr0 + stride*(k_wrap);
        }

        // Only x-scale if we haven't already done this scanline on a previous
        // call and stored the result in the xbuffer.
        if((last_k==LAST_K_UNUSED) || (
            (yscale >= 0) && (last_k-(kb+k) < 0) ||
            (yscale < 0) && (last_k-(kb+k) > 0)
            )
        )
        {
            // Filter in the x-dimension.
            StretchScanline(
                dst,
                src,
                xbuffer_width,
                static_cast<INT>(BmpData.Width),
                xkci,
                xscale,
                xw,
                xa
            );
        }
    }

    // Set up the last_k for the next iteration. This represents the last
    // scanline for which we actually have x-scaled data.
    if(yscale < 0)
    {
        last_k = kb;
    }
    else
    {
        last_k = kb + xbuffer_height - 1;
    }

    // Get the final destination buffer
    ARGB *buffer = Scan->NextBuffer(left, y, width);

    // Now we have the entire buffer full with the x-dimension scaled data.
    // For every x coordinate, apply the y kernel.

#ifdef _X86_
    if (OSInfo::HasMMX)
    {
        // MMX path.
        // Align the MMX coefficient pointer to an 8-byte boundary within
        // the ycoeff array.
        INT *ycoeffMMX = (INT *) ((((UINT_PTR) ycoeff) + 4) & ~0x7);
        INT n = (xbuffer_height + 1) >> 1;

        // Transform the kernel coefficient array into a form that is
        // easily usable by MMX code (two packed 16-bit coefficients,
        // duplicated per qword). The loop must go backward so that
        // we don't erase kernel coefficients (MMX starting point could
        // be 4 bytes ahead of integer starting point).
        // ycoeff must be large enough to hold the MMX coefficients (2 extra
        // entries)
        for (INT i = n-1; i >= 0; i--)
        {
            INT kernel1 = ycoeff[i * 2] >> 2;
            INT kernel2 = ycoeff[i * 2 + 1] >> 2;
            INT kernelMMX = (kernel1 & 0xFFFF) | (kernel2 << 16);
            ycoeffMMX[i * 2] = kernelMMX;
            ycoeffMMX[i * 2 + 1] = kernelMMX;
        }

        for(x=0; x<width; x++)
        {
            // Iterate over every point under the kernel.
            // Note we don't need the kmod arithmetic here because
            // we've precomputed the coefficients and we don't care what order
            // we sum them.
            BYTE *kptr = (BYTE*)(xbuffer + x + (left - ixleft));

            // Compute the increment in bytes to move from the current scanline
            // to the next in the xbuffer.
            INT kptr_inc_MMX = xbuffer_width*sizeof(ARGB);

            // Process scanline pairs; bos_count holds the odd leftover.
            INT bos_count = xbuffer_height;
            INT bos_half_count = bos_count >> 1;
            bos_count &= 0x1;

            _asm
            {
                // eax - kptr
                // ebx - kptr_inc
                // ecx - counter
                // esi - ycoeff current pointer
                //
                // mm6 - green ; blue accumulators (14-bit fixed point)
                // mm7 - alpha ; red accumulators (14-bit fixed point)
                pxor mm0, mm0
                movq mm6, FIX14_HALF_MMX
                movq mm7, mm6
                mov eax, kptr
                mov ebx, kptr_inc_MMX
                mov ecx, bos_half_count
                mov esi, ycoeffMMX
                dec ecx
                jl bicubic_output_span_loop_last_pixel
            bicubic_output_span_loop:
                // Accumulate two scanlines' pixels per iteration.
                movd mm2, [eax]          // mm2 = 00000000a1r1g1b1
                movd mm4, [eax + ebx]
                punpcklbw mm2, mm0       // mm2 = 00a100r100g100b1
                movq mm1, [esi]          // mm1 = kernel2 | kernel1 | kernel2 | kernel1
                punpcklbw mm4, mm0       // mm4 = 00a200r200g200b2
                movq mm3, mm2            // mm3 = 00a100r100g100b1
                punpcklwd mm2, mm4       // mm2 = 00g200g100b200b1
                add esi, 8
                pmaddwd mm2, mm1
                punpckhwd mm3, mm4       // mm3 = 00a200a100r200r1
                paddd mm6, mm2
                dec ecx
                pmaddwd mm3, mm1
                lea eax, [eax + 2 * ebx] // does not affect flags
                paddd mm7, mm3
                jge bicubic_output_span_loop
            bicubic_output_span_loop_last_pixel:
                // Handle the odd trailing scanline, if any.
                dec bos_count
                jl bicubic_output_span_loop_done
                movd mm2, [eax]          // mm2 = 00000000aarrggbb
                punpcklbw mm2, mm0       // mm2 = 00aa00rr00gg00bb
                movq mm3, mm2
                punpcklwd mm2, mm0       // mm2 = 000000gg000000bb
                movq mm1, [esi]          // mm1 = xxxx | kernel1 | xxxx | kernel1
                punpckhwd mm3, mm0       // mm3 = 000000aa000000rr
                pmaddwd mm2, mm1
                pmaddwd mm3, mm1
                paddd mm6, mm2
                paddd mm7, mm3
            bicubic_output_span_loop_done:
                // At this point, mm6 and mm7 contain the output channels
                // for the pixel. We need to clamp the color channels to the
                // alpha (premultiplied clamp) and store the pixel in the
                // destination buffer.
                psrad mm6, 14
                psrad mm7, 14
                packssdw mm6, mm7        // mm6 = a | r | g | b
                packuswb mm6, mm6        // mm6 = 00000000aarrggbb
                movq mm7, mm6            // mm7 = 00000000aarrggbb
                psrad mm6, 24            // mm6 = xxxxxxxxxxxxxxaa
                mov eax, 0xFFFFFFFF
                punpcklbw mm6, mm6       // mm6 = xxxxxxxxxxxxaaaa
                movd mm2, eax
                punpcklbw mm6, mm6       // mm6 = xxxxxxxxaaaaaaaa
                psubusb mm2, mm6
                mov eax, buffer
                paddusb mm7, mm2
                psubusb mm7, mm2
                movd [eax], mm7
                add eax, 4
                mov buffer, eax
            }
        }
    }
    else
#endif // _X86_
    {
        // No MMX
        for(x=0; x<width; x++)
        {
            // Initialize the component accumulators. We accumulate the
            // contribution of each color component scaled by the kernel
            // response into these variables.
            ta = tr = tg = tb = 0;

            // Iterate over every point under the kernel.
            // Note we don't need the kmod arithmetic here because
            // we've precomputed the coefficients and we don't care what order
            // we sum them.
            BYTE *kptr = (BYTE*)(xbuffer + x + (left - ixleft));

            // Compute the increment in bytes to move from the current scanline
            // to the next in the xbuffer after incrementing through 3 of the
            // color channels.
            INT kptr_inc = xbuffer_width*sizeof(ARGB);

            for(k=0; k<xbuffer_height; k++)
            {
                // Find the pixel contributing to this part of the kernel
                // taking into account the edge conditions.
                // Lookup the kernel coefficient for this scanline.
                pc = ycoeff[k];

                // Accumulate the contribution of this source pixel to the pixel
                // we're working on.
                AccumulateChannels(pc, kptr);
                kptr += kptr_inc;
            }

            // Done with this pixel - store it in the destination buffer.
            // Clamp the results to byte range.
            ClampColors();

            // Combine the channels, set the destination pixel and increment
            // to the next pixel
            *buffer++ = (ta << 24) | (tr << 16) | (tg << 8) | tb;
        }
    }

#ifdef _X86_
    if (OSInfo::HasMMX)
    {
        // Leave the MMX state so subsequent FPU code works.
        _asm
        {
            emms
        }
    }
#endif // _X86_

    return Ok;
}
  2243. #undef ClampColors