You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2785 lines
98 KiB
2785 lines
98 KiB
/**************************************************************************
|
|
*
|
|
* Copyright (c) 2000 Microsoft Corporation
|
|
*
|
|
* Module Name & Abstract
|
|
*
|
|
* Stretch. This module contains the code to do various stretching
|
|
* by applying a kernel filter. The code correctly handles minification.
|
|
*
|
|
* Note:
|
|
* This module is not compiled into an .obj file, rather it is included
|
|
* directly into the header file stretch.hpp.
|
|
* This is due to the use of template functions.
|
|
*
|
|
*
|
|
* Notes:
|
|
*
|
|
* This code does not handle rotation or shear.
|
|
*
|
|
* Created:
|
|
*
|
|
* 04/17/2000 asecchia
|
|
* Created it.
|
|
*
|
|
**************************************************************************/
|
|
|
|
#define LAST_K_UNUSED ((INT)0x7fffffff)
|
|
|
|
const INT BicubicKernelShift = 7;
|
|
const INT BicubicKernelSize = 1 << BicubicKernelShift;
|
|
const FIX16 BicubicKernel[BicubicKernelSize+1] =
|
|
{
|
|
65536, 65496, 65379, 65186, 64920, 64583, 64177, 63705,
|
|
63168, 62569, 61911, 61195, 60424, 59600, 58725, 57802,
|
|
56832, 55818, 54763, 53668, 52536, 51369, 50169, 48939,
|
|
47680, 46395, 45087, 43757, 42408, 41042, 39661, 38268,
|
|
36864, 35452, 34035, 32614, 31192, 29771, 28353, 26941,
|
|
25536, 24141, 22759, 21391, 20040, 18708, 17397, 16110,
|
|
14848, 13614, 12411, 11240, 10104, 9005, 7945, 6927,
|
|
5952, 5023, 4143, 3313, 2536, 1814, 1149, 544,
|
|
0, -496, -961, -1395, -1800, -2176, -2523, -2843,
|
|
-3136, -3403, -3645, -3862, -4056, -4227, -4375, -4502,
|
|
-4608, -4694, -4761, -4809, -4840, -4854, -4851, -4833,
|
|
-4800, -4753, -4693, -4620, -4536, -4441, -4335, -4220,
|
|
-4096, -3964, -3825, -3679, -3528, -3372, -3211, -3047,
|
|
-2880, -2711, -2541, -2370, -2200, -2031, -1863, -1698,
|
|
-1536, -1378, -1225, -1077, -936, -802, -675, -557,
|
|
-448, -349, -261, -184, -120, -69, -31, -8,
|
|
0
|
|
};
|
|
|
|
const FIX16 SymmetricBicubicKernel[BicubicKernelSize * 2 + 1] =
|
|
{
|
|
0,
|
|
-8, -31, -69, -120, -184, -261,-349, -448,
|
|
-557, -675, -802, -936, -1077, -1225, -1378, -1536,
|
|
-1698, -1863, -2031, -2200, -2370, -2541, -2711, -2880,
|
|
-3047, -3211, -3372, -3528, -3679, -3825, -3964, -4096,
|
|
-4220, -4335, -4441, -4536, -4620, -4693, -4753, -4800,
|
|
-4833, -4851, -4854, -4840, -4809, -4761, -4694, -4608,
|
|
-4502, -4375, -4227, -4056, -3862, -3645, -3403, -3136,
|
|
-2843, -2523, -2176, -1800, -1395, -961, -496,
|
|
0,
|
|
544, 1149, 1814, 2536, 3313, 4143, 5023, 5952,
|
|
6927, 7945,9005, 10104, 11240, 12411, 13614, 14848,
|
|
16110, 17397, 18708, 20040, 21391, 22759, 24141, 25536,
|
|
26941, 28353, 29771, 31192, 32614, 34035, 35452, 36864,
|
|
38268, 39661, 41042, 42408, 43757, 45087, 46395, 47680,
|
|
48939, 50169, 51369, 52536, 53668, 54763, 55818, 56832,
|
|
57802, 58725, 59600, 60424, 61195, 61911, 62569, 63168,
|
|
63705, 64177, 64583, 64920, 65186, 65379, 65496,
|
|
65536,
|
|
65496, 65379, 65186, 64920, 64583, 64177, 63705,
|
|
63168, 62569, 61911, 61195, 60424, 59600, 58725, 57802,
|
|
56832, 55818, 54763, 53668, 52536, 51369, 50169, 48939,
|
|
47680, 46395, 45087, 43757, 42408, 41042, 39661, 38268,
|
|
36864, 35452, 34035, 32614, 31192, 29771, 28353, 26941,
|
|
25536, 24141, 22759, 21391, 20040, 18708, 17397, 16110,
|
|
14848, 13614, 12411, 11240, 10104, 9005, 7945, 6927,
|
|
5952, 5023, 4143, 3313, 2536, 1814, 1149, 544,
|
|
0,
|
|
-496, -961, -1395, -1800, -2176, -2523, -2843,
|
|
-3136, -3403, -3645, -3862, -4056, -4227, -4375, -4502,
|
|
-4608, -4694, -4761, -4809, -4840, -4854, -4851, -4833,
|
|
-4800, -4753, -4693, -4620, -4536, -4441, -4335, -4220,
|
|
-4096, -3964, -3825, -3679, -3528, -3372, -3211, -3047,
|
|
-2880, -2711, -2541, -2370, -2200, -2031, -1863, -1698,
|
|
-1536, -1378, -1225, -1077, -936, -802, -675, -557,
|
|
-448, -349, -261, -184, -120, -69, -31, -8,
|
|
0
|
|
};
|
|
|
|
/*
|
|
// Higher precision bicubic kernel - more data.
|
|
// Commented out in case we eventually need it.
|
|
const FIX16 BK[512+1] =
|
|
{
|
|
0,
|
|
-2, -8, -18, -31, -48, -69, -93, -120,
|
|
-151, -184, -221, -261, -304, -349, -397, -448,
|
|
-501, -557, -615, -675, -737, -802, -868, -936,
|
|
-1006, -1077, -1150, -1225, -1301, -1378, -1457, -1536,
|
|
-1616, -1698, -1780, -1863, -1947, -2031, -2115, -2200,
|
|
-2285, -2370, -2456, -2541, -2626, -2711, -2796, -2880,
|
|
-2964, -3047, -3129, -3211, -3292, -3372, -3450, -3528,
|
|
-3604, -3679, -3753, -3825, -3895, -3964, -4031, -4096,
|
|
-4159, -4220, -4279, -4335, -4389, -4441, -4490, -4536,
|
|
-4580, -4620, -4658, -4693, -4725, -4753, -4778, -4800,
|
|
-4818, -4833, -4844, -4851, -4854, -4854, -4849, -4840,
|
|
-4827, -4809, -4787, -4761, -4730, -4694, -4654, -4608,
|
|
-4557, -4502, -4441, -4375, -4304, -4227, -4144, -4056,
|
|
-3962, -3862, -3757, -3645, -3527, -3403, -3273, -3136,
|
|
-2993, -2843, -2686, -2523, -2353, -2176, -1991, -1800,
|
|
-1601, -1395, -1182, -961, -732, -496, -252,
|
|
0,
|
|
264, 544, 839, 1149, 1474, 1814, 2168, 2536,
|
|
2918, 3313, 3722, 4143, 4577, 5023, 5482, 5952,
|
|
6434, 6927, 7430, 7945, 8470, 9005, 9550, 10104,
|
|
10668, 11240, 11821, 12411, 13009, 13614, 14228, 14848,
|
|
15475, 16110, 16750, 17397, 18050, 18708, 19371, 20040,
|
|
20713, 21391, 22073, 22759, 23449, 24141, 24837, 25536,
|
|
26237, 26941, 27646, 28353, 29061, 29771, 30481, 31192,
|
|
31903, 32614, 33325, 34035, 34744, 35452, 36159, 36864,
|
|
37567, 38268, 38966, 39661, 40353, 41042, 41727, 42408,
|
|
43085, 43757, 44425, 45087, 45744, 46395, 47041, 47680,
|
|
48313, 48939, 49557, 50169, 50773, 51369, 51957, 52536,
|
|
53107, 53668, 54220, 54763, 55296, 55818, 56331, 56832,
|
|
57322, 57802, 58269, 58725, 59169, 59600, 60018, 60424,
|
|
60816, 61195, 61560, 61911, 62248, 62569, 62876, 63168,
|
|
63444, 63705, 63949, 64177, 64388, 64583, 64760, 64920,
|
|
65062, 65186, 65292, 65379, 65447, 65496, 65526,
|
|
65536,
|
|
65526, 65496, 65447, 65379, 65292, 65186, 65062, 64920,
|
|
64760, 64583, 64388, 64177, 63949, 63705, 63444, 63168,
|
|
62876, 62569, 62248, 61911, 61560, 61195, 60816, 60424,
|
|
60018, 59600, 59169, 58725, 58269, 57802, 57322, 56832,
|
|
56331, 55818, 55296, 54763, 54220, 53668, 53107, 52536,
|
|
51957, 51369, 50773, 50169, 49557, 48939, 48313, 47680,
|
|
47041, 46395, 45744, 45087, 44425, 43757, 43085, 42408,
|
|
41727, 41042, 40353, 39661, 38966, 38268, 37567, 36864,
|
|
36159, 35452, 34744, 34035, 33325, 32614, 31903, 31192,
|
|
30481, 29771, 29061, 28353, 27646, 26941, 26237, 25536,
|
|
24837, 24141, 23449, 22759, 22073, 21391, 20713, 20040,
|
|
19371, 18708, 18050, 17397, 16750, 16110, 15475, 14848,
|
|
14228, 13614, 13009, 12411, 11821, 11240, 10668, 10104,
|
|
9550, 9005, 8470, 7945, 7430, 6927, 6434, 5952,
|
|
5482, 5023, 4577, 4143, 3722, 3313, 2918, 2536,
|
|
2168, 1814, 1474, 1149, 839, 544, 264,
|
|
0,
|
|
-252, -496, -732, -961, -1182, -1395, -1601, -1800,
|
|
-1991, -2176, -2353, -2523, -2686, -2843, -2993, -3136,
|
|
-3273, -3403, -3527, -3645, -3757, -3862, -3962, -4056,
|
|
-4144, -4227, -4304, -4375, -4441, -4502, -4557, -4608,
|
|
-4654, -4694, -4730, -4761, -4787, -4809, -4827, -4840,
|
|
-4849, -4854, -4854, -4851, -4844, -4833, -4818, -4800,
|
|
-4778, -4753, -4725, -4693, -4658, -4620, -4580, -4536,
|
|
-4490, -4441, -4389, -4335, -4279, -4220, -4159, -4096,
|
|
-4031, -3964, -3895, -3825, -3753, -3679, -3604, -3528,
|
|
-3450, -3372, -3292, -3211, -3129, -3047, -2964, -2880,
|
|
-2796, -2711, -2626, -2541, -2456, -2370, -2285, -2200,
|
|
-2115, -2031, -1947, -1863, -1780, -1698, -1616, -1536,
|
|
-1457, -1378, -1301, -1225, -1150, -1077, -1006, -936,
|
|
-868, -802, -737, -675, -615, -557, -501, -448,
|
|
-397, -349, -304, -261, -221, -184, -151, -120,
|
|
-93, -69, -48, -31, -18, -8, -2,
|
|
0
|
|
};
|
|
|
|
|
|
// Bicubic kernel with the 'perceptual' coefficient tweaked
|
|
// see Wolberg. Provides a slightly different experience.
|
|
// Commented out in case we eventually need it.
|
|
|
|
const FIX16 BK_V[512+1] =
|
|
{
|
|
|
|
0,
|
|
-4, -16, -35, -62, -96, -137, -185, -240,
|
|
-301, -369, -442, -522, -607, -698, -795, -896,
|
|
-1002, -1114, -1230, -1350, -1475, -1603, -1736, -1872,
|
|
-2012, -2155, -2301, -2450, -2602, -2756, -2913, -3072,
|
|
-3233, -3396, -3560, -3726, -3893, -4061, -4230, -4400,
|
|
-4570, -4741, -4911, -5082, -5252, -5422, -5592, -5760,
|
|
-5927, -6094, -6259, -6422, -6584, -6743, -6901, -7056,
|
|
-7209, -7359, -7506, -7650, -7791, -7928, -8062, -8192,
|
|
-8318, -8440, -8557, -8670, -8778, -8881, -8979, -9072,
|
|
-9159, -9241, -9316, -9386, -9449, -9506, -9557, -9600,
|
|
-9636, -9666, -9688, -9702, -9709, -9707, -9698, -9680,
|
|
-9654, -9619, -9575, -9522, -9460, -9388, -9307, -9216,
|
|
-9115, -9004, -8882, -8750, -8607, -8453, -8288, -8112,
|
|
-7924, -7725, -7513, -7290, -7054, -6806, -6546, -6272,
|
|
-5985, -5686, -5373, -5046, -4706, -4351, -3983, -3600,
|
|
-3203, -2791, -2364, -1922, -1465, -992, -504,
|
|
0,
|
|
516, 1040, 1571, 2110, 2656, 3209, 3769, 4336,
|
|
4909, 5489, 6074, 6666, 7263, 7866, 8475, 9088,
|
|
9706, 10330, 10958, 11590, 12227, 12867, 13512, 14160,
|
|
14812, 15467, 16125, 16786, 17450, 18116, 18785, 19456,
|
|
20129, 20804, 21480, 22158, 22837, 23517, 24198, 24880,
|
|
25562, 26245, 26927, 27610, 28292, 28974, 29656, 30336,
|
|
31015, 31694, 32371, 33046, 33720, 34391, 35061, 35728,
|
|
36393, 37055, 37714, 38370, 39023, 39672, 40318, 40960,
|
|
41598, 42232, 42861, 43486, 44106, 44721, 45331, 45936,
|
|
46535, 47129, 47716, 48298, 48873, 49442, 50005, 50560,
|
|
51108, 51650, 52184, 52710, 53229, 53739, 54242, 54736,
|
|
55222, 55699, 56167, 56626, 57076, 57516, 57947, 58368,
|
|
58779, 59180, 59570, 59950, 60319, 60677, 61024, 61360,
|
|
61684, 61997, 62297, 62586, 62862, 63126, 63378, 63616,
|
|
63841, 64054, 64253, 64438, 64610, 64767, 64911, 65040,
|
|
65155, 65255, 65340, 65410, 65465, 65504, 65528,
|
|
65536,
|
|
65528, 65504, 65465, 65410, 65340, 65255, 65155, 65040,
|
|
64911, 64767, 64610, 64438, 64253, 64054, 63841, 63616,
|
|
63378, 63126, 62862, 62586, 62297, 61997, 61684, 61360,
|
|
61024, 60677, 60319, 59950, 59570, 59180, 58779, 58368,
|
|
57947, 57516, 57076, 56626, 56167, 55699, 55222, 54736,
|
|
54242, 53739, 53229, 52710, 52184, 51650, 51108, 50560,
|
|
50005, 49442, 48873, 48298, 47716, 47129, 46535, 45936,
|
|
45331, 44721, 44106, 43486, 42861, 42232, 41598, 40960,
|
|
40318, 39672, 39023, 38370, 37714, 37055, 36393, 35728,
|
|
35061, 34391, 33720, 33046, 32371, 31694, 31015, 30336,
|
|
29656, 28974, 28292, 27610, 26927, 26245, 25562, 24880,
|
|
24198, 23517, 22837, 22158, 21480, 20804, 20129, 19456,
|
|
18785, 18116, 17450, 16786, 16125, 15467, 14812, 14160,
|
|
13512, 12867, 12227, 11590, 10958, 10330, 9706, 9088,
|
|
8475, 7866, 7263, 6666, 6074, 5489, 4909, 4336,
|
|
3769, 3209, 2656, 2110, 1571, 1040, 516,
|
|
0,
|
|
-504, -992, -1465, -1922, -2364, -2791, -3203, -3600,
|
|
-3983, -4351, -4706, -5046, -5373, -5686, -5985, -6272,
|
|
-6546, -6806, -7054, -7290, -7513, -7725, -7924, -8112,
|
|
-8288, -8453, -8607, -8750, -8882, -9004, -9115, -9216,
|
|
-9307, -9388, -9460, -9522, -9575, -9619, -9654, -9680,
|
|
-9698, -9707, -9709, -9702, -9688, -9666, -9636, -9600,
|
|
-9557, -9506, -9449, -9386, -9316, -9241, -9159, -9072,
|
|
-8979, -8881, -8778, -8670, -8557, -8440, -8318, -8192,
|
|
-8062, -7928, -7791, -7650, -7506, -7359, -7209, -7056,
|
|
-6901, -6743, -6584, -6422, -6259, -6094, -5927, -5760,
|
|
-5592, -5422, -5252, -5082, -4911, -4741, -4570, -4400,
|
|
-4230, -4061, -3893, -3726, -3560, -3396, -3233, -3072,
|
|
-2913, -2756, -2602, -2450, -2301, -2155, -2012, -1872,
|
|
-1736, -1603, -1475, -1350, -1230, -1114, -1002, -896,
|
|
-795, -698, -607, -522, -442, -369, -301,
|
|
-240, -185, -137, -96, -62, -35, -16, -4,
|
|
0
|
|
};
|
|
*/
|
|
|
|
|
|
// This is the table of partial sums of the bilinear kernel.
|
|
// Simply put, each point in the array represents the integral
|
|
// from -infinity to position x in the kernel function.
|
|
// We can subtract two table lookups to get the integral
|
|
// of the kernel (area) between the two points.
|
|
// The table is padded with zeros and ones at the beginning and end
|
|
// so we can consistently address areas outside of the actual kernel
|
|
// Currently we don't make use of the zeros at the beginning but
|
|
// we definitely sample past the end by at least one half-width
|
|
// of the kernel.
|
|
|
|
const FIX16 BilinearPartialIntegral[512+1] =
|
|
{
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
|
|
0,2, 8, 18, 32, 50, 72, 98,
|
|
128, 162, 200, 242, 288, 338, 392, 450,
|
|
512, 578, 648, 722, 800, 882, 968, 1058,
|
|
1152, 1250, 1352, 1458, 1568, 1682, 1800, 1922,
|
|
2048, 2178, 2312, 2450, 2592, 2738, 2888, 3042,
|
|
3200, 3362, 3528, 3698, 3872, 4050, 4232, 4418,
|
|
4608, 4802, 5000, 5202, 5408, 5618, 5832, 6050,
|
|
6272, 6498, 6728, 6962, 7200, 7442, 7688, 7938,
|
|
8192, 8450, 8712, 8978, 9248, 9522, 9800, 10082,
|
|
10368, 10658, 10952, 11250, 11552, 11858, 12168, 12482,
|
|
12800, 13122, 13448, 13778, 14112, 14450, 14792, 15138,
|
|
15488, 15842, 16200, 16562, 16928, 17298, 17672, 18050,
|
|
18432, 18818, 19208, 19602, 20000, 20402, 20808, 21218,
|
|
21632, 22050, 22472, 22898, 23328, 23762, 24200, 24642,
|
|
25088, 25538, 25992, 26450, 26912, 27378, 27848, 28322,
|
|
28800, 29282, 29768, 30258, 30752, 31250, 31752, 32258,
|
|
|
|
32768, // center of the kernel. Index 256
|
|
|
|
33278, 33784, 34286, 34784, 35278, 35768, 36254, 36736,
|
|
37214, 37688, 38158, 38624, 39086, 39544, 39998, 40448,
|
|
40894, 41336, 41774, 42208, 42638, 43064, 43486, 43904,
|
|
44318, 44728, 45134, 45536, 45934, 46328, 46718, 47104,
|
|
47486, 47864, 48238, 48608, 48974, 49336, 49694, 50048,
|
|
50398, 50744, 51086, 51424, 51758, 52088, 52414, 52736,
|
|
53054, 53368, 53678, 53984, 54286, 54584, 54878, 55168,
|
|
55454, 55736, 56014, 56288, 56558, 56824, 57086, 57344,
|
|
57598, 57848, 58094, 58336, 58574, 58808, 59038, 59264,
|
|
59486, 59704, 59918, 60128, 60334, 60536, 60734, 60928,
|
|
61118, 61304, 61486, 61664, 61838, 62008, 62174, 62336,
|
|
62494, 62648, 62798, 62944, 63086, 63224, 63358, 63488,
|
|
63614, 63736, 63854, 63968, 64078, 64184, 64286, 64384,
|
|
64478, 64568, 64654, 64736, 64814, 64888, 64958, 65024,
|
|
65086, 65144, 65198, 65248, 65294, 65336, 65374, 65408,
|
|
65438, 65464, 65486, 65504, 65518, 65528, 65534, 65536,
|
|
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
};
|
|
|
|
// This is the table of partial sums of the bicubic kernel.
|
|
// Simply put, each point in the array represents the integral
|
|
// from -infinity to position x in the kernel function.
|
|
// We can subtract two table lookups to get the integral
|
|
// of the kernel (area) between the two points.
|
|
// The table is padded with zeros and ones at the beginning and end
|
|
// so we can consistently address areas outside of the actual kernel
|
|
// Currently we don't make use of the zeros at the beginning but
|
|
// we definitely sample past the end by at least one half-width
|
|
// of the kernel.
|
|
|
|
const FIX16 BicubicPartialIntegral[1024+1] =
|
|
{
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
0, 0, 0, 0, 0, 0, 0, 0,
|
|
|
|
0, 0, 0, 0, -1, -2, -3, -4,
|
|
-6, -8, -11, -15, -19, -24, -29, -35,
|
|
-42, -50, -59, -68, -79, -90, -103, -117,
|
|
-131, -147, -164, -182, -201, -221, -243, -265,
|
|
-289, -315, -341, -369, -398, -429, -460, -493,
|
|
-528, -563, -600, -639, -679, -720, -762, -806,
|
|
-851, -897, -945, -993, -1044, -1095, -1148, -1202,
|
|
-1257, -1313, -1371, -1429, -1489, -1550, -1612, -1675,
|
|
-1739, -1804, -1870, -1937, -2004, -2073, -2142, -2212,
|
|
-2283, -2355, -2427, -2500, -2573, -2647, -2721, -2796,
|
|
-2871, -2946, -3022, -3097, -3173, -3249, -3325, -3401,
|
|
-3476, -3552, -3627, -3702, -3776, -3850, -3923, -3996,
|
|
-4068, -4139, -4209, -4279, -4347, -4414, -4481, -4545,
|
|
-4609, -4671, -4731, -4790, -4847, -4902, -4955, -5006,
|
|
-5055, -5102, -5146, -5188, -5228, -5264, -5298, -5329,
|
|
-5358, -5383, -5404, -5423, -5438, -5449, -5457, -5461,
|
|
-5461, -5457, -5449, -5437, -5420, -5399, -5374, -5345,
|
|
-5311, -5273, -5230, -5182, -5130, -5073, -5012, -4946,
|
|
-4875, -4799, -4718, -4633, -4542, -4447, -4346, -4240,
|
|
-4130, -4014, -3893, -3767, -3636, -3500, -3358, -3212,
|
|
-3060, -2902, -2740, -2572, -2399, -2220, -2037, -1848,
|
|
-1653, -1454, -1249, -1038, -822, -601, -375, -143,
|
|
94, 336, 584, 836, 1095, 1358, 1627, 1901,
|
|
2180, 2464, 2754, 3048, 3348, 3653, 3963, 4278,
|
|
4598, 4923, 5253, 5588, 5927, 6272, 6621, 6975,
|
|
7334, 7698, 8066, 8439, 8816, 9198, 9584, 9975,
|
|
10370, 10769, 11173, 11580, 11992, 12408, 12828, 13252,
|
|
13679, 14111, 14546, 14985, 15427, 15873, 16322, 16775,
|
|
17231, 17690, 18152, 18618, 19086, 19557, 20032, 20508,
|
|
20988, 21470, 21954, 22441, 22930, 23421, 23914, 24409,
|
|
24906, 25405, 25905, 26407, 26911, 27415, 27921, 28428,
|
|
28937, 29446, 29955, 30466, 30977, 31488, 32000, 32512,
|
|
|
|
33024, // center of the kernel. Index 512
|
|
|
|
33536, 34048, 34559, 35070, 35581, 36090, 36599, 37108,
|
|
37615, 38121, 38625, 39129, 39631, 40131, 40630, 41127,
|
|
41622, 42115, 42606, 43095, 43582, 44066, 44548, 45028,
|
|
45504, 45979, 46450, 46918, 47384, 47846, 48305, 48761,
|
|
49214, 49663, 50109, 50551, 50990, 51425, 51857, 52284,
|
|
52708, 53128, 53544, 53956, 54363, 54767, 55166, 55561,
|
|
55952, 56338, 56720, 57097, 57470, 57838, 58202, 58561,
|
|
58915, 59264, 59609, 59948, 60283, 60613, 60938, 61258,
|
|
61573, 61883, 62188, 62488, 62782, 63072, 63356, 63635,
|
|
63909, 64178, 64441, 64700, 64952, 65200, 65442, 65679,
|
|
65911, 66137, 66358, 66574, 66785, 66990, 67189, 67384,
|
|
67573, 67756, 67935, 68108, 68276, 68438, 68596, 68748,
|
|
68894, 69036, 69172, 69303, 69429, 69550, 69666, 69776,
|
|
69882, 69983, 70078, 70169, 70254, 70335, 70411, 70482,
|
|
70548, 70609, 70666, 70718, 70766, 70809, 70847, 70881,
|
|
70910, 70935, 70956, 70973, 70985, 70993, 70997, 70997,
|
|
70993, 70985, 70974, 70959, 70940, 70919, 70894, 70865,
|
|
70834, 70800, 70764, 70724, 70682, 70638, 70591, 70542,
|
|
70491, 70438, 70383, 70326, 70267, 70207, 70145, 70081,
|
|
70017, 69950, 69883, 69815, 69745, 69675, 69604, 69532,
|
|
69459, 69386, 69312, 69238, 69163, 69088, 69012, 68937,
|
|
68861, 68785, 68709, 68633, 68558, 68482, 68407, 68332,
|
|
68257, 68183, 68109, 68036, 67963, 67891, 67819, 67748,
|
|
67678, 67609, 67540, 67473, 67406, 67340, 67275, 67211,
|
|
67148, 67086, 67025, 66965, 66907, 66849, 66793, 66738,
|
|
66684, 66631, 66580, 66529, 66481, 66433, 66387, 66342,
|
|
66298, 66256, 66215, 66175, 66136, 66099, 66064, 66029,
|
|
65996, 65965, 65934, 65905, 65877, 65851, 65825, 65801,
|
|
65779, 65757, 65737, 65718, 65700, 65683, 65667, 65653,
|
|
65639, 65626, 65615, 65604, 65595, 65586, 65578, 65571,
|
|
65565, 65560, 65555, 65551, 65547, 65544, 65542, 65540,
|
|
65539, 65538, 65537, 65536, 65536, 65536, 65536, 65536,
|
|
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536,
|
|
};
|
|
|
|
|
|
// We use a biased pointer to the center of the array
|
|
// so that we can look up the negative part of the kernel
|
|
// without repositioning the index or using an absolute value
|
|
// computation in the inner loop.
|
|
|
|
// Linear Partial Integral Center.
|
|
const FIX16 *LPIC = &BilinearPartialIntegral[256];
|
|
|
|
// Cubic Partial Integral Center.
|
|
const FIX16 *CPIC = &BicubicPartialIntegral[512];
|
|
|
|
const FIX16 *SymmetricBicubicKernelCenter = &SymmetricBicubicKernel[128];
|
|
|
|
const ULONGLONG FIX14_HALF_MMX = 0x0000200000002000;
|
|
|
|
/**************************************************************************
|
|
*
|
|
* Function Description:
|
|
*
|
|
* Constructor for the DpOutputSpanStretch class.
|
|
*
|
|
* Return Value:
|
|
*
|
|
* NONE
|
|
*
|
|
* Created:
|
|
*
|
|
* 04/17/2000 asecchia
|
|
* Created it.
|
|
*
|
|
**************************************************************************/
|
|
|
|
#define FIX4TOFIX16_SHIFT (FIX16_SHIFT - FIX4_SHIFT)
|
|
|
|
template<FilterModeType FilterMode>
|
|
void DpOutputSpanStretch<FilterMode>::InitializeClass(
|
|
DpBitmap* bitmap,
|
|
DpScanBuffer * scan,
|
|
DpContext* /*context*/,
|
|
DpImageAttributes imgAttributes,
|
|
const GpRectF *dstRect,
|
|
const GpRectF *srcRect
|
|
)
|
|
{
|
|
isValid = true;
|
|
|
|
// Make sure these get initialized up front before we can early out
|
|
// otherwise we could end up freeing uninitialized pointers in our
|
|
// destructor.
|
|
|
|
ycoeff = NULL;
|
|
xbuffer = NULL;
|
|
|
|
Scan = scan;
|
|
dBitmap = bitmap;
|
|
|
|
QWrapMode = imgAttributes.wrapMode;
|
|
|
|
ClampColor = imgAttributes.clampColor;
|
|
|
|
ClampColorA = (BYTE)( (ClampColor >> 24) );
|
|
ClampColorR = (BYTE)( (ClampColor >> 16) & 0xff);
|
|
ClampColorG = (BYTE)( (ClampColor >> 8) & 0xff);
|
|
ClampColorB = (BYTE)( ClampColor & 0xff);
|
|
|
|
// Accleration for clamp mode with zero clamp color (transparent)
|
|
|
|
WrapZeroClamp = FALSE;
|
|
if((QWrapMode == WrapModeClamp) &&
|
|
(imgAttributes.clampColor == 0))
|
|
{
|
|
WrapZeroClamp = TRUE;
|
|
}
|
|
|
|
|
|
|
|
ASSERT(dBitmap != NULL);
|
|
ASSERT(dBitmap->IsValid());
|
|
|
|
// on bad bitmap, we return with Valid = FALSE
|
|
if (dBitmap == NULL ||
|
|
!dBitmap->IsValid()
|
|
)
|
|
{
|
|
dBitmap = NULL;
|
|
isValid = false;
|
|
return;
|
|
} else {
|
|
BmpData.Width = dBitmap->Width;
|
|
BmpData.Height = dBitmap->Height;
|
|
BmpData.PixelFormat = PIXFMT_32BPP_PARGB;
|
|
BmpData.Stride = dBitmap->Delta;
|
|
BmpData.Scan0 = dBitmap->Bits;
|
|
}
|
|
|
|
if(srcRect)
|
|
SrcRect = *srcRect;
|
|
else
|
|
{
|
|
SrcRect.X = 0.0f;
|
|
SrcRect.Y = 0.0f;
|
|
SrcRect.Width = (REAL)dBitmap->Width;
|
|
SrcRect.Height = (REAL) dBitmap->Height;
|
|
}
|
|
|
|
// Set up the translation.
|
|
if(dstRect)
|
|
{
|
|
DstRect = *dstRect;
|
|
}
|
|
else
|
|
{
|
|
DstRect.X = 0.0f;
|
|
DstRect.Y = 0.0f;
|
|
DstRect.Width = (REAL)SrcRect.Width;
|
|
DstRect.Height = (REAL)SrcRect.Height;
|
|
}
|
|
|
|
|
|
if( !GpValidFixed16(SrcRect.X) ||
|
|
!GpValidFixed16(SrcRect.Y) ||
|
|
!GpValidFixed16(SrcRect.Width) ||
|
|
!GpValidFixed16(SrcRect.Height) ||
|
|
!GpValidFixed16(DstRect.X) ||
|
|
!GpValidFixed16(DstRect.Y) ||
|
|
!GpValidFixed16(DstRect.Width) ||
|
|
!GpValidFixed16(DstRect.Height) )
|
|
{
|
|
// punt
|
|
|
|
isValid = false;
|
|
return;
|
|
}
|
|
|
|
|
|
// Initialize the state for the x-dimension scale.
|
|
|
|
xscale = GpRealToFix16(SrcRect.Width/DstRect.Width);
|
|
xscaleinv = GpRealToFix16(DstRect.Width/SrcRect.Width);
|
|
|
|
// Initialize the state for the y-dimension scale.
|
|
|
|
yscale = GpRealToFix16(SrcRect.Height/DstRect.Height);
|
|
yscaleinv = GpRealToFix16(DstRect.Height/SrcRect.Height);
|
|
|
|
// Compute the destination contribution.
|
|
// Note: the actual pixels touched are the floor of
|
|
// the top left to the ceiling of the bottom right.
|
|
// (modulus the clipping)
|
|
|
|
// Note: We want to be tracking our internal state in FIX16 so we have
|
|
// the extra fractional precision, but when we compute our bounds for the
|
|
// drawing, we use Ceiling and Floor on these FIX16 numbers below. We want
|
|
// the rounding to match the rounding of the FIX4 numbers (i.e. we don't
|
|
// want to track any extra fractional precision errors from the float
|
|
// representation) because we use FIX4 in our DrawImage loop.
|
|
// To accomplish this, we round to FIX4 dropping all error that is smaller
|
|
// than the FIX4 precision and then upconvert to FIX16. Now when we use
|
|
// Fix16Ceiling and Floor, we'll get the same results as Fix4Ceiling and
|
|
// Floor.
|
|
|
|
REAL xinv = DstRect.Width/SrcRect.Width;
|
|
REAL yinv = DstRect.Height/SrcRect.Height;
|
|
|
|
fixDLeft = GpRealToFix4(DstRect.X);
|
|
fixDRight = GpRealToFix4(xinv * (SrcRect.Width) + DstRect.X);
|
|
fixDTop = GpRealToFix4(DstRect.Y);
|
|
fixDBottom = GpRealToFix4(yinv * (SrcRect.Height) + DstRect.Y);
|
|
|
|
// Handle negative scale
|
|
|
|
FIX16 fixTemp;
|
|
|
|
if(fixDLeft > fixDRight)
|
|
{
|
|
// Swap the left and right x coordinates.
|
|
fixTemp = fixDLeft;
|
|
fixDLeft = fixDRight;
|
|
fixDRight = fixTemp;
|
|
}
|
|
|
|
if(fixDTop > fixDBottom)
|
|
{
|
|
// Swap the top and bottom x coordinates.
|
|
fixTemp = fixDTop;
|
|
fixDTop = fixDBottom;
|
|
fixDBottom = fixTemp;
|
|
}
|
|
|
|
// Compute the left edge using the rasterizer rounding rules. Used
|
|
// for clipping in x.
|
|
|
|
ixleft = GpFix4Ceiling(fixDLeft);
|
|
|
|
// Convert up to FIX16.
|
|
|
|
fixDLeft <<= FIX4TOFIX16_SHIFT;
|
|
fixDRight <<= FIX4TOFIX16_SHIFT;
|
|
fixDTop <<= FIX4TOFIX16_SHIFT;
|
|
fixDBottom <<= FIX4TOFIX16_SHIFT;
|
|
|
|
// Get the initial kernel center. This specifies the x-dimension
|
|
// fractional pixel offset.
|
|
|
|
if(xscale < 0)
|
|
{
|
|
xkci = GpRealToFix16(
|
|
(((DstRect.X+DstRect.Width) - GpFix16Ceiling(fixDRight)) *
|
|
(xscale)) / FIX16_ONE +
|
|
SrcRect.X
|
|
);
|
|
}
|
|
else
|
|
{
|
|
xkci = GpRealToFix16(
|
|
((DstRect.X - GpFix16Floor(fixDLeft)) *
|
|
xscale) / FIX16_ONE +
|
|
SrcRect.X
|
|
);
|
|
}
|
|
|
|
// Get the width of the kernel.
|
|
// Make sure to multiply by the actual width of the filter kernel in
|
|
// normalized space (FilterWidth[i])
|
|
|
|
xw = GpRealToFix16(
|
|
(SrcRect.Width*FilterWidth[FilterMode]) /
|
|
DstRect.Width
|
|
); // convert to FIX16
|
|
|
|
// Handle the negative transform
|
|
|
|
if(xscale < 0)
|
|
{
|
|
xw = -xw;
|
|
}
|
|
|
|
// the width of the kernel must be a positive quantity.
|
|
|
|
ASSERT(xw >= 0);
|
|
|
|
// if the width is less than one we're doing a stretch, not a shrink.
|
|
// in this case we clamp the kernel size to one.
|
|
|
|
if(xw < FIX16_ONE * FilterWidth[FilterMode])
|
|
{
|
|
xw = FIX16_ONE * FilterWidth[FilterMode];
|
|
}
|
|
|
|
// a is 1/w - used to work out the tent filter.
|
|
|
|
xa = GpRealToFix16(65536.0f/xw);
|
|
|
|
// Get the initial kernel center. This specifies the y-dimension
|
|
// fractional pixel offset.
|
|
|
|
if(yscale < 0)
|
|
{
|
|
ykci = GpRealToFix16(
|
|
((GpFix16Ceiling(fixDBottom) - (DstRect.Y+DstRect.Height)) *
|
|
(-yscale)) / FIX16_ONE +
|
|
SrcRect.Y
|
|
);
|
|
}
|
|
else
|
|
{
|
|
ykci = GpRealToFix16(
|
|
((GpFix16Floor(fixDTop) - DstRect.Y) *
|
|
yscale) / FIX16_ONE +
|
|
SrcRect.Y
|
|
);
|
|
}
|
|
|
|
// Get the width of the kernel.
|
|
// Make sure to multiply by the actual width of the filter kernel in
|
|
// normalized space (FilterWidth[i])
|
|
|
|
yw = GpRealToFix16(
|
|
(SrcRect.Height * FilterWidth[FilterMode]) /
|
|
DstRect.Height
|
|
); // Convert to FIX16
|
|
|
|
// Handle the negative transform
|
|
|
|
if(yscale < 0)
|
|
{
|
|
yw = -yw;
|
|
}
|
|
|
|
// the width of the kernel must be a positive quantity.
|
|
|
|
ASSERT(yw >= 0);
|
|
|
|
// if the kernel width is less than one we're doing a stretch, not
|
|
// a shrink. In this case we clamp the kernel size to one.
|
|
|
|
if(yw < (FIX16_ONE * FilterWidth[FilterMode]))
|
|
{
|
|
yw = FIX16_ONE * FilterWidth[FilterMode];
|
|
}
|
|
|
|
// a is 1/w - used to work out the tent filter.
|
|
|
|
ya = GpRealToFix16(65536.0f/yw);
|
|
|
|
// !!! [asecchia] The rounding used here should match the rounding used to compute
|
|
// the parameters to StretchBitsMainLoop.
|
|
|
|
iytop = GpFix16Floor(fixDTop);
|
|
|
|
// Compute the width of one scanline in the destination.
|
|
|
|
xbuffer_width = GpFix16Ceiling(fixDRight) - GpFix16Floor(fixDLeft);
|
|
ASSERT(xbuffer_width >= 0);
|
|
|
|
xbuffer_height = GpFix16Ceiling(yw)*2+1;
|
|
ASSERT(xbuffer_height >= 0);
|
|
|
|
// set the rotational array to start at the first scanline.
|
|
|
|
xbuffer_start_scanline = 0;
|
|
|
|
// allocate the xbuffer.
|
|
|
|
// !!! PERF [asecchia]. Ouch this is ugly.
|
|
// we should at least try use a stack buffer for small images.
|
|
// Maybe a lookaside list or something.
|
|
|
|
xbuffer = (ARGB *)GpMalloc(xbuffer_height*xbuffer_width*sizeof(ARGB));
|
|
|
|
// ycoeff needs to have 2 entries more than xbuffer_height because
|
|
// it may be reused to store the MMX coefficients (see OutputSpan
|
|
// routine for details).
|
|
|
|
ycoeff = (FIX16 *)GpMalloc((xbuffer_height + 2) * sizeof(FIX16));
|
|
|
|
if((NULL == ycoeff) || (NULL == xbuffer))
|
|
{
|
|
isValid = false;
|
|
|
|
GpFree(xbuffer);
|
|
GpFree(ycoeff);
|
|
|
|
// Make sure these get initialized to NULL before we can early out
|
|
// otherwise we could end up double freeing the pointers in our
|
|
// destructor.
|
|
|
|
xbuffer = NULL;
|
|
ycoeff = NULL;
|
|
|
|
return;
|
|
}
|
|
|
|
// set the initial value of last_k to maxint
|
|
|
|
last_k = LAST_K_UNUSED;
|
|
}
|
|
|
|
|
|
|
|
/**************************************************************************\
|
|
*
|
|
* Function Description:
|
|
*
|
|
* This function performs a 1d stretch using the tent filter
|
|
*
|
|
* Arguments:
|
|
*
|
|
* dst - destination buffer
|
|
* src - source pixels
|
|
* dw - destination width in pixels
|
|
* sw - source width in pixels
|
|
* kci - the initial kernel centering position (for fractional translate)
|
|
* scale - the scale of the filter - sw/dw
|
|
* w - the width of the filter kernel - typically the ceiling of sw/dw
|
|
* a - 1/w
|
|
*
|
|
* History:
|
|
* 04/16/2000 asecchia created it.
|
|
*
|
|
\**************************************************************************/
|
|
|
|
// !!! Perf [asecchia] For really complicated wrapmodes where many of the
|
|
// pixels are outside of the source and hence need to be wrapped, it may
|
|
// make more sense to copy the source into an extended buffer and pre-wrap
|
|
// the end points (i.e. overallocate) for each scanline.
|
|
// This could simplify the code for the complex wrap conditions.
|
|
// However, for the simple codepath, this would give an extra copy per
|
|
// pixel and might not be worth it.
|
|
|
|
|
|
// Ick. Why does the compiler do a better job of optimizing macros?
// These should really be inline function calls.

// ClampColors: convert the FIX16 channel accumulators (ta, tr, tg, tb in
// the surrounding function) back to byte range after filtering.
//
// HighQualityBilinear: the tent kernel is non-negative, so only rounding
// and an upper clamp at 255 are needed.
//
// HighQualityBicubic: the kernel has negative lobes, so a channel can
// undershoot 0 or overshoot the alpha value. Round, clamp alpha to 255,
// clamp each color channel to alpha (keeping r,g,b <= a - presumably to
// preserve the premultiplied-alpha invariant; confirm against the pixel
// format used by callers), then clamp everything at 0.
//
// Note: the FilterMode comparisons are template-parameter tests and are
// resolved at compile time, so only one branch survives per instantiation.

#define ClampColors() \
if(FilterMode == HighQualityBilinear) \
{ \
    ta = GpFix16Round(ta); \
    tr = GpFix16Round(tr); \
    tg = GpFix16Round(tg); \
    tb = GpFix16Round(tb); \
    if(ta>255) ta = 255; \
    if(tr>255) tr = 255; \
    if(tg>255) tg = 255; \
    if(tb>255) tb = 255; \
} \
if(FilterMode == HighQualityBicubic) \
{ \
    ta = GpFix16Round(ta); \
    tr = GpFix16Round(tr); \
    tg = GpFix16Round(tg); \
    tb = GpFix16Round(tb); \
    if(ta>255) ta = 255; \
    if(tr>ta) tr = ta; \
    if(tg>ta) tg = ta; \
    if(tb>ta) tb = ta; \
    if(ta<0) ta = 0; \
    if(tr<0) tr = 0; \
    if(tg<0) tg = 0; \
    if(tb<0) tb = 0; \
}
|
|
|
|
|
|
// ComputeKernel: compute the weight of one source pixel in the inner loop.
//
//  pc     (out) weight (kernel area) contributed by the current pixel
//  a            FIX16 step through the kernel per source pixel (1/w)
//  pa           scratch: current cumulative-integral table lookup
//  pa_old       previous lookup; caller must prime it before the first
//               use (0 at the left edge of the kernel)
//  krn          current FIX16 position within the kernel; advanced by a
//
// The filter tables (LPIC for the bilinear kernel, CPIC for the bicubic -
// presumably members pointing at precomputed tables of the kernel's
// partial integral from -inf to x; see the integral-evaluation discussion
// in the StretchScanline header comment) are indexed at different
// precisions, hence the >> 9 vs >> 8 shifts. The macro looks up the new
// integral value, subtracts the old one to get the area of contribution
// for this pixel, advances the kernel position, and stores the current
// lookup for reuse on the next pixel.
//
// Note: the If statements are compiled away in the final code because
// they are template-parameter comparisons which can be done at compile
// time.

#define ComputeKernel(pc, a, pa, pa_old, krn) \
if(FilterMode == HighQualityBilinear) \
{ \
    pa = LPIC[krn >> 9]; \
    pc = pa-pa_old; \
    krn += (a); \
    pa_old = pa; \
} \
if(FilterMode == HighQualityBicubic) \
{ \
    pa = CPIC[krn >> 8]; \
    pc = pa-pa_old; \
    krn += (a); \
    pa_old = pa; \
}
|
|
|
|
// AccumulateChannels: accumulate one weighted source pixel into the
// channel accumulators tb, tg, tr, ta of the surrounding function.
//
//  pc   - FIX16 weight for this pixel (from ComputeKernel)
//  kptr - BYTE pointer to the source pixel; byte order is b, g, r, a
//         (matches the (ta<<24)|(tr<<16)|(tg<<8)|tb repack done when the
//         destination pixel is emitted)

#define AccumulateChannels(pc, kptr) \
{ \
    tb += pc * kptr[0]; \
    tg += pc * kptr[1]; \
    tr += pc * kptr[2]; \
    ta += pc * kptr[3]; \
}
|
|
|
|
|
|
|
|
// Map an arbitrary horizontal coordinate x into [0, w) according to the
// given wrap mode. Flip modes mirror every other tile; plain tiling just
// takes the remainder. Modes other than the four tile variants must be
// handled by the caller.

inline void Apply1DWrapModeX(INT WrapMode, INT &x, INT w)
{
    INT rem;

    switch(WrapMode)
    {
    case WrapModeTileFlipY:
    case WrapModeTile:

        // No horizontal mirroring: position within the tile is simply
        // the (non-negative) remainder.

        x = RemainderI(x, w);
        break;

    case WrapModeTileFlipX:
    case WrapModeTileFlipXY:

        // Horizontally mirrored tiling: odd-numbered tiles run backwards.

        rem = RemainderI(x, w);
        x = (((x-rem)/w) & 1) ? (w-1-rem) : rem;
        break;

    default:

        // Caller should correctly anticipate other wrap modes.

        ASSERT(FALSE);
        break;
    }
}
|
|
|
|
// Map an arbitrary vertical coordinate y into [0, h) according to the
// given wrap mode. Flip modes mirror every other tile; plain tiling just
// takes the remainder. Modes other than the four tile variants must be
// handled by the caller.

inline void Apply1DWrapModeY(INT WrapMode, INT &y, INT h)
{
    INT rem;

    switch(WrapMode)
    {
    case WrapModeTile:
    case WrapModeTileFlipX:

        // No vertical mirroring: position within the tile is simply
        // the (non-negative) remainder.

        y = RemainderI(y, h);
        break;

    case WrapModeTileFlipY:
    case WrapModeTileFlipXY:

        // Vertically mirrored tiling: odd-numbered tiles run backwards.

        rem = RemainderI(y, h);
        y = (((y-rem)/h) & 1) ? (h-1-rem) : rem;
        break;

    default:

        // Caller should correctly anticipate other wrap modes.

        ASSERT(FALSE);
        break;
    }
}
|
|
|
|
|
|
#undef RemainderI
|
|
|
|
|
|
/**************************************************************************
|
|
*
|
|
* Function Description:
|
|
*
|
|
* Outputs the middle pixels in a 2:1 stretched scanline. Note that
|
|
* this function doesn't need to handle wrap modes.
|
|
*
|
|
* Note: this function must not use floating point values, because it could be
|
|
* called with an invalid floating point state (prior to the call to emms)
|
|
*
|
|
* Arguments:
|
|
*
|
|
* dst - The first pixel to be output
|
|
* src - The first pixel in the source that will affect the destination
|
|
* pixel in a bicubic 2:1 stretch
|
|
* dw - The number of pixels in the destination
|
|
* kci - The subpixel shift in the position of the destination pixels
|
|
*
|
|
**************************************************************************/
|
|
|
|
// The MMX 2:1 middle-scanline fast path exists only for the bicubic
// specialization; this bilinear specialization is a stub that must never
// be reached. The parameters mirror the bicubic version's signature.

void DpOutputSpanStretch<HighQualityBilinear>::StretchMiddleScanline2_MMX(
    ARGB *dst,
    ARGB *src,
    INT dw,
    FIX16 kci
    )
{
    // Deliberately unimplemented - callers dispatch to this routine only
    // in the HighQualityBicubic instantiation.

    ASSERT(FALSE);
}
|
|
|
|
void DpOutputSpanStretch<HighQualityBicubic>::StretchMiddleScanline2_MMX(
    ARGB *dst,
    ARGB *src,
    INT dw,
    FIX16 kci
    )
{
#if defined(_X86_)

    //
    // In order to store the kernel multipliers in 16bit registers, we
    // will lose the bottom 3 precision bits (hence each k[i] must be
    // right shifted by three). The summation of the kernel multipliers
    // should come to 16K, hence KERNEL_SHIFT_AMOUNT is 14.
    //

    #define KERNEL_SHIFT_AMOUNT 14

    // Sample the bicubic kernel at 8 positions a quarter of a pixel
    // apart, starting at (kci/4 - 1); kci is the subpixel phase of the
    // destination grid. SymmetricBicubicKernelCenter is presumably a
    // table of kernel values defined elsewhere - TODO confirm indexing.

    FIX16 k[8];
    FIX16 kernelIncrement = FIX16_ONE >> 2;
    FIX16 kCurrent = (kci >> 2) - FIX16_ONE;
    for (INT i = 0; i < 8; i++)
    {
        ASSERT(kCurrent >= -FIX16_ONE);
        ASSERT(kCurrent <= FIX16_ONE);

        k[i] = SymmetricBicubicKernelCenter[kCurrent >> (FIX16_SHIFT-BicubicKernelShift)];
        k[i] >>= 3;

        kCurrent += kernelIncrement;
    }

    //
    // Setup 64bit aligned workspace for the MMX code.
    //
    // Byte offsets into the workspace:
    //
    //  0 - zero
    //  8 - kernel multiplier 0
    // 16 - kernel multiplier 1
    // 24 - kernel multiplier 2
    // 32 - kernel multiplier 3
    // 40 - accumulator 3: g, b
    // 48 - accumulator 3: a, r
    // 56 - FIX14_HALF
    //

    #define BUFFER_SIZE 16

    // NOTE(review): unlike KERNEL_SHIFT_AMOUNT, BUFFER_SIZE is never
    // #undef'd after use - confirm nothing later in the file relies on it.

    // Over-allocate by one INT so the pointer can be rounded to the next
    // 8-byte boundary; buffer is at least 4-byte aligned, so
    // (p + 4) & ~7 always lands inside the allocation.

    INT buffer[BUFFER_SIZE + 1];
    INT *buffer_64bitAligned = (INT *) ((((UINT_PTR) buffer) + 4) & ~0x7);

    buffer_64bitAligned[0] = 0; // zero
    buffer_64bitAligned[1] = 0;

    // Each kernel multiplier is a pair of 16-bit factors replicated into
    // both dwords of a qword so that pmaddwd can weight two pixels at
    // once.

    buffer_64bitAligned[2] = (k[7] << 16) | (k[6] & 0xFFFF); // kernel multiplier 0
    buffer_64bitAligned[3] = buffer_64bitAligned[2];

    buffer_64bitAligned[4] = (k[5] << 16) | (k[4] & 0xFFFF); // kernel multiplier 1
    buffer_64bitAligned[5] = buffer_64bitAligned[4];

    buffer_64bitAligned[6] = (k[3] << 16) | (k[2] & 0xFFFF); // kernel multiplier 2
    buffer_64bitAligned[7] = buffer_64bitAligned[6];

    buffer_64bitAligned[8] = (k[1] << 16) | (k[0] & 0xFFFF); // kernel multiplier 3
    buffer_64bitAligned[9] = buffer_64bitAligned[8];

    buffer_64bitAligned[10] = 0; // Accumulator 3
    buffer_64bitAligned[11] = 0;
    buffer_64bitAligned[12] = 0;
    buffer_64bitAligned[13] = 0;

    buffer_64bitAligned[14] = (1 << (14 - 1)); // FIX14_HALF
    buffer_64bitAligned[15] = (1 << (14 - 1));

    //
    // Register usage:
    //
    // eax - counter for the first loop
    // ebx - 0xffffffff
    // esi - source
    // edi - destination
    // ecx - counter
    // edx - 64bit aligned workspace buffer
    //
    // mm6, mm7: accumulator 0
    // mm4, mm5: accumulator 1
    //

    _asm
    {
        mov ebx, 0xFFFFFFFF
        mov esi, src
        mov edi, dst
        mov ecx, dw
        mov edx, buffer_64bitAligned

        //
        // The first loop loads the initial values into the accumulators, but
        // doesn't write out any pixels. It executes exactly three times.
        //

        pxor mm4, mm4
        pxor mm5, mm5
        mov eax, 3

    loop1:

        //
        // Read expanded pixel values into mm0 and mm1
        //

        movd mm1, [esi]     ; mm1 = 00000000a1r1g1b1
        movd mm2, [esi + 4] ; mm2 = 00000000a2r2g2b2
        add esi, 8

        punpcklbw mm1, [edx] ; mm1 = 00a100r100g100b1
        punpcklbw mm2, [edx] ; mm2 = 00a200r200g200b2

        movq mm0, mm1        ; mm0 = 00a100r100g100b1

        punpckhwd mm1, mm2   ; mm1 = 00a200a100r200r1
        punpcklwd mm0, mm2   ; mm0 = 00g200g100b200b1

        //
        // Add the contribution to accumulator 1
        //

        movq mm6, [edx + 16] ; kernel multiplier 1
        movq mm7, mm6        ; kernel multiplier 1
        pmaddwd mm6, mm0
        pmaddwd mm7, mm1
        paddd mm6, mm4
        paddd mm7, mm5

        //
        // Add the contribution to accumulator 2
        //

        movq mm4, [edx + 24] ; kernel multiplier 2
        movq mm5, mm4        ; kernel multiplier 2
        pmaddwd mm4, mm0
        pmaddwd mm5, mm1
        paddd mm4, [edx + 40]
        paddd mm5, [edx + 48]

        //
        // Compute the new third accumulator
        //

        pmaddwd mm0, [edx + 32] ; multiply by kernel multiplier 3
        pmaddwd mm1, [edx + 32]
        movq [edx + 40], mm0
        movq [edx + 48], mm1

        dec eax
        jnz loop1

        //
        // The second loop continues to compute the accumulators, but
        // also writes out destination pixels.
        //

    loop2:

        //
        // Read expanded pixel values into mm0 and mm1
        //

        movd mm1, [esi]     ; mm1 = 00000000a1r1g1b1
        movd mm2, [esi + 4] ; mm2 = 00000000a2r2g2b2
        add esi, 8

        punpcklbw mm1, [edx] ; mm1 = 00a100r100g100b1
        punpcklbw mm2, [edx] ; mm2 = 00a200r200g200b2

        movq mm0, mm1        ; mm0 = 00a100r100g100b1

        punpckhwd mm1, mm2   ; mm1 = 00a200a100r200r1
        punpcklwd mm0, mm2   ; mm0 = 00g200g100b200b1

        //
        // Add the contribution to accumulator 0
        //

        movq mm2, [edx + 8]  ; mm2 = kernel multiplier 0
        movq mm3, mm2        ; mm3 = kernel multiplier 0
        pmaddwd mm2, mm0     ; mm2 = 0000gggg0000bbbb
        pmaddwd mm3, mm1     ; mm3 = 0000aaaa0000rrrr
        paddd mm6, mm2       ; add contributions to accumulator 0
        paddd mm7, mm3

        //
        // Extract the pixel value from accumulator 0.
        //

        paddd mm6, [edx + 56] ; round
        psrad mm6, KERNEL_SHIFT_AMOUNT
        paddd mm7, [edx + 56]
        psrad mm7, KERNEL_SHIFT_AMOUNT
        packssdw mm6, mm7     ; mm6 = 00aa00rr00gg00bb
        packuswb mm6, mm6     ; mm6 = 00000000aarrggbb

        //
        // Clip all channels to alpha (saturating add/subtract of the
        // complement of the replicated alpha byte).
        //

        movd mm2, ebx        ; mm2 = 00000000ffffffff
        movq mm7, mm6        ; mm7 = 00000000aarrggbb
        psrad mm7, 24        ; mm7 = 00000000000000aa
        punpcklbw mm7, mm7   ; mm7 = 000000000000aaaa
        punpcklbw mm7, mm7   ; mm7 = 00000000aaaaaaaa
        psubusb mm2, mm7
        paddusb mm6, mm2
        psubusb mm6, mm2

        movd [edi], mm6
        add edi, 4

        //
        // Add the contribution to accumulator 1
        //

        movq mm6, [edx + 16] ; kernel multiplier 1
        movq mm7, mm6        ; kernel multiplier 1
        pmaddwd mm6, mm0
        pmaddwd mm7, mm1
        paddd mm6, mm4
        paddd mm7, mm5

        //
        // Add the contribution to accumulator 2
        //

        movq mm4, [edx + 24] ; kernel multiplier 2
        movq mm5, mm4        ; kernel multiplier 2
        pmaddwd mm4, mm0
        pmaddwd mm5, mm1
        paddd mm4, [edx + 40]
        paddd mm5, [edx + 48]

        //
        // Compute the new third accumulator
        //

        pmaddwd mm0, [edx + 32] ; multiply by kernel multiplier 3
        pmaddwd mm1, [edx + 32]
        movq [edx + 40], mm0
        movq [edx + 48], mm1

        dec ecx
        jnz loop2
        emms
    }

    #undef KERNEL_SHIFT_AMOUNT

#endif // defined(_X86_)
}
|
|
|
|
/**************************************************************************\
|
|
*
|
|
* Function Description:
|
|
*
|
|
* DpOutputSpanStretch<FilterMode>::StretchScanline
|
|
* Stretches a single scanline (magnification or minification) using
|
|
* the reconstruction/interpolation mode specified by the template
|
|
* parameter. Currently this is used for bilinear and bicubic filters.
|
|
*
|
|
* Arguments:
|
|
*
|
|
* ARGB *dst, // destination pointer
|
|
* ARGB *src, // source pointer
|
|
* INT dw, // destination width (pixels)
|
|
* INT sw, // source width (pixels)
|
|
* FIX16 kci, // initial position of the kernel center
|
|
* FIX16 scale, // scale factor
|
|
* FIX16 w, // width from center of the kernel to the edge
|
|
* FIX16 a, // 1/w
|
|
*
|
|
* Notes:
|
|
*
|
|
|
|
The following description is based on the bilinear (tent) filter but it is
|
|
equally applicable to the bicubic - though the pictures and description would
|
|
be slightly more complicated.
|
|
|
|
The code below is significantly complicated by the fact that we want the inner
|
|
kernel loop to be quick and therefore not handle the wrap modes. In order to
|
|
make this work, we first compute the number of pixels on the left and right
|
|
of the scanline that need to consider the wrap mode. We process the left first
|
|
and then run the optimized loop for all the inner pixels (which ignores the
|
|
wrap conditions). After that we run the right edge.
|
|
|
|
Bilinear filter convolution kernel:
|
|
Note that each kernel has an intrinsic width - bilinear = 1 and bicubic = 2.
|
|
This width is scaled by the inverse of the stretch factor - i.e. a shrink
|
|
that results in 1/3 of the size being output requires a width (w) of 3 for the
|
|
bilinear and 6 for the bicubic. Also the height of the filter kernel is scaled
|
|
by the scale factor - i.e. the height of 1 (for all kernels) becomes 1/3 in
|
|
the above example.
|
|
|
|
|
|
--- | --- ^
|
|
--- . | . --- |
|
|
--- . | . .--- h
|
|
--- . . | . . --- |
|
|
--- . . . | . . . --- |
|
|
---. . . . | . . . .--- v
|
|
-----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------
|
|
|
|
|
kb kc ke
|
|
<------------w----------->
|
|
|
|
The filter kernel is shifted so that kc is exactly at the position of the
|
|
required destination pixel transformed into the source pixel array by the
|
|
scale factor. This will in general fall somewhere between two pixel samples -
|
|
in the above picture, between pixels 4 and 5.
|
|
|
|
The goal is to get a color value for the position at kc and emit that into
|
|
the destination pixel stream. The standard evaluation method is to compute
|
|
the height of the filter kernel at each of the pixel samples under the filter
|
|
convolution corresponding to pixels 0, 1, ... 9. These heights are used to
|
|
weight each pixel sample and the result is summed giving the destination pixel
|
|
at kc.
|
|
|
|
The problem with the standard evaluation is that at non-integer shrinks
|
|
the mathematical evaluation of the kernel produces ripples in the output - i.e.
|
|
a solid field of pixels responds with a sine-wave-like ripple output. This is
|
|
a theoretical problem with the discrete evaluation of the kernel integral.
|
|
|
|
Our evaluation actually stores a table of partial integrals from -inf to x. We
|
|
use this table to compute the area around each pixel and the area is used as
|
|
the weight. This evaluation is guaranteed to respond with exactly one for any
|
|
position and scale factor of the kernel. This property gives a stable field
|
|
response allowing us to have non-ripple shrinks.
|
|
|
|
---.: ---
|
|
---.....: ---
|
|
--- :.....: ---
|
|
--- :.....: ---
|
|
--- :.....: ---
|
|
--- :.....: ---
|
|
-----0-----1-----2-----3-----4-----5-----6-----7-----8-----9------------
|
|
|
|
To evaluate this properly, we lookup the integral from -inf to 4.5 ( actually
|
|
we rescale so that the center of the kernel is at 0 ) and then subtract the
|
|
table lookup for the integral from -inf to 3.5. This gives us an exact
|
|
(within the error of the table) computation for the area from 3.5 to 4.5.
|
|
This is what we use for the weight of pixel 4. Note that contrary to the
|
|
standard evaluation pixel 9 does contribute even though 9 is outside of the
|
|
kernel. 8.5 is inside the kernel so the area under the kernel from 8.5 to 9.5
|
|
is a small triangular area and is not equal to zero. Not accounting for this is
|
|
the major source of error in the standard evaluation.
|
|
|
|
Note that the lookup for the end point integral for pixel 4 of -inf to 4.5 can
|
|
be reused as the start point for the next pixel (5). An important property of
|
|
this is that any error (e) in the lookup for -inf to 4.5 is added in pixel
|
|
4's contribution and subtracted in pixel 5's contribution which results in
|
|
the total error for the filter response -- due to table discretization -- being
|
|
completely subtracted away --- the end points have an error of exactly zero
|
|
because we sample from beyond the left (area of exactly 0) to beyond the right
|
|
(area of exactly 1). This is not precisely true because the error is scaled
|
|
by the pixel values, but it does help.
|
|
|
|
Note that this integral method is equivalent to convolving the input pixels
|
|
(comb) with the box filter of width 1 pixel and then convolving the result
|
|
with the filter kernel. [analysis due to Jim Blinn - see documentation in
|
|
the Specs directory.]
|
|
|
|
Further documentation is available in the specs directory:
|
|
gdiplus\specs\filter\convolution.doc
|
|
|
|
|
|
* Note: this function must not use floating point values, because it could be
|
|
* called with an invalid floating point state (prior to the call to emms)
|
|
*
|
|
* History:
|
|
*
|
|
* 04/16/2000 asecchia created it
|
|
*
|
|
\**************************************************************************/
|
|
|
|
|
|
template<FilterModeType FilterMode>
|
|
void DpOutputSpanStretch<FilterMode>::StretchScanline(
|
|
ARGB *dst, // destination pointer
|
|
ARGB *src, // source pointer
|
|
INT dw, // destination width (pixels)
|
|
INT sw, // source width (pixels)
|
|
FIX16 kci, // initial position of the kernel center
|
|
FIX16 scale, // scale factor
|
|
FIX16 w, // width from center of the kernel to the edge
|
|
FIX16 a // 1/w
|
|
)
|
|
{
|
|
// Note: this is a template class so the value of FilterMode
|
|
// is defined at compile time. We're relying on the compiler
|
|
// to perform dead code removal for each template instantiation
|
|
// eliminating both the constant comparison and all the
|
|
// code branches corresponding to other FilterMode values.
|
|
// That way our inner loop is not impacted by extra code for
|
|
// filter modes we're not using and extraneous conditional
|
|
// statements.
|
|
|
|
// Center of the filter kernel.
|
|
// Shift over to the left by half because we want to center the area of
|
|
// contribution for each sample on the sample - rather than taking the
|
|
// area between two point samples as the contribution for the sample on
|
|
// the right.
|
|
|
|
FIX16 kc = kci - FIX16_HALF;
|
|
|
|
// Left and right extent of the kernel, intra-kernel position,
|
|
// and pixel contribution.
|
|
|
|
INT kb, ke;
|
|
INT kbt, ket;
|
|
FIX16 kp, pc, pa, pa_old;
|
|
|
|
// Loop variables
|
|
|
|
INT x, k;
|
|
|
|
// Incremental loop state, intermediate computation.
|
|
|
|
ARGB *d = dst;
|
|
FIX16 krn = 0;
|
|
|
|
// Color channel accumulators.
|
|
|
|
FIX16 ta, tr, tg, tb;
|
|
|
|
// Compute the first pixel along the destination scanline that doesn't
|
|
// have any wrap contribution and then the last pixel (l & r).
|
|
// Note that all the terms have a FIX16_ONE factor which cancel out.
|
|
|
|
// !!! Perf: [asecchia] This stuff is computed every scanline -
|
|
// and it's always the same. We could pass these coordinates to
|
|
// this routine and have them precomputed.
|
|
|
|
INT lWrapX;
|
|
INT rWrapX;
|
|
|
|
if(scale>=0)
|
|
{
|
|
// x==sw is considered outside of the source.
|
|
|
|
FIX16 fix_sw = (sw-1) << FIX16_SHIFT;
|
|
|
|
// add (scale-1) and use idiv to get a Ceiling()
|
|
|
|
lWrapX = (w-kc+(scale-1))/scale;
|
|
|
|
// idiv should give us Floor().
|
|
|
|
rWrapX = (fix_sw-w-kc)/scale;
|
|
}
|
|
else
|
|
{
|
|
// x==sw is considered outside of the source.
|
|
|
|
FIX16 fix_sw = (sw-1) << FIX16_SHIFT;
|
|
|
|
// note: in the -x scale transform, the sense of lWrapX and rWrapX
|
|
// can be confusing. The l&r apply to the destination left and right
|
|
// and are swapped here when we compute the initial position from
|
|
// the inverted left and right source points.
|
|
// As we traverse the destination from left to right we'll encounter
|
|
// lWrapX first and then rWrapX, but the kc (kernel center) will be
|
|
// moving through the source from right to left decrementing by
|
|
// scale each time.
|
|
|
|
// use idiv to get a Floor()
|
|
|
|
rWrapX = (w-kc)/scale;
|
|
|
|
// add scale+1 and use idiv for Ceiling().
|
|
|
|
lWrapX = (fix_sw-w-kc+(scale+1))/scale;
|
|
}
|
|
|
|
// Now clamp to the range of the destination we're going to draw.
|
|
|
|
lWrapX = max(0, lWrapX);
|
|
rWrapX = min(dw, rWrapX);
|
|
|
|
BYTE *kptr;
|
|
INT k_wrap;
|
|
|
|
// Do the left wrapmode pixels.
|
|
/*
|
|
--- | --- ^
|
|
--- | --- |
|
|
--- | --- h
|
|
--- | --- |
|
|
--- | --- |
|
|
--- | --- v
|
|
-----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------
|
|
kb kc <-----------w---------->ke
|
|
kbt ket
|
|
<----wrap----><---------texture------><----wrap----->
|
|
+ve transform -ve transform
|
|
or straddle case or straddle case
|
|
|
|
The following loop handles the case where the wrap happens on the left of the
|
|
kernel. There are three subloops - first to handle the pixels in the wrap
|
|
segment on the left, then to handle the pixels in the texture. Normally the
|
|
texture pixels will extend to the right edge of the kernel and we'll be done,
|
|
but two cases make the right wrap essential at this point. First if the
|
|
transform is negative, the sense is flipped and the texture extends from the
|
|
left edge to the middle point and the wrap extends the rest of the kernel to
|
|
the right edge. Also if the texture is sufficiently small and the shrink factor
|
|
sufficiently large, the filter kernel could overlap both the left and right edge
|
|
of the texture and require wrapping on both sides.
|
|
*/
|
|
|
|
for(x=0; x<min(lWrapX, dw); x++)
|
|
{
|
|
ASSERT(x<dw);
|
|
// Compute the start and end of the filter kernel coverage
|
|
|
|
kb = GpFix16Ceiling(kc-w);
|
|
ke = GpFix16Ceiling(kc+w);
|
|
|
|
// Bound the pixels in the texture.
|
|
|
|
// kbt == kernel begin texture coordinate.
|
|
// ket == kernel end texture coordinate.
|
|
|
|
kbt = max(0,kb);
|
|
ket = min(ke, sw-1);
|
|
|
|
// Initialize the component accumulators. We accumulate the
|
|
// contribution of each color component scaled by the kernel
|
|
// response into these variables.
|
|
|
|
ta = tr = tg = tb = 0;
|
|
|
|
// These pixels are off the left of the texture.
|
|
pa_old = 0;
|
|
|
|
// Iterate over each pixel under the filter kernel.
|
|
// if ke==kb then there is one point.
|
|
krn = Int32x32Mod16(a, (kb << FIX16_SHIFT) - kc);
|
|
|
|
if(QWrapMode == WrapModeClamp)
|
|
{
|
|
// Clamp modes.
|
|
|
|
for(k=kb; k<min(kbt, ke+1); k++)
|
|
{
|
|
// these pixels are always off the left of the texture.
|
|
|
|
ASSERT(k<0);
|
|
|
|
ComputeKernel(pc, a, pa, pa_old, krn);
|
|
|
|
ta += pc * ClampColorA;
|
|
tr += pc * ClampColorR;
|
|
tg += pc * ClampColorG;
|
|
tb += pc * ClampColorB;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// Do the full wrap computation.
|
|
|
|
for(k=kb; k<min(kbt, ke+1); k++)
|
|
{
|
|
// these pixels are always off the left of the texture.
|
|
|
|
k_wrap = k;
|
|
|
|
ASSERT(k<0);
|
|
|
|
// !!! Perf: [asecchia] This is really slow.
|
|
// If we ever decide to make wrap modes propagate
|
|
// through the outcrop region and decide that wrap
|
|
// tile and flip x,y are important perf scenarios,
|
|
// we should come back and replace this divide with
|
|
// repeated subtraction - most times it can be avoided.
|
|
// However, right now this is only used for a few
|
|
// pixels on the edges and we don't really mind the
|
|
// perf hit for these modes.
|
|
|
|
Apply1DWrapModeX(QWrapMode, k_wrap, sw);
|
|
ComputeKernel(pc, a, pa, pa_old, krn);
|
|
kptr = (BYTE*)(src + k_wrap);
|
|
AccumulateChannels(pc, kptr);
|
|
}
|
|
|
|
}
|
|
|
|
// Initialize the color channel accessor pointer to the beginning
|
|
// of the source pixel array for this kernel.
|
|
|
|
kptr = (BYTE*)(src + kbt);
|
|
|
|
// HighQualityBicubic needs to initialize the krn value.
|
|
// It is used to do the kernel table lookup.
|
|
        // HighQualityBilinear doesn't use this as it works out its
|
|
// kernel by direct computation.
|
|
|
|
krn = Int32x32Mod16(a, (kbt << FIX16_SHIFT) - kc);
|
|
|
|
// These pixels hit the texture.
|
|
|
|
for(k=kbt; k<=ket; k++)
|
|
{
|
|
ComputeKernel(pc, a, pa, pa_old, krn);
|
|
|
|
// Accumulate the contribution of this source pixel to the pixel
|
|
// we're working on.
|
|
AccumulateChannels(pc, kptr);
|
|
kptr += 4;
|
|
}
|
|
|
|
// These pixels are off the right of the texture.
|
|
// This can happen if the kernel spans the entire source texture.
|
|
|
|
// Iterate over each pixel under the filter kernel.
|
|
// if ke==kb then there is one point.
|
|
|
|
krn = Int32x32Mod16(a, (max(ket+1, kb) << FIX16_SHIFT) - kc);
|
|
|
|
if(QWrapMode == WrapModeClamp)
|
|
{
|
|
// Clamp modes.
|
|
|
|
for(k=max(ket+1, kb); k<=ke; k++)
|
|
{
|
|
ComputeKernel(pc, a, pa, pa_old, krn);
|
|
|
|
ta += pc * ClampColorA;
|
|
tr += pc * ClampColorR;
|
|
tg += pc * ClampColorG;
|
|
tb += pc * ClampColorB;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// Do the full wrap computation.
|
|
|
|
for(k=max(ket+1, kb); k<=ke; k++)
|
|
{
|
|
k_wrap = k;
|
|
Apply1DWrapModeX(QWrapMode, k_wrap, sw);
|
|
ComputeKernel(pc, a, pa, pa_old, krn);
|
|
kptr = (BYTE*)(src + k_wrap);
|
|
AccumulateChannels(pc, kptr);
|
|
}
|
|
|
|
}
|
|
// Done with this pixel - store it in the destination buffer.
|
|
|
|
// clamp the results to byte range.
|
|
|
|
ClampColors();
|
|
|
|
// Combine the channels, set the destination pixel and increment
|
|
// to the next pixel
|
|
|
|
*d++ = (ta << 24) | (tr << 16) | (tg << 8) | tb;
|
|
kc += scale;
|
|
}
|
|
|
|
// For all points, x, in the destination compute the position of the
|
|
// kernel center in the source and sum the contribution under the filter.
|
|
|
|
const INT minCenterWidthMMX = 16;
|
|
INT dstCenterWidth = rWrapX - lWrapX;
|
|
INT srcFirst = GpFix16Ceiling(kc - w);
|
|
INT srcLast = GpFix16Floor(kc+w + (dstCenterWidth - 1) * scale);
|
|
|
|
// srcLast_2Stretch is the last pixel touched by the MMX routine.
|
|
// The number of pixels touched by the routine is equal to six
|
|
// (setup pixels) plus two times the width of the center strip
|
|
    // in the destination. We subtract one in order to get the actual
|
|
// last pixel touched by StretchMiddleScanline2_MMX (so that we can
|
|
// compare it with srcLast).
|
|
|
|
INT srcLast_2Stretch = srcFirst + (dstCenterWidth + 3) * 2 - 1;
|
|
|
|
#if defined(_X86_)
|
|
if ((OSInfo::HasMMX) &&
|
|
(FilterMode == HighQualityBicubic))
|
|
{
|
|
// MMX and high quality bicubic
|
|
|
|
if ((dstCenterWidth >= minCenterWidthMMX) &&
|
|
((srcLast_2Stretch == srcLast) || (srcLast_2Stretch == (srcLast - 1))))
|
|
{
|
|
ASSERT(srcFirst >= 0);
|
|
ASSERT(srcLast_2Stretch < sw);
|
|
|
|
// Stretch the middle pixels by a factor of two using optimized MMX
|
|
// code.
|
|
|
|
FIX16 kc_center = kc + FIX16_HALF;
|
|
StretchMiddleScanline2_MMX(d,
|
|
src + srcFirst,
|
|
dstCenterWidth,
|
|
kc_center - (GpFix16Floor(kc_center) * FIX16_ONE));
|
|
d += dstCenterWidth;
|
|
kc += scale * dstCenterWidth;
|
|
x += dstCenterWidth;
|
|
}
|
|
else
|
|
{
|
|
// This is the MMX version of the general purpose bicubic scaling
|
|
// code.
|
|
|
|
for(x=lWrapX; x<rWrapX; x++)
|
|
{
|
|
// Cannot go over dw because rWrap is < dw
|
|
|
|
ASSERT(x<dw);
|
|
|
|
// Compute the start and end of the filter kernel coverage
|
|
|
|
kb = GpFix16Ceiling(kc-w);
|
|
ke = GpFix16Ceiling(kc+w);
|
|
|
|
// Default loop assumes most pixels don't have to worry about
|
|
// wrap mode along the ends of the scanline.
|
|
|
|
ASSERT(kb>=0);
|
|
ASSERT(ke<sw);
|
|
|
|
// Initialize the color channel accessor pointer to the beginning
|
|
// of the source pixel array for this kernel.
|
|
|
|
kptr = (BYTE*)(src + kb);
|
|
|
|
// HighQualityBicubic needs to initialize the krn value.
|
|
// It is used to do the kernel table lookup.
|
|
            // HighQualityBilinear doesn't use this as it works out its
|
|
// kernel by direct computation.
|
|
|
|
krn = Int32x32Mod16(a, (kb << FIX16_SHIFT) - kc);
|
|
|
|
// Iterate over each pixel under the filter kernel.
|
|
// if ke==kb then there is one point.
|
|
|
|
INT bcl_count = ke - kb + 1;
|
|
INT bcl_half_count = bcl_count >> 1;
|
|
bcl_count &= 0x1;
|
|
|
|
_asm
|
|
{
|
|
// eax - krn
|
|
// ebx - kptr
|
|
// esi - LPIC
|
|
// edi - a
|
|
//
|
|
// mm5 - pold
|
|
// mm6 - green ; blue
|
|
// mm7 - alpha ; red
|
|
|
|
mov eax, krn
|
|
mov ebx, kptr
|
|
mov esi, CPIC
|
|
mov edi, a
|
|
pxor mm5, mm5
|
|
movq mm6, FIX14_HALF_MMX
|
|
movq mm7, mm6
|
|
pxor mm0, mm0
|
|
|
|
dec bcl_half_count
|
|
jl bicubic_center_loop_last_pixel
|
|
|
|
bicubic_center_loop:
|
|
|
|
// Read the next two pixels into mm2 and mm1
|
|
|
|
movd mm2, [ebx] // mm2 = pixel1
|
|
movd mm1, [ebx + 4] // mm1 = pixel2
|
|
add ebx, 8
|
|
|
|
// Compute the kernel values for these two pixels
|
|
|
|
mov edx, eax
|
|
sar edx, 8
|
|
punpcklbw mm2, mm0
|
|
movd mm3, [esi + 4 * edx] // mm3 = p1
|
|
|
|
lea edx, [eax + edi]
|
|
sar edx, 8
|
|
punpcklbw mm1, mm0
|
|
movd mm4, [esi + 4 * edx] // mm4 = p2
|
|
|
|
punpckldq mm5, mm3 // mm5 = p1 | pold
|
|
lea eax, [eax + 2 * edi]
|
|
punpckldq mm3, mm4 // mm3 = p2 | p1
|
|
|
|
psrad mm5, 2
|
|
psrad mm3, 2
|
|
|
|
psubd mm3, mm5 // mm3 = kernel2 | kernel1
|
|
movq mm5, mm4 // mm5 = pold
|
|
packssdw mm3, mm3 // mm3 = kernel2 | kernel1 | kernel2 | kernel1
|
|
|
|
// At this point:
|
|
// mm3 = kernel2 | kernel1 | kernel2 | kernel1
|
|
// mm2, mm1 contain pixel1 and pixel2 respectively
|
|
|
|
movq mm4, mm2
|
|
punpcklwd mm2, mm1
|
|
pmaddwd mm2, mm3
|
|
punpckhwd mm4, mm1
|
|
paddd mm6, mm2
|
|
dec bcl_half_count
|
|
pmaddwd mm4, mm3
|
|
paddd mm7, mm4
|
|
|
|
jge bicubic_center_loop
|
|
|
|
bicubic_center_loop_last_pixel:
|
|
|
|
dec bcl_count
|
|
jl bicubic_center_loop_done
|
|
|
|
// Read the last pixel into mm2
|
|
|
|
movd mm2, [ebx]
|
|
punpcklbw mm2, mm0 // mm2 = a | r | g | b
|
|
movq mm3, mm2
|
|
punpcklwd mm2, mm0 // mm2 = 0 | g | 0 | b
|
|
punpckhwd mm3, mm0 // mm3 = 0 | a | 0 | r
|
|
|
|
// Compute the kernel value for this pixel
|
|
|
|
sar eax, 8
|
|
psrad mm5, 2
|
|
movd mm4, [esi + 4 * eax] // mm4 = p
|
|
psrad mm4, 2
|
|
psubd mm4, mm5
|
|
packssdw mm4, mm4
|
|
|
|
pmaddwd mm2, mm4
|
|
pmaddwd mm3, mm4
|
|
|
|
paddd mm6, mm2
|
|
paddd mm7, mm3
|
|
|
|
bicubic_center_loop_done:
|
|
|
|
// At this point, mm6 and mm7 contain the output channels
|
|
// for the pixel. We need to clamp the alpha and store it
|
|
// in the destination buffer.
|
|
|
|
psrad mm6, 14
|
|
psrad mm7, 14
|
|
packssdw mm6, mm7 // mm6 = a | r | g | b
|
|
packuswb mm6, mm6 // mm6 = 00000000aarrggbb
|
|
|
|
movq mm7, mm6 // mm7 = 00000000aarrggbb
|
|
psrad mm6, 24 // mm6 = xxxxxxxxxxxxxxaa
|
|
mov eax, 0xFFFFFFFF
|
|
punpcklbw mm6, mm6 // mm6 = xxxxxxxxxxxxaaaa
|
|
movd mm2, eax
|
|
punpcklbw mm6, mm6 // mm6 = xxxxxxxxaaaaaaaa
|
|
|
|
psubusb mm2, mm6
|
|
mov eax, d
|
|
paddusb mm7, mm2
|
|
psubusb mm7, mm2
|
|
|
|
movd [eax], mm7
|
|
add eax, 4
|
|
mov d, eax
|
|
}
|
|
|
|
kc += scale;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
#endif // defined(_X86_)
|
|
/*
|
|
--- | --- ^
|
|
--- | --- |
|
|
--- | --- h
|
|
--- | --- |
|
|
--- | --- |
|
|
--- | --- v
|
|
-----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------
|
|
kb kc <-----------w---------->ke
|
|
|
|
<-----------------------texture--------------------->
|
|
|
|
The following loop is guaranteed to only hit texture for every pixel under
|
|
the kernel. This is the majority of the pixels in most normal stretch
|
|
cases. We can simplify this loop because of this assumption and therefore
|
|
get a performance win.
|
|
Many of the degenerate wrap cases will simply skip this loop.
|
|
*/
|
|
{
|
|
// no MMX
|
|
|
|
for(x=lWrapX; x<rWrapX; x++)
|
|
{
|
|
// Cannot go over dw because rWrap is < dw
|
|
|
|
ASSERT(x<dw);
|
|
|
|
// Compute the start and end of the filter kernel coverage
|
|
|
|
kb = GpFix16Ceiling(kc-w);
|
|
ke = GpFix16Ceiling(kc+w);
|
|
|
|
// Default loop assumes most pixels don't have to worry about
|
|
// wrap mode along the ends of the scanline.
|
|
|
|
ASSERT(kb>=0);
|
|
ASSERT(ke<sw);
|
|
|
|
// Initialize the component accumulators. We accumulate the
|
|
// contribution of each color component scaled by the kernel
|
|
// response into these variables.
|
|
|
|
ta = tr = tg = tb = 0;
|
|
|
|
// Initialize the color channel accessor pointer to the beginning
|
|
// of the source pixel array for this kernel.
|
|
|
|
kptr = (BYTE*)(src + kb);
|
|
|
|
// HighQualityBicubic needs to initialize the krn value.
|
|
// It is used to do the kernel table lookup.
|
|
// HighQualityBilinear doesn't use this as it works out its
|
|
// kernel by direct computation.
|
|
|
|
krn = Int32x32Mod16(a, (kb << FIX16_SHIFT) - kc);
|
|
pa_old = 0;
|
|
|
|
// Iterate over each pixel under the filter kernel.
|
|
// if ke==kb then there is one point.
|
|
|
|
for(k=kb; k<=ke; k++)
|
|
{
|
|
ComputeKernel(pc, a, pa, pa_old, krn);
|
|
|
|
// Accumulate the contribution of this source pixel to the pixel
|
|
// we're working on.
|
|
|
|
AccumulateChannels(pc, kptr);
|
|
|
|
kptr += 4;
|
|
}
|
|
|
|
// Done with this pixel - store it in the destination buffer.
|
|
|
|
// clamp the results to byte range.
|
|
|
|
ClampColors();
|
|
|
|
ASSERT(tr<=ta);
|
|
ASSERT(tg<=ta);
|
|
ASSERT(tb<=ta);
|
|
ASSERT(ta>=0);
|
|
ASSERT(tr>=0);
|
|
ASSERT(tg>=0);
|
|
ASSERT(tb>=0);
|
|
|
|
// Combine the channels, set the destination pixel and increment
|
|
// to the next pixel
|
|
|
|
*d++ = (ta << 24) | (tr << 16) | (tg << 8) | tb;
|
|
|
|
kc += scale;
|
|
}
|
|
}
|
|
|
|
// Need to use max() here to handle the case where lWrapX > rWrapX
|
|
// which can happen if the filter spans both edges of the scanline.
|
|
|
|
// Do the right wrapmode pixels.
|
|
|
|
/*
|
|
--- | --- ^
|
|
--- | --- |
|
|
--- | --- h
|
|
--- | --- |
|
|
--- | --- |
|
|
--- | --- v
|
|
-----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------
|
|
kb kc <-----------w---------->ke
|
|
kbt ket
|
|
<----wrap----><---------texture------><----wrap----->
|
|
-ve transform            +ve transform
|
|
case only case only
|
|
|
|
The following loop handles the case where the wrap happens on the right of the
|
|
kernel. There are three subloops - first to handle the pixels in the wrap
|
|
segment on the left - if any, then to handle the pixels in the texture. After
|
|
that handle the pixels in the right wrap. Normally the texture pixels will
|
|
extend to the left edge of the kernel and the first subloop will simply be
|
|
skipped, but the left wrap is essential if the transform is negative --- the
|
|
sense is flipped and the texture extends from the right edge to the middle
|
|
point and the wrap extends the rest of the kernel to the left edge.
|
|
Note it's not possible at this point to have wrapping at both edges of the
kernel; the wrap is on the left iff the transform is negative. The wrap is on
the right iff the transform is positive. The case where both wrapmodes are
present has already been taken care of in the first loop.
|
|
*/
|
|
|
|
for(x=max(x, rWrapX); x<dw; x++)
|
|
{
|
|
// Compute the start and end of the filter kernel coverage
|
|
|
|
kb = GpFix16Ceiling(kc-w);
|
|
ke = GpFix16Ceiling(kc+w);
|
|
|
|
// Bound the pixels in the texture.
|
|
|
|
// ket == kernel end texture coordinate (inclusive).
|
|
// kbt == kernel begin texture coordinate.
|
|
|
|
kbt = max(0,kb);
|
|
ket = min(ke, sw-1);
|
|
|
|
// Initialize the component accumulators. We accumulate the
|
|
// contribution of each color component scaled by the kernel
|
|
// response into these variables.
|
|
|
|
ta = tr = tg = tb = 0;
|
|
|
|
// Initialize the color channel accessor pointer to the beginning
|
|
// of the source pixel array for this kernel.
|
|
|
|
kptr = (BYTE*)(src + kb);
|
|
|
|
// HighQualityBicubic needs to initialize the krn value.
|
|
// It is used to do the kernel table lookup.
|
|
// HighQualityBilinear doesn't use this as it works out its
|
|
// kernel by direct computation.
|
|
|
|
pa_old = 0;
|
|
|
|
if(kb<kbt)
|
|
{
|
|
krn = Int32x32Mod16(a, (kb << FIX16_SHIFT) - kc);
|
|
}
|
|
|
|
// Iterate over each pixel under the filter kernel.
|
|
// if ke==kb then there is one point.
|
|
|
|
// These pixels are off the left of the texture.
|
|
// This is possible for negative transform cases.
|
|
|
|
if(QWrapMode == WrapModeClamp)
|
|
{
|
|
// Clamp modes.
|
|
|
|
for(k=kb; k<min(kbt, ke+1); k++)
|
|
{
|
|
// these pixels are always off the left of the texture.
|
|
|
|
ASSERT(k<0);
|
|
|
|
ComputeKernel(pc, a, pa, pa_old, krn);
|
|
|
|
ta += pc * ClampColorA;
|
|
tr += pc * ClampColorR;
|
|
tg += pc * ClampColorG;
|
|
tb += pc * ClampColorB;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// Do the full wrap computation.
|
|
|
|
for(k=kb; k<min(kbt, ke+1); k++)
|
|
{
|
|
// these pixels are always off the left of the texture.
|
|
|
|
k_wrap = k;
|
|
|
|
ASSERT(k<0);
|
|
|
|
// !!! Perf: [asecchia] This is really slow.
|
|
// If we ever decide to make wrap modes propagate
|
|
// through the outcrop region and decide that wrap
|
|
// tile and flip x,y are important perf scenarios,
|
|
// we should come back and replace this divide with
|
|
// repeated subtraction - most times it can be avoided.
|
|
// However, right now this is only used for a few
|
|
// pixels on the edges and we don't really mind the
|
|
// perf hit for these modes.
|
|
|
|
Apply1DWrapModeX(QWrapMode, k_wrap, sw);
|
|
ComputeKernel(pc, a, pa, pa_old, krn);
|
|
kptr = (BYTE*)(src + k_wrap);
|
|
AccumulateChannels(pc, kptr);
|
|
}
|
|
|
|
}
|
|
|
|
// Initialize the color channel accessor pointer to the beginning
|
|
// of the source pixel array for this kernel.
|
|
|
|
kptr = (BYTE*)(src + kbt);
|
|
|
|
// HighQualityBicubic needs to initialize the krn value.
|
|
// It is used to do the kernel table lookup.
|
|
// HighQualityBilinear doesn't use this as it works out its
|
|
// kernel by direct computation.
|
|
|
|
krn = Int32x32Mod16(a, (kbt << FIX16_SHIFT) - kc);
|
|
|
|
// These pixels hit the texture.
|
|
|
|
for(k=kbt; k<=ket; k++)
|
|
{
|
|
ComputeKernel(pc, a, pa, pa_old, krn);
|
|
|
|
// Accumulate the contribution of this source pixel to the pixel
|
|
// we're working on.
|
|
|
|
AccumulateChannels(pc, kptr);
|
|
kptr += 4;
|
|
}
|
|
|
|
// These pixels are off the right of the texture.
|
|
|
|
// Iterate over each pixel under the filter kernel.
|
|
// if ke==kb then there is one point.
|
|
krn = Int32x32Mod16(a, ((max(ket+1, kb)) << FIX16_SHIFT) - kc);
|
|
|
|
if(QWrapMode == WrapModeClamp)
|
|
{
|
|
// Clamp modes.
|
|
|
|
for(k=max(ket+1, kb); k<=ke; k++)
|
|
{
|
|
ComputeKernel(pc, a, pa, pa_old, krn);
|
|
|
|
ta += pc * ClampColorA;
|
|
tr += pc * ClampColorR;
|
|
tg += pc * ClampColorG;
|
|
tb += pc * ClampColorB;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// Do the full wrap computation.
|
|
|
|
for(k=max(ket+1, kb); k<=ke; k++)
|
|
{
|
|
// Apply the general pixel wrap
|
|
|
|
k_wrap = k;
|
|
Apply1DWrapModeX(QWrapMode, k_wrap, sw);
|
|
ComputeKernel(pc, a, pa, pa_old, krn);
|
|
kptr = (BYTE*)(src + k_wrap);
|
|
AccumulateChannels(pc, kptr);
|
|
}
|
|
}
|
|
|
|
// Done with this pixel - store it in the destination buffer.
|
|
|
|
// clamp the results to byte range.
|
|
|
|
ClampColors();
|
|
|
|
// Combine the channels, set the destination pixel and increment
|
|
// to the next pixel
|
|
|
|
*d++ = (ta << 24) | (tr << 16) | (tg << 8) | tb;
|
|
kc += scale;
|
|
}
|
|
#ifdef _X86_
|
|
if ((OSInfo::HasMMX) &&
|
|
(FilterMode == HighQualityBicubic))
|
|
{
|
|
_asm
|
|
{
|
|
emms
|
|
}
|
|
}
|
|
#endif // _X86_
|
|
}
|
|
|
|
|
|
|
|
/**************************************************************************
|
|
*
|
|
* Function Description:
|
|
*
|
|
* Outputs one scanline on the destination device
|
|
*
|
|
* Note: this function must not use floating point values because of
|
|
* potential conflicts with the MMX register values.
|
|
*
|
|
* Return Value:
|
|
*
|
|
* GpStatus. Always returns Ok.
|
|
* !!! [asecchia] are we going to remove this return value - these
|
|
* always return success.
|
|
*
|
|
* Created:
|
|
*
|
|
* 04/17/2000 asecchia
|
|
* Created it.
|
|
*
|
|
**************************************************************************/
|
|
|
|
template<FilterModeType FilterMode>
GpStatus DpOutputSpanStretch<FilterMode>::OutputSpan(
    INT y,
    INT xMin,
    INT xMax       // xMax is exclusive
    )
{
    ASSERT(isValid);

    // This function assumes that it's called with a correctly ordered span.

    ASSERT((xMax-xMin)>=0);

    INT width = xMax-xMin;

    // We can't have someone draw outside our specified destination.
    // If this assert fires, we don't have enough buffer space to store the
    // destination xscale so we'd overrun the buffer. The caller set us up
    // with an incorrect destination rectangle or got their rounding wrong.

    ASSERT(width <= xbuffer_width);

    INT left = xMin;
    INT right = xMax;

    // If there's nothing to do, simply return.

    if(right < left)
    {
        return Ok;
    }

    ASSERT(right >= left);

    // Make sure the caller clipped correctly - we can't handle
    // being called to draw outside our destination rectangle.

    ASSERT(y >= iytop);

    // Compute the kernel center for this y coordinate relative to the first
    // y coordinate (y coordinate corresponding to DstRect.Y) and offset
    // by the source rectangle.

    FIX16 kc;

    if(yscale < 0)
    {
        kc = ykci - (y - iytop) * (-yscale);
    }
    else
    {
        kc = ykci + (y - iytop) * yscale;
    }

    // Center of the filter kernel.
    // Shift over to the left by half because we want to center the area of
    // contribution for each sample on the sample - rather than taking the
    // area between two point samples as the contribution for the sample on
    // the right.

    kc -= FIX16_HALF;

    // Compute the start and end of the filter kernel coverage
    // (kb..ke inclusive, in source scanline coordinates).

    FIX16 kb = GpFix16Ceiling(kc-yw);
    FIX16 ke = GpFix16Ceiling(kc+yw);

    // Get the source pointer.

    ARGB *srcPtr0 = static_cast<ARGB*> (BmpData.Scan0);
    INT stride = BmpData.Stride/sizeof(ARGB);

    ARGB *src;
    ARGB *dst;

    FIX16 pc, kp, pa, pa_old;
    FIX16 ta, tr, tg, tb;

    ARGB pix;

    // NOTE(review): kp and pix appear to be unused in this routine
    // (kp is only mentioned in a comment below) - candidates for removal.

    INT k, x, kmod;

    FIX16 krn = 0;

    // if there was a last_k before this iteration

    // compute the new xbuffer_start_scanline

    if(last_k != LAST_K_UNUSED)
    {
        // If there is no overlap in the rotational buffer from the
        // last time, initialize the rotational buffer to the start.

        if(yscale < 0)
        {
            // Negative y scale.

            if(ke-last_k < 0)
            {
                xbuffer_start_scanline = 0;
            }
            else
            {
                xbuffer_start_scanline -= last_k-kb;
                if(xbuffer_start_scanline < 0)
                {
                    xbuffer_start_scanline += xbuffer_height;
                }

            }
        }
        else
        {
            // Positive y scale.

            if(last_k-kb < 0)
            {
                xbuffer_start_scanline = 0;
            }
            else
            {
                // Figure out where to start in the xbuffer so that we
                // can reuse the already scaled scanlines.

                xbuffer_start_scanline -= (last_k-kb)+1;
                if(xbuffer_start_scanline < 0)
                {
                    xbuffer_start_scanline += xbuffer_height;
                }
            }
        }
    }
    else
    {
        // this should be the first time we're hitting this
        // routine. xbuffer_start_scanline should be properly
        // initialized.

        ASSERT(xbuffer_start_scanline == 0);
    }

    // make sure we're going to access valid memory in the xbuffer.

    ASSERT(xbuffer_start_scanline >= 0);
    ASSERT(xbuffer_start_scanline < xbuffer_height);

    // !!! [asecchia] if we thought about it some, we could probably
    // import the code in StretchScanline into this loop
    // and merge this and the next loop significantly reducing the memory
    // requirements for the xbuffer.

    // The xbuffer_height should be == (ke-kb)+1 for all cases except when
    // the center (kc) is exactly on an integer in which case the first and
    // last entries under the kernel have a contribution of zero so it doesn't
    // matter if we drop one scanline in that case.
    // Start at the position we left off from the previous scanline. Use the
    // rotational buffer to remember the data from the previous scanline work.

    // HighQualityBicubic needs to initialize the krn value.
    // It is used to do the kernel table lookup.
    // HighQualityBilinear doesn't use this as it works out its
    // kernel by direct computation.

    // Note: this is a template class so the value of FilterMode
    // is defined at compile time. We're relying on the compiler
    // to perform dead code removal for each template instantiation
    // eliminating both the constant comparison and all the
    // code branches corresponding to other FilterMode values.
    // That way our inner loop is not impacted by extra code for
    // filter modes we're not using and extraneous conditional
    // statements.

    krn = Int32x32Mod16(ya, (kb << FIX16_SHIFT) - kc);
    pa_old = 0;

    for(k=0; k<xbuffer_height; k++)
    {
        kmod = xbuffer_start_scanline + k;
        if(kmod >= xbuffer_height) kmod -= xbuffer_height;

        // We avoid using a mod (%) computation above because we
        // know that the xbuffer_start_scanline is always within
        // the range 0..xbuffer_height-1.
        // ASSERT that this assumption is true.

        ASSERT(kmod < xbuffer_height);
        ASSERT(kmod >= 0);

        // Compute the kernel response for this pixel based on the
        // positive value of kp


        if(kb+k>ke)
        {
            // The buffer could be larger than the actual kernel,
            // in that case, simply set the extra coefficients to
            // zero.

            ycoeff[kmod] = 0;
        }
        else
        {
            ComputeKernel(ycoeff[kmod], ya, pa, pa_old, krn);
        }

        // Compute the position in the destination buffer to draw to.

        dst = xbuffer + xbuffer_width * kmod;

        // This assert fires if the arithmetic for computing the size of the
        // xbuffer or the iteration over the kernel support has a bug. The
        // xbuffer_height should be the maximum width of the kernel support.

        ASSERT(k < xbuffer_height);
        ASSERT(kmod < xbuffer_height);

        INT k_wrap = kb+k;

        // NTRAID#NTBUG9-370168-2001/04/18-asecchia
        // This is an unsigned/signed comparison.
        // NOTE: the (INT) cast is the invalid one. BmpData.Height is UINT
        // and is always positive - casting it to int is irrelevant.
        // However, the k_wrap is signed and _can_ be negative. The unsigned
        // cast is by design - it allows us to figure out both sides of the
        // wrap using one comparison.
        // The unsigned comparison >= Height tells us if k_wrap does not fall
        // within the range 0..Height-1 and therefore needs wrapping because
        // negative numbers cast to huge positive numbers and succeed the
        // comparison too.
        // NOTE also that this kind of comparison limits the effective range
        // of Height to (max unsigned)/2 with the single caveat of k_wrap being
        // equal to -MAXINT.
        // For code that's executed once per scanline, this kind of subtlety
        // is probably not warranted.

        if((UINT)(k_wrap) >= (INT)BmpData.Height)
        {
            // Handle the wrap mode here.

            if(WrapZeroClamp)
            {
                // GpMemset(dst, 0, (right-left)*sizeof(ARGB));

                // If we're filling with zero, we may as well optimize the kernel
                // contribution.

                ycoeff[kmod] = 0;

                // done this scan - go on to the next

                continue;
            }
            else
            {
                if(QWrapMode == WrapModeClamp)
                {
                    // Fill this xbuffer scanline with the clamp color.

                    INT i = right-left;
                    ARGB *d = dst;
                    while(i--)
                    {
                        *d++ = ClampColor;
                    }

                    // done this scan - go on to the next

                    continue;
                }
                else
                {
                    // Apply the general wrap code.

                    Apply1DWrapModeY(QWrapMode, k_wrap, (INT)BmpData.Height);
                    src = srcPtr0 + stride*k_wrap;

                    // Not done yet - fall through and call StretchScanline.
                }
            }
        }
        else
        {
            // If the x2 and x1 are out of order, we failed to correctly
            // compute the span in the above logic.

            // Seek to the start of the scanline.
            // Note: whatever X coordinate we add to the src pointer
            // we need to subtract from the width passed to the
            // StretchScanline routine below.

            src = srcPtr0 + stride*(k_wrap);
        }

        // Only x-scale if we haven't already done this scanline on a previous
        // call and stored the result in the xbuffer.

        if((last_k==LAST_K_UNUSED) || (
            (yscale >= 0) && (last_k-(kb+k) < 0) ||
            (yscale < 0) && (last_k-(kb+k) > 0)
            )
        )
        {

            // Filter in the x-dimension.

            StretchScanline(
                dst,
                src,
                xbuffer_width,
                static_cast<INT>(BmpData.Width),
                xkci,
                xscale,
                xw,
                xa
            );
        }
    }

    // set up the last_k for the next iteration. This represents the last
    // scanline for which we actually have x-scaled data.

    if(yscale < 0)
    {
        last_k = kb;
    }
    else
    {
        last_k = kb + xbuffer_height - 1;
    }



    // Get the final destination buffer

    ARGB *buffer = Scan->NextBuffer(left, y, width);

    // Now we have the entire buffer full with the x-dimension scaled data.

    // for every x coordinate, apply the y kernel.

    #ifdef _X86_
    if (OSInfo::HasMMX)
    {
        // MMX

        INT *ycoeffMMX = (INT *) ((((UINT_PTR) ycoeff) + 4) & ~0x7);
        INT n = (xbuffer_height + 1) >> 1;

        // Transform the kernel coefficient array into a form that is
        // easily usable by MMX code. The loop must go backward so that
        // we don't erase kernel coefficients (MMX starting point could
        // be 4 bytes ahead of integer starting point).
        // ycoeff must be large enough to hold the MMX coefficients (2 extra
        // entries)

        for (INT i = n-1; i >= 0; i--)
        {
            INT kernel1 = ycoeff[i * 2] >> 2;
            INT kernel2 = ycoeff[i * 2 + 1] >> 2;
            INT kernelMMX = (kernel1 & 0xFFFF) | (kernel2 << 16);

            ycoeffMMX[i * 2] = kernelMMX;
            ycoeffMMX[i * 2 + 1] = kernelMMX;
        }

        for(x=0; x<width; x++)
        {
            // iterate over every point under the kernel

            // Note we don't need the kmod arithmetic here because
            // we've precomputed the coefficients and we don't care what order
            // we sum them.

            BYTE *kptr = (BYTE*)(xbuffer + x + (left - ixleft));

            // Compute the increment in bytes to move from the current scanline
            // to the next in the xbuffer.

            INT kptr_inc_MMX = xbuffer_width*sizeof(ARGB);

            // Process two scanlines per iteration; bos_count keeps the
            // odd leftover scanline (0 or 1) for the tail code below.

            INT bos_count = xbuffer_height;
            INT bos_half_count = bos_count >> 1;
            bos_count &= 0x1;

            _asm
            {
                // eax - kptr
                // ebx - kptr_inc
                // ecx - counter
                // esi - ycoeff current pointer

                pxor mm0, mm0
                movq mm6, FIX14_HALF_MMX
                movq mm7, mm6
                mov eax, kptr
                mov ebx, kptr_inc_MMX
                mov ecx, bos_half_count
                mov esi, ycoeffMMX

                dec ecx
                jl bicubic_output_span_loop_last_pixel

            bicubic_output_span_loop:

                movd mm2, [eax]          // mm2 = 00000000a1r1b1g1
                movd mm4, [eax + ebx]

                punpcklbw mm2, mm0       // mm2 = 00a100r100g100b1
                movq mm1, [esi]          // mm1 = kernel2 | kernel1 | kernel2 | kernel1

                punpcklbw mm4, mm0       // mm4 = 00a200r200g200b2
                movq mm3, mm2            // mm3 = 00a100r100g100b1

                punpcklwd mm2, mm4       // mm2 = 00g200g100b200b1
                add esi, 8
                pmaddwd mm2, mm1
                punpckhwd mm3, mm4       // mm3 = 00a200a100r200r1
                paddd mm6, mm2
                dec ecx
                pmaddwd mm3, mm1
                lea eax, [eax + 2 * ebx] // does not affect flags
                paddd mm7, mm3

                jge bicubic_output_span_loop

            bicubic_output_span_loop_last_pixel:

                dec bos_count
                jl bicubic_output_span_loop_done

                movd mm2, [eax]    // mm2 = 00000000aarrggbb
                punpcklbw mm2, mm0 // mm2 = 00aa00rr00gg00bb
                movq mm3, mm2
                punpcklwd mm2, mm0 // mm2 = 000000gg000000bb
                movq mm1, [esi]    // mm1 = xxxx | kernel1 | xxxx | kernel1
                punpckhwd mm3, mm0 // mm3 = 000000aa000000rr

                pmaddwd mm2, mm1
                pmaddwd mm3, mm1

                paddd mm6, mm2
                paddd mm7, mm3

            bicubic_output_span_loop_done:

                // At this point, mm6 and mm7 contain the output channels
                // for the pixel. We need to clamp the alpha and store it
                // in the destination buffer.

                psrad mm6, 14
                psrad mm7, 14
                packssdw mm6, mm7 // mm6 = a | r | g | b
                packuswb mm6, mm6 // mm6 = 00000000aarrggbb

                movq mm7, mm6      // mm7 = 00000000aarrggbb
                psrad mm6, 24      // mm6 = xxxxxxxxxxxxxxaa
                mov eax, 0xFFFFFFFF
                punpcklbw mm6, mm6 // mm6 = xxxxxxxxxxxxaaaa
                movd mm2, eax
                punpcklbw mm6, mm6 // mm6 = xxxxxxxxaaaaaaaa

                // Clamp r,g,b to the alpha value: add then subtract the
                // per-byte headroom (0xFF - a) using saturating arithmetic.

                psubusb mm2, mm6
                mov eax, buffer
                paddusb mm7, mm2
                psubusb mm7, mm2

                movd [eax], mm7
                add eax, 4
                mov buffer, eax
            }
        }
    }
    else
    #endif // _X86_
    {
        // No MMX

        for(x=0; x<width; x++)
        {
            // Initialize the component accumulators. We accumulate the
            // contribution of each color component scaled by the kernel
            // response into these variables.

            ta = tr = tg = tb = 0;

            // iterate over every point under the kernel

            // Note we don't need the kmod arithmetic here because
            // we've precomputed the coefficients and we don't care what order
            // we sum them.

            BYTE *kptr = (BYTE*)(xbuffer + x + (left - ixleft));

            // Compute the increment in bytes to move from the current scanline
            // to the next in the xbuffer after incrementing through 3 of the
            // color channels.

            INT kptr_inc = xbuffer_width*sizeof(ARGB);

            for(k=0; k<xbuffer_height; k++)
            {
                // Find the pixel contributing to this part of the kernel
                // taking into account the edge conditions.

                // lookup the kernel coefficient for this scanline.

                pc = ycoeff[k];

                // Accumulate the contribution of this source pixel to the pixel
                // we're working on.

                AccumulateChannels(pc, kptr);

                kptr += kptr_inc;
            }

            // Done with this pixel - store it in the destination buffer.

            // clamp the results to byte range.

            ClampColors();

            // Combine the channels, set the destination pixel and increment
            // to the next pixel

            *buffer++ = (ta << 24) | (tr << 16) | (tg << 8) | tb;
        }
    }

    #ifdef _X86_
    if (OSInfo::HasMMX)
    {
        // Clear the MMX state so subsequent FPU code is safe.

        _asm
        {
            emms
        }
    }
    #endif // _X86_


    return Ok;
}
|
|
|
|
#undef ClampColors
|
|
|
|
|