/************************************************************************** * * Copyright (c) 2000 Microsoft Corporation * * Module Name & Abstract * * Stretch. This module contains the code to do various stretching * by applying a kernel filter. The code correctly handles minification. * * Note: * This module is not compiled into an .obj file, rather it is included * directly into the header file stretch.hpp. * This is due to the use of template functions. * * * Notes: * * This code does not handle rotation or shear. * * Created: * * 04/17/2000 asecchia * Created it. * **************************************************************************/ #define LAST_K_UNUSED ((INT)0x7fffffff) const INT BicubicKernelShift = 7; const INT BicubicKernelSize = 1 << BicubicKernelShift; const FIX16 BicubicKernel[BicubicKernelSize+1] = { 65536, 65496, 65379, 65186, 64920, 64583, 64177, 63705, 63168, 62569, 61911, 61195, 60424, 59600, 58725, 57802, 56832, 55818, 54763, 53668, 52536, 51369, 50169, 48939, 47680, 46395, 45087, 43757, 42408, 41042, 39661, 38268, 36864, 35452, 34035, 32614, 31192, 29771, 28353, 26941, 25536, 24141, 22759, 21391, 20040, 18708, 17397, 16110, 14848, 13614, 12411, 11240, 10104, 9005, 7945, 6927, 5952, 5023, 4143, 3313, 2536, 1814, 1149, 544, 0, -496, -961, -1395, -1800, -2176, -2523, -2843, -3136, -3403, -3645, -3862, -4056, -4227, -4375, -4502, -4608, -4694, -4761, -4809, -4840, -4854, -4851, -4833, -4800, -4753, -4693, -4620, -4536, -4441, -4335, -4220, -4096, -3964, -3825, -3679, -3528, -3372, -3211, -3047, -2880, -2711, -2541, -2370, -2200, -2031, -1863, -1698, -1536, -1378, -1225, -1077, -936, -802, -675, -557, -448, -349, -261, -184, -120, -69, -31, -8, 0 }; const FIX16 SymmetricBicubicKernel[BicubicKernelSize * 2 + 1] = { 0, -8, -31, -69, -120, -184, -261,-349, -448, -557, -675, -802, -936, -1077, -1225, -1378, -1536, -1698, -1863, -2031, -2200, -2370, -2541, -2711, -2880, -3047, -3211, -3372, -3528, -3679, -3825, -3964, -4096, -4220, -4335, -4441, -4536, -4620, -4693, -4753, -4800, -4833, -4851, -4854, -4840, -4809, -4761, -4694, -4608, -4502, -4375, -4227, -4056, -3862, -3645, -3403, -3136, -2843, -2523, -2176, -1800, -1395, -961, -496, 0, 544, 1149, 1814, 2536, 3313, 4143, 5023, 5952, 6927, 7945,9005, 10104, 11240, 12411, 13614, 14848, 16110, 17397, 18708, 20040, 21391, 22759, 24141, 25536, 26941, 28353, 29771, 31192, 32614, 34035, 35452, 36864, 38268, 39661, 41042, 42408, 43757, 45087, 46395, 47680, 48939, 50169, 51369, 52536, 53668, 54763, 55818, 56832, 57802, 58725, 59600, 60424, 61195, 61911, 62569, 63168, 63705, 64177, 64583, 64920, 65186, 65379, 65496, 65536, 65496, 65379, 65186, 64920, 64583, 64177, 63705, 63168, 62569, 61911, 61195, 60424, 59600, 58725, 57802, 56832, 55818, 54763, 53668, 52536, 51369, 50169, 48939, 47680, 46395, 45087, 43757, 42408, 41042, 39661, 38268, 36864, 35452, 34035, 32614, 31192, 29771, 28353, 26941, 25536, 24141, 22759, 21391, 20040, 18708, 17397, 16110, 14848, 13614, 12411, 11240, 10104, 9005, 7945, 6927, 5952, 5023, 4143, 3313, 2536, 1814, 1149, 544, 0, -496, -961, -1395, -1800, -2176, -2523, -2843, -3136, -3403, -3645, -3862, -4056, -4227, -4375, -4502, -4608, -4694, -4761, -4809, -4840, -4854, -4851, -4833, -4800, -4753, -4693, -4620, -4536, -4441, -4335, -4220, -4096, -3964, -3825, -3679, -3528, -3372, -3211, -3047, -2880, -2711, -2541, -2370, -2200, -2031, -1863, -1698, -1536, -1378, -1225, -1077, -936, -802, -675, -557, -448, -349, -261, -184, -120, -69, -31, -8, 0 }; /* // Higher precision bicubic kernel - more data. 
// Commented out in case we eventually need it. const FIX16 BK[512+1] = { 0, -2, -8, -18, -31, -48, -69, -93, -120, -151, -184, -221, -261, -304, -349, -397, -448, -501, -557, -615, -675, -737, -802, -868, -936, -1006, -1077, -1150, -1225, -1301, -1378, -1457, -1536, -1616, -1698, -1780, -1863, -1947, -2031, -2115, -2200, -2285, -2370, -2456, -2541, -2626, -2711, -2796, -2880, -2964, -3047, -3129, -3211, -3292, -3372, -3450, -3528, -3604, -3679, -3753, -3825, -3895, -3964, -4031, -4096, -4159, -4220, -4279, -4335, -4389, -4441, -4490, -4536, -4580, -4620, -4658, -4693, -4725, -4753, -4778, -4800, -4818, -4833, -4844, -4851, -4854, -4854, -4849, -4840, -4827, -4809, -4787, -4761, -4730, -4694, -4654, -4608, -4557, -4502, -4441, -4375, -4304, -4227, -4144, -4056, -3962, -3862, -3757, -3645, -3527, -3403, -3273, -3136, -2993, -2843, -2686, -2523, -2353, -2176, -1991, -1800, -1601, -1395, -1182, -961, -732, -496, -252, 0, 264, 544, 839, 1149, 1474, 1814, 2168, 2536, 2918, 3313, 3722, 4143, 4577, 5023, 5482, 5952, 6434, 6927, 7430, 7945, 8470, 9005, 9550, 10104, 10668, 11240, 11821, 12411, 13009, 13614, 14228, 14848, 15475, 16110, 16750, 17397, 18050, 18708, 19371, 20040, 20713, 21391, 22073, 22759, 23449, 24141, 24837, 25536, 26237, 26941, 27646, 28353, 29061, 29771, 30481, 31192, 31903, 32614, 33325, 34035, 34744, 35452, 36159, 36864, 37567, 38268, 38966, 39661, 40353, 41042, 41727, 42408, 43085, 43757, 44425, 45087, 45744, 46395, 47041, 47680, 48313, 48939, 49557, 50169, 50773, 51369, 51957, 52536, 53107, 53668, 54220, 54763, 55296, 55818, 56331, 56832, 57322, 57802, 58269, 58725, 59169, 59600, 60018, 60424, 60816, 61195, 61560, 61911, 62248, 62569, 62876, 63168, 63444, 63705, 63949, 64177, 64388, 64583, 64760, 64920, 65062, 65186, 65292, 65379, 65447, 65496, 65526, 65536, 65526, 65496, 65447, 65379, 65292, 65186, 65062, 64920, 64760, 64583, 64388, 64177, 63949, 63705, 63444, 63168, 62876, 62569, 62248, 61911, 61560, 61195, 60816, 60424, 60018, 59600, 59169, 58725, 58269, 57802, 57322, 56832, 56331, 55818, 55296, 54763, 54220, 53668, 53107, 52536, 51957, 51369, 50773, 50169, 49557, 48939, 48313, 47680, 47041, 46395, 45744, 45087, 44425, 43757, 43085, 42408, 41727, 41042, 40353, 39661, 38966, 38268, 37567, 36864, 36159, 35452, 34744, 34035, 33325, 32614, 31903, 31192, 30481, 29771, 29061, 28353, 27646, 26941, 26237, 25536, 24837, 24141, 23449, 22759, 22073, 21391, 20713, 20040, 19371, 18708, 18050, 17397, 16750, 16110, 15475, 14848, 14228, 13614, 13009, 12411, 11821, 11240, 10668, 10104, 9550, 9005, 8470, 7945, 7430, 6927, 6434, 5952, 5482, 5023, 4577, 4143, 3722, 3313, 2918, 2536, 2168, 1814, 1474, 1149, 839, 544, 264, 0, -252, -496, -732, -961, -1182, -1395, -1601, -1800, -1991, -2176, -2353, -2523, -2686, -2843, -2993, -3136, -3273, -3403, -3527, -3645, -3757, -3862, -3962, -4056, -4144, -4227, -4304, -4375, -4441, -4502, -4557, -4608, -4654, -4694, -4730, -4761, -4787, -4809, -4827, -4840, -4849, -4854, -4854, -4851, -4844, -4833, -4818, -4800, -4778, -4753, -4725, -4693, -4658, -4620, -4580, -4536, -4490, -4441, -4389, -4335, -4279, -4220, -4159, -4096, -4031, -3964, -3895, -3825, -3753, -3679, -3604, -3528, -3450, -3372, -3292, -3211, -3129, -3047, -2964, -2880, -2796, -2711, -2626, -2541, -2456, -2370, -2285, -2200, -2115, -2031, -1947, -1863, -1780, -1698, -1616, -1536, -1457, -1378, -1301, -1225, -1150, -1077, -1006, -936, -868, -802, -737, -675, -615, -557, -501, -448, -397, -349, -304, -261, -221, -184, -151, -120, -93, -69, -48, -31, -18, -8, -2, 0 }; // Bicubic kernel with the 
'perceptual' coefficient tweaked // see Wolberg. Provides a slightly different experience. // Commented out in case we eventually need it. const FIX16 BK_V[512+1] = { 0, -4, -16, -35, -62, -96, -137, -185, -240, -301, -369, -442, -522, -607, -698, -795, -896, -1002, -1114, -1230, -1350, -1475, -1603, -1736, -1872, -2012, -2155, -2301, -2450, -2602, -2756, -2913, -3072, -3233, -3396, -3560, -3726, -3893, -4061, -4230, -4400, -4570, -4741, -4911, -5082, -5252, -5422, -5592, -5760, -5927, -6094, -6259, -6422, -6584, -6743, -6901, -7056, -7209, -7359, -7506, -7650, -7791, -7928, -8062, -8192, -8318, -8440, -8557, -8670, -8778, -8881, -8979, -9072, -9159, -9241, -9316, -9386, -9449, -9506, -9557, -9600, -9636, -9666, -9688, -9702, -9709, -9707, -9698, -9680, -9654, -9619, -9575, -9522, -9460, -9388, -9307, -9216, -9115, -9004, -8882, -8750, -8607, -8453, -8288, -8112, -7924, -7725, -7513, -7290, -7054, -6806, -6546, -6272, -5985, -5686, -5373, -5046, -4706, -4351, -3983, -3600, -3203, -2791, -2364, -1922, -1465, -992, -504, 0, 516, 1040, 1571, 2110, 2656, 3209, 3769, 4336, 4909, 5489, 6074, 6666, 7263, 7866, 8475, 9088, 9706, 10330, 10958, 11590, 12227, 12867, 13512, 14160, 14812, 15467, 16125, 16786, 17450, 18116, 18785, 19456, 20129, 20804, 21480, 22158, 22837, 23517, 24198, 24880, 25562, 26245, 26927, 27610, 28292, 28974, 29656, 30336, 31015, 31694, 32371, 33046, 33720, 34391, 35061, 35728, 36393, 37055, 37714, 38370, 39023, 39672, 40318, 40960, 41598, 42232, 42861, 43486, 44106, 44721, 45331, 45936, 46535, 47129, 47716, 48298, 48873, 49442, 50005, 50560, 51108, 51650, 52184, 52710, 53229, 53739, 54242, 54736, 55222, 55699, 56167, 56626, 57076, 57516, 57947, 58368, 58779, 59180, 59570, 59950, 60319, 60677, 61024, 61360, 61684, 61997, 62297, 62586, 62862, 63126, 63378, 63616, 63841, 64054, 64253, 64438, 64610, 64767, 64911, 65040, 65155, 65255, 65340, 65410, 65465, 65504, 65528, 65536, 65528, 65504, 65465, 65410, 65340, 65255, 65155, 65040, 64911, 64767, 64610, 64438, 64253, 64054, 63841, 63616, 63378, 63126, 62862, 62586, 62297, 61997, 61684, 61360, 61024, 60677, 60319, 59950, 59570, 59180, 58779, 58368, 57947, 57516, 57076, 56626, 56167, 55699, 55222, 54736, 54242, 53739, 53229, 52710, 52184, 51650, 51108, 50560, 50005, 49442, 48873, 48298, 47716, 47129, 46535, 45936, 45331, 44721, 44106, 43486, 42861, 42232, 41598, 40960, 40318, 39672, 39023, 38370, 37714, 37055, 36393, 35728, 35061, 34391, 33720, 33046, 32371, 31694, 31015, 30336, 29656, 28974, 28292, 27610, 26927, 26245, 25562, 24880, 24198, 23517, 22837, 22158, 21480, 20804, 20129, 19456, 18785, 18116, 17450, 16786, 16125, 15467, 14812, 14160, 13512, 12867, 12227, 11590, 10958, 10330, 9706, 9088, 8475, 7866, 7263, 6666, 6074, 5489, 4909, 4336, 3769, 3209, 2656, 2110, 1571, 1040, 516, 0, -504, -992, -1465, -1922, -2364, -2791, -3203, -3600, -3983, -4351, -4706, -5046, -5373, -5686, -5985, -6272, -6546, -6806, -7054, -7290, -7513, -7725, -7924, -8112, -8288, -8453, -8607, -8750, -8882, -9004, -9115, -9216, -9307, -9388, -9460, -9522, -9575, -9619, -9654, -9680, -9698, -9707, -9709, -9702, -9688, -9666, -9636, -9600, -9557, -9506, -9449, -9386, -9316, -9241, -9159, -9072, -8979, -8881, -8778, -8670, -8557, -8440, -8318, -8192, -8062, -7928, -7791, -7650, -7506, -7359, -7209, -7056, -6901, -6743, -6584, -6422, -6259, -6094, -5927, -5760, -5592, -5422, -5252, -5082, -4911, -4741, -4570, -4400, -4230, -4061, -3893, -3726, -3560, -3396, -3233, -3072, -2913, -2756, -2602, -2450, -2301, -2155, -2012, -1872, -1736, -1603, -1475, -1350, -1230, 
-1114, -1002, -896, -795, -698, -607, -522, -442, -369, -301, -240, -185, -137, -96, -62, -35, -16, -4, 0 }; */ // This is the table of partial sums of the bilinear kernel. // Simply put, each point in the array represents the integral // from -infinity to position x in the kernel function. // We can subtract two table lookups to get the integral // of the kernel (area) between the two points. // The table is padded with zeros and ones at the beginning and end // so we can consistently address areas outside of the actual kernel // Currently we don't make use of the zeros at the beginning but // we definitely sample past the end by at least one half-width // of the kernel. const FIX16 BilinearPartialIntegral[512+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,2, 8, 18, 32, 50, 72, 98, 128, 162, 200, 242, 288, 338, 392, 450, 512, 578, 648, 722, 800, 882, 968, 1058, 1152, 1250, 1352, 1458, 1568, 1682, 1800, 1922, 2048, 2178, 2312, 2450, 2592, 2738, 2888, 3042, 3200, 3362, 3528, 3698, 3872, 4050, 4232, 4418, 4608, 4802, 5000, 5202, 5408, 5618, 5832, 6050, 6272, 6498, 6728, 6962, 7200, 7442, 7688, 7938, 8192, 8450, 8712, 8978, 9248, 9522, 9800, 10082, 10368, 10658, 10952, 11250, 11552, 11858, 12168, 12482, 12800, 13122, 13448, 13778, 14112, 14450, 14792, 15138, 15488, 15842, 16200, 16562, 16928, 17298, 17672, 18050, 18432, 18818, 19208, 19602, 20000, 20402, 20808, 21218, 21632, 22050, 22472, 22898, 23328, 23762, 24200, 24642, 25088, 25538, 25992, 26450, 26912, 27378, 27848, 28322, 28800, 29282, 29768, 30258, 30752, 31250, 31752, 32258, 32768, // center of the kernel. 
Index 256 33278, 33784, 34286, 34784, 35278, 35768, 36254, 36736, 37214, 37688, 38158, 38624, 39086, 39544, 39998, 40448, 40894, 41336, 41774, 42208, 42638, 43064, 43486, 43904, 44318, 44728, 45134, 45536, 45934, 46328, 46718, 47104, 47486, 47864, 48238, 48608, 48974, 49336, 49694, 50048, 50398, 50744, 51086, 51424, 51758, 52088, 52414, 52736, 53054, 53368, 53678, 53984, 54286, 54584, 54878, 55168, 55454, 55736, 56014, 56288, 56558, 56824, 57086, 57344, 57598, 57848, 58094, 58336, 58574, 58808, 59038, 59264, 59486, 59704, 59918, 60128, 60334, 60536, 60734, 60928, 61118, 61304, 61486, 61664, 61838, 62008, 62174, 62336, 62494, 62648, 62798, 62944, 63086, 63224, 63358, 63488, 63614, 63736, 63854, 63968, 64078, 64184, 64286, 64384, 64478, 64568, 64654, 64736, 64814, 64888, 64958, 65024, 65086, 65144, 65198, 65248, 65294, 65336, 65374, 65408, 65438, 65464, 65486, 65504, 65518, 65528, 65534, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, }; // This is the table of partial sums of the bicubic kernel. // Simply put, each point in the array represents the integral // from -infinity to position x in the kernel function. // We can subtract two table lookups to get the integral // of the kernel (area) between the two points. // The table is padded with zeros and ones at the beginning and end // so we can consistently address areas outside of the actual kernel // Currently we don't make use of the zeros at the beginning but // we definitely sample past the end by at least one half-width // of the kernel. 
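// For illustration only - the helper below is not part of this module and is
// kept disabled.  It shows how the partial-integral tables are used: the
// weight of a single source pixel is the difference of two table lookups.
// x0 and x1 are kernel-relative FIX16 positions in the same normalized form
// as the krn variable used by the ComputeKernel macro further down, and
// >> 8 is the same index mapping that macro uses (CPIC is the biased center
// pointer declared after these tables).

#if 0
inline FIX16
IllustrativeBicubicWeight(
    FIX16 x0,    // left edge of the pixel's interval, relative to the center
    FIX16 x1     // right edge of the pixel's interval (x0 < x1)
    )
{
    // Area under the bicubic kernel between x0 and x1.
    return CPIC[x1 >> 8] - CPIC[x0 >> 8];
}
#endif
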
const FIX16 BicubicPartialIntegral[1024+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -2, -3, -4, -6, -8, -11, -15, -19, -24, -29, -35, -42, -50, -59, -68, -79, -90, -103, -117, -131, -147, -164, -182, -201, -221, -243, -265, -289, -315, -341, -369, -398, -429, -460, -493, -528, -563, -600, -639, -679, -720, -762, -806, -851, -897, -945, -993, -1044, -1095, -1148, -1202, -1257, -1313, -1371, -1429, -1489, -1550, -1612, -1675, -1739, -1804, -1870, -1937, -2004, -2073, -2142, -2212, -2283, -2355, -2427, -2500, -2573, -2647, -2721, -2796, -2871, -2946, -3022, -3097, -3173, -3249, -3325, -3401, -3476, -3552, -3627, -3702, -3776, -3850, -3923, -3996, -4068, -4139, -4209, -4279, -4347, -4414, -4481, -4545, -4609, -4671, -4731, -4790, -4847, -4902, -4955, -5006, -5055, -5102, -5146, -5188, -5228, -5264, -5298, -5329, -5358, -5383, -5404, -5423, -5438, -5449, -5457, -5461, -5461, -5457, -5449, -5437, -5420, -5399, -5374, -5345, -5311, -5273, -5230, -5182, -5130, -5073, -5012, -4946, -4875, -4799, -4718, -4633, -4542, -4447, -4346, -4240, -4130, -4014, -3893, -3767, -3636, -3500, -3358, -3212, -3060, -2902, -2740, -2572, -2399, -2220, -2037, -1848, -1653, -1454, -1249, -1038, -822, -601, -375, -143, 94, 336, 584, 836, 1095, 1358, 1627, 1901, 2180, 2464, 2754, 3048, 3348, 3653, 3963, 4278, 4598, 4923, 5253, 5588, 5927, 6272, 6621, 6975, 7334, 7698, 8066, 8439, 8816, 9198, 9584, 9975, 10370, 10769, 11173, 11580, 11992, 12408, 12828, 13252, 13679, 14111, 14546, 14985, 15427, 15873, 16322, 16775, 17231, 17690, 18152, 18618, 19086, 19557, 20032, 20508, 20988, 21470, 21954, 22441, 22930, 23421, 23914, 24409, 24906, 25405, 25905, 26407, 26911, 27415, 27921, 28428, 28937, 29446, 29955, 30466, 30977, 31488, 32000, 32512, 33024, // center of the kernel. 
Index 512 33536, 34048, 34559, 35070, 35581, 36090, 36599, 37108, 37615, 38121, 38625, 39129, 39631, 40131, 40630, 41127, 41622, 42115, 42606, 43095, 43582, 44066, 44548, 45028, 45504, 45979, 46450, 46918, 47384, 47846, 48305, 48761, 49214, 49663, 50109, 50551, 50990, 51425, 51857, 52284, 52708, 53128, 53544, 53956, 54363, 54767, 55166, 55561, 55952, 56338, 56720, 57097, 57470, 57838, 58202, 58561, 58915, 59264, 59609, 59948, 60283, 60613, 60938, 61258, 61573, 61883, 62188, 62488, 62782, 63072, 63356, 63635, 63909, 64178, 64441, 64700, 64952, 65200, 65442, 65679, 65911, 66137, 66358, 66574, 66785, 66990, 67189, 67384, 67573, 67756, 67935, 68108, 68276, 68438, 68596, 68748, 68894, 69036, 69172, 69303, 69429, 69550, 69666, 69776, 69882, 69983, 70078, 70169, 70254, 70335, 70411, 70482, 70548, 70609, 70666, 70718, 70766, 70809, 70847, 70881, 70910, 70935, 70956, 70973, 70985, 70993, 70997, 70997, 70993, 70985, 70974, 70959, 70940, 70919, 70894, 70865, 70834, 70800, 70764, 70724, 70682, 70638, 70591, 70542, 70491, 70438, 70383, 70326, 70267, 70207, 70145, 70081, 70017, 69950, 69883, 69815, 69745, 69675, 69604, 69532, 69459, 69386, 69312, 69238, 69163, 69088, 69012, 68937, 68861, 68785, 68709, 68633, 68558, 68482, 68407, 68332, 68257, 68183, 68109, 68036, 67963, 67891, 67819, 67748, 67678, 67609, 67540, 67473, 67406, 67340, 67275, 67211, 67148, 67086, 67025, 66965, 66907, 66849, 66793, 66738, 66684, 66631, 66580, 66529, 66481, 66433, 66387, 66342, 66298, 66256, 66215, 66175, 66136, 66099, 66064, 66029, 65996, 65965, 65934, 65905, 65877, 65851, 65825, 65801, 65779, 65757, 65737, 65718, 65700, 65683, 65667, 65653, 65639, 65626, 65615, 65604, 65595, 65586, 65578, 65571, 65565, 65560, 65555, 65551, 65547, 65544, 65542, 65540, 65539, 65538, 65537, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 
65536, 65536, 65536, 65536, 65536, 65536, }; // We use a biased pointer to the center of the array // so that we can look up the negative part of the kernel // without repositioning the index or using an absolute value // computation in the inner loop. // Linear Partial Integral Center. const FIX16 *LPIC = &BilinearPartialIntegral[256]; // Cubic Partial Integral Center. const FIX16 *CPIC = &BicubicPartialIntegral[512]; const FIX16 *SymmetricBicubicKernelCenter = &SymmetricBicubicKernel[128]; const ULONGLONG FIX14_HALF_MMX = 0x0000200000002000; /************************************************************************** * * Function Description: * * Constructor for the DpOutputSpanStretch class. * * Return Value: * * NONE * * Created: * * 04/17/2000 asecchia * Created it. * **************************************************************************/ #define FIX4TOFIX16_SHIFT (FIX16_SHIFT - FIX4_SHIFT) template void DpOutputSpanStretch::InitializeClass( DpBitmap* bitmap, DpScanBuffer * scan, DpContext* /*context*/, DpImageAttributes imgAttributes, const GpRectF *dstRect, const GpRectF *srcRect ) { isValid = true; // Make sure these get initialized up front before we can early out // otherwise we could end up freeing uninitialized pointers in our // destructor. ycoeff = NULL; xbuffer = NULL; Scan = scan; dBitmap = bitmap; QWrapMode = imgAttributes.wrapMode; ClampColor = imgAttributes.clampColor; ClampColorA = (BYTE)( (ClampColor >> 24) ); ClampColorR = (BYTE)( (ClampColor >> 16) & 0xff); ClampColorG = (BYTE)( (ClampColor >> 8) & 0xff); ClampColorB = (BYTE)( ClampColor & 0xff); // Accleration for clamp mode with zero clamp color (transparent) WrapZeroClamp = FALSE; if((QWrapMode == WrapModeClamp) && (imgAttributes.clampColor == 0)) { WrapZeroClamp = TRUE; } ASSERT(dBitmap != NULL); ASSERT(dBitmap->IsValid()); // on bad bitmap, we return with Valid = FALSE if (dBitmap == NULL || !dBitmap->IsValid() ) { dBitmap = NULL; isValid = false; return; } else { BmpData.Width = dBitmap->Width; BmpData.Height = dBitmap->Height; BmpData.PixelFormat = PIXFMT_32BPP_PARGB; BmpData.Stride = dBitmap->Delta; BmpData.Scan0 = dBitmap->Bits; } if(srcRect) SrcRect = *srcRect; else { SrcRect.X = 0.0f; SrcRect.Y = 0.0f; SrcRect.Width = (REAL)dBitmap->Width; SrcRect.Height = (REAL) dBitmap->Height; } // Set up the translation. if(dstRect) { DstRect = *dstRect; } else { DstRect.X = 0.0f; DstRect.Y = 0.0f; DstRect.Width = (REAL)SrcRect.Width; DstRect.Height = (REAL)SrcRect.Height; } if( !GpValidFixed16(SrcRect.X) || !GpValidFixed16(SrcRect.Y) || !GpValidFixed16(SrcRect.Width) || !GpValidFixed16(SrcRect.Height) || !GpValidFixed16(DstRect.X) || !GpValidFixed16(DstRect.Y) || !GpValidFixed16(DstRect.Width) || !GpValidFixed16(DstRect.Height) ) { // punt isValid = false; return; } // Initialize the state for the x-dimension scale. xscale = GpRealToFix16(SrcRect.Width/DstRect.Width); xscaleinv = GpRealToFix16(DstRect.Width/SrcRect.Width); // Initialize the state for the y-dimension scale. yscale = GpRealToFix16(SrcRect.Height/DstRect.Height); yscaleinv = GpRealToFix16(DstRect.Height/SrcRect.Height); // Compute the destination contribution. // Note: the actual pixels touched are the floor of // the top left to the ceiling of the bottom right. // (modulus the clipping) // Note: We want to be tracking our internal state in FIX16 so we have // the extra fractional precision, but when we compute our bounds for the // drawing, we use Ceiling and Floor on these FIX16 numbers below. 
We want // the rounding to match the rounding of the FIX4 numbers (i.e. we don't // want to track any extra fractional precision errors from the float // representation) because we use FIX4 in our DrawImage loop. // To accomplish this, we round to FIX4 dropping all error that is smaller // than the FIX4 precision and then upconvert to FIX16. Now when we use // Fix16Ceiling and Floor, we'll get the same results as Fix4Ceiling and // Floor. REAL xinv = DstRect.Width/SrcRect.Width; REAL yinv = DstRect.Height/SrcRect.Height; fixDLeft = GpRealToFix4(DstRect.X); fixDRight = GpRealToFix4(xinv * (SrcRect.Width) + DstRect.X); fixDTop = GpRealToFix4(DstRect.Y); fixDBottom = GpRealToFix4(yinv * (SrcRect.Height) + DstRect.Y); // Handle negative scale FIX16 fixTemp; if(fixDLeft > fixDRight) { // Swap the left and right x coordinates. fixTemp = fixDLeft; fixDLeft = fixDRight; fixDRight = fixTemp; } if(fixDTop > fixDBottom) { // Swap the top and bottom x coordinates. fixTemp = fixDTop; fixDTop = fixDBottom; fixDBottom = fixTemp; } // Compute the left edge using the rasterizer rounding rules. Used // for clipping in x. ixleft = GpFix4Ceiling(fixDLeft); // Convert up to FIX16. fixDLeft <<= FIX4TOFIX16_SHIFT; fixDRight <<= FIX4TOFIX16_SHIFT; fixDTop <<= FIX4TOFIX16_SHIFT; fixDBottom <<= FIX4TOFIX16_SHIFT; // Get the initial kernel center. This specifies the x-dimension // fractional pixel offset. if(xscale < 0) { xkci = GpRealToFix16( (((DstRect.X+DstRect.Width) - GpFix16Ceiling(fixDRight)) * (xscale)) / FIX16_ONE + SrcRect.X ); } else { xkci = GpRealToFix16( ((DstRect.X - GpFix16Floor(fixDLeft)) * xscale) / FIX16_ONE + SrcRect.X ); } // Get the width of the kernel. // Make sure to multiply by the actual width of the filter kernel in // normalized space (FilterWidth[i]) xw = GpRealToFix16( (SrcRect.Width*FilterWidth[FilterMode]) / DstRect.Width ); // convert to FIX16 // Handle the negative transform if(xscale < 0) { xw = -xw; } // the width of the kernel must be a positive quantity. ASSERT(xw >= 0); // if the width is less than one we're doing a stretch, not a shrink. // in this case we clamp the kernel size to one. if(xw < FIX16_ONE * FilterWidth[FilterMode]) { xw = FIX16_ONE * FilterWidth[FilterMode]; } // a is 1/w - used to work out the tent filter. xa = GpRealToFix16(65536.0f/xw); // Get the initial kernel center. This specifies the y-dimension // fractional pixel offset. if(yscale < 0) { ykci = GpRealToFix16( ((GpFix16Ceiling(fixDBottom) - (DstRect.Y+DstRect.Height)) * (-yscale)) / FIX16_ONE + SrcRect.Y ); } else { ykci = GpRealToFix16( ((GpFix16Floor(fixDTop) - DstRect.Y) * yscale) / FIX16_ONE + SrcRect.Y ); } // Get the width of the kernel. // Make sure to multiply by the actual width of the filter kernel in // normalized space (FilterWidth[i]) yw = GpRealToFix16( (SrcRect.Height * FilterWidth[FilterMode]) / DstRect.Height ); // Convert to FIX16 // Handle the negative transform if(yscale < 0) { yw = -yw; } // the width of the kernel must be a positive quantity. ASSERT(yw >= 0); // if the kernel width is less than one we're doing a stretch, not // a shrink. In this case we clamp the kernel size to one. if(yw < (FIX16_ONE * FilterWidth[FilterMode])) { yw = FIX16_ONE * FilterWidth[FilterMode]; } // a is 1/w - used to work out the tent filter. ya = GpRealToFix16(65536.0f/yw); // !!! [asecchia] The rounding used here should match the rounding used to compute // the parameters to StretchBitsMainLoop. iytop = GpFix16Floor(fixDTop); // Compute the width of one scanline in the destination. 
xbuffer_width = GpFix16Ceiling(fixDRight) - GpFix16Floor(fixDLeft); ASSERT(xbuffer_width >= 0); xbuffer_height = GpFix16Ceiling(yw)*2+1; ASSERT(xbuffer_height >= 0); // set the rotational array to start at the first scanline. xbuffer_start_scanline = 0; // allocate the xbuffer. // !!! PERF [asecchia]. Ouch this is ugly. // we should at least try use a stack buffer for small images. // Maybe a lookaside list or something. xbuffer = (ARGB *)GpMalloc(xbuffer_height*xbuffer_width*sizeof(ARGB)); // ycoeff needs to have 2 entries more than xbuffer_height because // it may be reused to store the MMX coefficients (see OutputSpan // routine for details). ycoeff = (FIX16 *)GpMalloc((xbuffer_height + 2) * sizeof(FIX16)); if((NULL == ycoeff) || (NULL == xbuffer)) { isValid = false; GpFree(xbuffer); GpFree(ycoeff); // Make sure these get initialized to NULL before we can early out // otherwise we could end up double freeing the pointers in our // destructor. xbuffer = NULL; ycoeff = NULL; return; } // set the initial value of last_k to maxint last_k = LAST_K_UNUSED; } /**************************************************************************\ * * Function Description: * * This function performs a 1d stretch using the tent filter * * Arguments: * * dst - destination buffer * src - source pixels * dw - destination width in pixels * sw - source width in pixels * kci - the initial kernel centering position (for fractional translate) * scale - the scale of the filter - sw/dw * w - the width of the filter kernel - typically the ceiling of sw/dw * a - 1/w * * History: * 04/16/2000 asecchia created it. * \**************************************************************************/ // !!! Perf [asecchia] For really complicated wrapmodes where many of the // pixels are outside of the source and hence need to be wrapped, it may // make more sense to copy the source into an extended buffer and pre-wrap // the end points (i.e. overallocate) for each scanline. // This could simplify the code for the complex wrap conditions. // However, for the simple codepath, this would give an extra copy per // pixel and might not be worth it. // Ick. Why does the compiler do a better job of optimizing macros? // These should really be inline function calls. #define ClampColors() \ if(FilterMode == HighQualityBilinear) \ { \ ta = GpFix16Round(ta); \ tr = GpFix16Round(tr); \ tg = GpFix16Round(tg); \ tb = GpFix16Round(tb); \ if(ta>255) ta = 255; \ if(tr>255) tr = 255; \ if(tg>255) tg = 255; \ if(tb>255) tb = 255; \ } \ if(FilterMode == HighQualityBicubic) \ { \ ta = GpFix16Round(ta); \ tr = GpFix16Round(tr); \ tg = GpFix16Round(tg); \ tb = GpFix16Round(tb); \ if(ta>255) ta = 255; \ if(tr>ta) tr = ta; \ if(tg>ta) tg = ta; \ if(tb>ta) tb = ta; \ if(ta<0) ta = 0; \ if(tr<0) tr = 0; \ if(tg<0) tg = 0; \ if(tb<0) tb = 0; \ } // Compute the kernel in the inner loop // Note: the If statements are compiled away in the final code // because they are template variable comparisons which can be // done at compile time. // This macro looks up the new kernel value, subtracts the old one // to get the area of contribution for this pixel, computes the // new kernel position and stores the current table lookup. 
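// A rough C equivalent for the bicubic case (illustration only, kept
// disabled - the macro below is what the inner loops actually expand to):

#if 0
inline void
IllustrativeComputeKernelBicubic(
    FIX16 &pc,       // out: weight (area) contributed by this pixel
    FIX16 a,         // per-pixel step in kernel-normalized units (1/w)
    FIX16 &pa,       // current partial-integral lookup
    FIX16 &pa_old,   // previous partial-integral lookup
    FIX16 &krn       // current kernel-relative position
    )
{
    pa = CPIC[krn >> 8];    // integral from -inf to the current position
    pc = pa - pa_old;       // area between the previous and current position
    krn += a;               // advance to the next source pixel
    pa_old = pa;            // remember this lookup for the next iteration
}
#endif
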
#define ComputeKernel(pc, a, pa, pa_old, krn) \ if(FilterMode == HighQualityBilinear) \ { \ pa = LPIC[krn >> 9]; \ pc = pa-pa_old; \ krn += (a); \ pa_old = pa; \ } \ if(FilterMode == HighQualityBicubic) \ { \ pa = CPIC[krn >> 8]; \ pc = pa-pa_old; \ krn += (a); \ pa_old = pa; \ } // This block of code accumulates the individual channels from // kptr into the accumulation buffers tb, tg, tr, and ta. #define AccumulateChannels(pc, kptr) \ { \ tb += pc * kptr[0]; \ tg += pc * kptr[1]; \ tr += pc * kptr[2]; \ ta += pc * kptr[3]; \ } inline void Apply1DWrapModeX(INT WrapMode, INT &x, INT w) { INT xm; switch(WrapMode) { case WrapModeTileFlipY: case WrapModeTile: x = RemainderI(x, w); break; case WrapModeTileFlipX: case WrapModeTileFlipXY: xm = RemainderI(x, w); if(((x-xm)/w) & 1) { x = w-1-xm; } else { x = xm; } break; default: // Caller should correctly anticipate other wrap modes. ASSERT(FALSE); break; } } inline void Apply1DWrapModeY(INT WrapMode, INT &y, INT h) { INT ym; switch(WrapMode) { case WrapModeTile: case WrapModeTileFlipX: y = RemainderI(y, h); break; case WrapModeTileFlipY: case WrapModeTileFlipXY: ym = RemainderI(y, h); if(((y-ym)/h) & 1) { y = h-1-ym; } else { y = ym; } break; default: // Caller should correctly anticipate other wrap modes. ASSERT(FALSE); break; } } #undef RemainderI /************************************************************************** * * Function Description: * * Outputs the middle pixels in a 2:1 stretched scanline. Note that * this function doesn't need to handle wrap modes. * * Note: this function must not use floating point values, because it could be * called with an invalid floating point state (prior to the call to emms) * * Arguments: * * dst - The first pixel to be output * src - The first pixel in the source that will affect the destination * pixel in a bicubic 2:1 stretch * dw - The number of pixels in the destination * kci - The subpixel shift in the position of the destination pixels * **************************************************************************/ void DpOutputSpanStretch::StretchMiddleScanline2_MMX( ARGB *dst, ARGB *src, INT dw, FIX16 kci ) { ASSERT(FALSE); } void DpOutputSpanStretch::StretchMiddleScanline2_MMX( ARGB *dst, ARGB *src, INT dw, FIX16 kci ) { #if defined(_X86_) // // In order to store the kernel multipliers in 16bit registers, we // will lose the bottom 3 precision bits (hence each k[i] must be // right shifted by three). The summation of the kernel multipliers // should come to 16K, hence KERNEL_SHIFT_AMOUNT is 14. 
// #define KERNEL_SHIFT_AMOUNT 14 FIX16 k[8]; FIX16 kernelIncrement = FIX16_ONE >> 2 ; FIX16 kCurrent = (kci >> 2) - FIX16_ONE; for (INT i = 0; i < 8; i++) { ASSERT(kCurrent >= -FIX16_ONE); ASSERT(kCurrent <= FIX16_ONE); k[i] = SymmetricBicubicKernelCenter[kCurrent >> (FIX16_SHIFT-BicubicKernelShift)]; k[i] >>= 3; kCurrent += kernelIncrement; } // // Setup 64bit aligned workspace for the MMX code // // 0 - zero // 8 - kernel multiplier 0 // 16 - kernel multiplier 1 // 24 - kernel multiplier 2 // 32 - kernel multiplier 3 // 40 - accumulator 3: g, b // 48 - accumulator 3: a, r // 56 - FIX14_HALF // #define BUFFER_SIZE 16 INT buffer[BUFFER_SIZE + 1]; INT *buffer_64bitAligned = (INT *) ((((UINT_PTR) buffer) + 4) & ~0x7); buffer_64bitAligned[0] = 0; // zero buffer_64bitAligned[1] = 0; buffer_64bitAligned[2] = (k[7] << 16) | (k[6] & 0xFFFF); // kernel multiplier 0 buffer_64bitAligned[3] = buffer_64bitAligned[2]; buffer_64bitAligned[4] = (k[5] << 16) | (k[4] & 0xFFFF); // kernel multiplier 1 buffer_64bitAligned[5] = buffer_64bitAligned[4]; buffer_64bitAligned[6] = (k[3] << 16) | (k[2] & 0xFFFF); // kernel multiplier 2 buffer_64bitAligned[7] = buffer_64bitAligned[6]; buffer_64bitAligned[8] = (k[1] << 16) | (k[0] & 0xFFFF); // kernel multiplier 3 buffer_64bitAligned[9] = buffer_64bitAligned[8]; buffer_64bitAligned[10] = 0; // Accumulator 3 buffer_64bitAligned[11] = 0; buffer_64bitAligned[12] = 0; buffer_64bitAligned[13] = 0; buffer_64bitAligned[14] = (1 << (14 - 1)); // FIX14_HALF buffer_64bitAligned[15] = (1 << (14 - 1)); // // eax - counter for the first loop // ebx - 0xffffffff // esi - source // edi - destination // ecx - counter // edx - 64it aligned workspace buffer // // mm6, mm7: accumulator 0 // mm4, mm5: accumulator 1 // _asm { mov ebx, 0xFFFFFFFF mov esi, src mov edi, dst mov ecx, dw mov edx, buffer_64bitAligned // // The first loop loads the initial values into the accumulators, but // doesn't write out any pixels. It executes exactly three times. // pxor mm4, mm4 pxor mm5, mm5 mov eax, 3 loop1: // // Read expanded pixel values into mm0 and mm1 // movd mm1, [esi] ; mm1 = 00000000a1r1g1b1 movd mm2, [esi + 4] ; mm2 = 00000000a2r2g2b2 add esi, 8 punpcklbw mm1, [edx] ; mm1 = 00a100r100g100b1 punpcklbw mm2, [edx] ; mm2 = 00a200r200g200b2 movq mm0, mm1 ; mm0 = 00a100r100g100b1 punpckhwd mm1, mm2 ; mm1 = 00a200a100r200r1 punpcklwd mm0, mm2 ; mm0 = 00g200g100b200b1 // // Add the contribution to accumulator 1 // movq mm6, [edx + 16] ; kernel multiplier 1 movq mm7, mm6 ; kernel multiplier 1 pmaddwd mm6, mm0 pmaddwd mm7, mm1 paddd mm6, mm4 paddd mm7, mm5 // // Add the contribution to accumulator 2 // movq mm4, [edx + 24] ; kernel multiplier 2 movq mm5, mm4 ; kernel multiplier 2 pmaddwd mm4, mm0 pmaddwd mm5, mm1 paddd mm4, [edx + 40] paddd mm5, [edx + 48] // // Compute the new third accumulator // pmaddwd mm0, [edx + 32] ; multiply by kernel multiplier 3 pmaddwd mm1, [edx + 32] movq [edx + 40], mm0 movq [edx + 48], mm1 dec eax jnz loop1 // // The second loop continues to compute the accumulators, but // also writes out destination pixels. 
// loop2: // // Read expanded pixel values into mm0 and mm1 // movd mm1, [esi] ; mm1 = 00000000a1r1g1b1 movd mm2, [esi + 4] ; mm2 = 00000000a2r2g2b2 add esi, 8 punpcklbw mm1, [edx] ; mm1 = 00a100r100g100b1 punpcklbw mm2, [edx] ; mm2 = 00a200r200g200b2 movq mm0, mm1 ; mm0 = 00a100r100g100b1 punpckhwd mm1, mm2 ; mm1 = 00a200a100r200r1 punpcklwd mm0, mm2 ; mm0 = 00g200g100b200b1 // // Add the contribution to accumulator 0 // movq mm2, [edx + 8] ; mm2 = kernel multiplier 0 movq mm3, mm2 ; mm3 = kernel multiplier 0 pmaddwd mm2, mm0 ; mm2 = 0000gggg0000bbbb pmaddwd mm3, mm1 ; mm3 = 0000aaaa0000rrrr paddd mm6, mm2 ; add contributions to accumulator 0 paddd mm7, mm3 // // Extract the pixel value from accumulator 0. // paddd mm6, [edx + 56] ; round psrad mm6, KERNEL_SHIFT_AMOUNT paddd mm7, [edx + 56] psrad mm7, KERNEL_SHIFT_AMOUNT packssdw mm6, mm7 ; mm6 = 00aa00rr00gg00bb packuswb mm6, mm6 ; mm6 = 00000000aarrggbb // // Clip all channels to alpha // movd mm2, ebx ; mm2 = 00000000ffffffff movq mm7, mm6 ; mm7 = 00000000aarrggbb psrad mm7, 24 ; mm7 = 00000000000000aa punpcklbw mm7, mm7 ; mm7 = 000000000000aaaa punpcklbw mm7, mm7 ; mm7 = 00000000aaaaaaaa psubusb mm2, mm7 paddusb mm6, mm2 psubusb mm6, mm2 movd [edi], mm6 add edi, 4 // // Add the contribution to accumulator 1 // movq mm6, [edx + 16] ; kernel multiplier 1 movq mm7, mm6 ; kernel multiplier 1 pmaddwd mm6, mm0 pmaddwd mm7, mm1 paddd mm6, mm4 paddd mm7, mm5 // // Add the contribution to accumulator 2 // movq mm4, [edx + 24] ; kernel multiplier 2 movq mm5, mm4 ; kernel multiplier 2 pmaddwd mm4, mm0 pmaddwd mm5, mm1 paddd mm4, [edx + 40] paddd mm5, [edx + 48] // // Compute the new third accumulator // pmaddwd mm0, [edx + 32] ; multiply by kernel multiplier 3 pmaddwd mm1, [edx + 32] movq [edx + 40], mm0 movq [edx + 48], mm1 dec ecx jnz loop2 emms } #undef KERNEL_SHIFT_AMOUNT #endif // defined(_X86_) } /**************************************************************************\ * * Function Description: * * DpOutputSpanStretch::StretchScanline * Stretches a single scanline (magnification or minification) using * the reconstruction/interpolation mode specified by the template * parameter. Currently this is used for bilinear and bicubic filters. * * Arguments: * * ARGB *dst, // destination pointer * ARGB *src, // source pointer * INT dw, // destination width (pixels) * INT sw, // source width (pixels) * FIX16 kci, // initial position of the kernel center * FIX16 scale, // scale factor * FIX16 w, // width from center of the kernel to the edge * FIX16 a, // 1/w * * Notes: * The following description is based on the bilinear (tent) filter but it is equally applicable to the bicubic - though the pictures and description would be slightly more complicated. The code below is significantly complicated by the fact that we want the inner kernel loop to be quick and therefore not handle the wrap modes. In order to make this work, we first compute the number of pixels on the left and right of the scanline that need to consider the wrap mode. We process the left first and then run the optimized loop for all the inner pixels (which ignores the wrap conditions). After that we run the right edge. Bilinear filter convolution kernel: Note that each kernel has an intrinsic width - bilinear = 1 and bicubic = 2. This width is scaled by the inverse of the stretch factor - i.e. a shrink that results in 1/3 of the size being output requires a width (w) of 3 for the bilinear and 6 for the bicubic. Also the height of the filter kernel is scaled by the scale factor - i.e. 
the height of 1 (for all kernels) becomes 1/3 in the above example. --- | --- ^ --- . | . --- | --- . | . .--- h --- . . | . . --- | --- . . . | . . . --- | ---. . . . | . . . .--- v -----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------ | kb kc ke <------------w-----------> The filter kernel is shifted so that kc is exactly at the position of the required destination pixel transformed into the source pixel array by the scale factor. This will in general fall somewhere between two pixel samples - in the above picture, between pixels 4 and 5. The goal is to get a color value for the position at kc and emit that into the destination pixel stream. The standard evaluation method is to compute the height of the filter kernel at each of the pixel samples under the filter convolution corresponding to pixels 0, 1, ... 9. These heights are used to weight each pixel sample and the result is summed giving the destination pixel at kc. The problem with the standard evaluation is that at non-integer shrinks the mathematical evaluation of the kernel produces ripples in the output - i.e. a solid field of pixels responds with a sine-wave-like ripple output. This is a theoretical problem with the discrete evaluation of the kernel integral. Our evaluation actually stores a table of partial integrals from -inf to x. We use this table to compute the area around each pixel and the area is used as the weight. This evaluation is guaranteed to respond with exactly one for any position and scale factor of the kernel. This property gives a stable field response allowing us to have non-ripple shrinks. ---.: --- ---.....: --- --- :.....: --- --- :.....: --- --- :.....: --- --- :.....: --- -----0-----1-----2-----3-----4-----5-----6-----7-----8-----9------------ To evaluate this properly, we lookup the integral from -inf to 4.5 ( actually we rescale so that the center of the kernel is at 0 ) and then subtract the table lookup for the integral from -inf to 3.5. This gives us an exact (within the error of the table) computation for the area from 3.5 to 4.5. This is what we use for the weight of pixel 4. Note that contrary to the standard evaluation pixel 9 does contribute even though 9 is outside of the kernel. 8.5 is inside the kernel so the area under the kernel from 8.5 to 9.5 is a small triangular area and is not equal to zero. Not accounting for this is the major source of error in the standard evaluation. Note that the lookup for the end point integral for pixel 4 of -inf to 4.5 can be reused as the start point for the next pixel (5). An important property of this is that any error (e) in the lookup for -inf to 4.5 is added in pixel 4's contribution and subtracted in pixel 5's contribution which results in the total error for the filter response -- due to table discretization -- being completely subtracted away --- the end points have an error of exactly zero because we sample from beyond the left (area of exactly 0) to beyond the right (area of exactly 1). This is not precisely true because the error is scaled by the pixel values, but it does help. Note that this integral method is equivalent to convolving the input pixels (comb) with the box filter of width 1 pixel and then convolving the result with the filter kernel. [analysis due to Jim Blinn - see documentation in the Specs directory.] 
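    A small worked example at unit scale (w = 1): put the kernel center at
    kc = 4.3, so the bilinear support is [3.3, 5.3].  Writing I(x) for the
    partial integral of the kernel from -inf to x (the BilinearPartialIntegral
    table, addressed through LPIC in normalized form), the pixel weights are

        weight(3) = I(3.5 - 4.3) - I(2.5 - 4.3) = I(-0.8) - I(-1.8)
        weight(4) = I(4.5 - 4.3) - I(3.5 - 4.3) = I( 0.2) - I(-0.8)
        weight(5) = I(5.5 - 4.3) - I(4.5 - 4.3) = I( 1.2) - I( 0.2)

    The sum telescopes to I(1.2) - I(-1.8) = 1 - 0 = 1 exactly, because both
    end points lie outside the kernel support, and any table error in the
    shared interior lookups cancels.  This is the property that gives the
    ripple-free field response described above.
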
Further documentation is available in the specs directory: gdiplus\specs\filter\convolution.doc * Note: this function must not use floating point values, because it could be * called with an invalid floating point state (prior to the call to emms) * * History: * * 04/16/2000 asecchia created it * \**************************************************************************/ template void DpOutputSpanStretch::StretchScanline( ARGB *dst, // destination pointer ARGB *src, // source pointer INT dw, // destination width (pixels) INT sw, // source width (pixels) FIX16 kci, // initial position of the kernel center FIX16 scale, // scale factor FIX16 w, // width from center of the kernel to the edge FIX16 a // 1/w ) { // Note: this is a template class so the value of FilterMode // is defined at compile time. We're relying on the compiler // to perform dead code removal for each template instantiation // eliminating both the constant comparison and all the // code branches corresponding to other FilterMode values. // That way our inner loop is not impacted by extra code for // filter modes we're not using and extraneous conditional // statements. // Center of the filter kernel. // Shift over to the left by half because we want to center the area of // contribution for each sample on the sample - rather than taking the // area between two point samples as the contribution for the sample on // the right. FIX16 kc = kci - FIX16_HALF; // Left and right extent of the kernel, intra-kernel position, // and pixel contribution. INT kb, ke; INT kbt, ket; FIX16 kp, pc, pa, pa_old; // Loop variables INT x, k; // Incremental loop state, intermediate computation. ARGB *d = dst; FIX16 krn = 0; // Color channel accumulators. FIX16 ta, tr, tg, tb; // Compute the first pixel along the destination scanline that doesn't // have any wrap contribution and then the last pixel (l & r). // Note that all the terms have a FIX16_ONE factor which cancel out. // !!! Perf: [asecchia] This stuff is computed every scanline - // and it's always the same. We could pass these coordinates to // this routine and have them precomputed. INT lWrapX; INT rWrapX; if(scale>=0) { // x==sw is considered outside of the source. FIX16 fix_sw = (sw-1) << FIX16_SHIFT; // add (scale-1) and use idiv to get a Ceiling() lWrapX = (w-kc+(scale-1))/scale; // idiv should give us Floor(). rWrapX = (fix_sw-w-kc)/scale; } else { // x==sw is considered outside of the source. FIX16 fix_sw = (sw-1) << FIX16_SHIFT; // note: in the -x scale transform, the sense of lWrapX and rWrapX // can be confusing. The l&r apply to the destination left and right // and are swapped here when we compute the initial position from // the inverted left and right source points. // As we traverse the destination from left to right we'll encounter // lWrapX first and then rWrapX, but the kc (kernel center) will be // moving through the source from right to left decrementing by // scale each time. // use idiv to get a Floor() rWrapX = (w-kc)/scale; // add scale+1 and use idiv for Ceiling(). lWrapX = (fix_sw-w-kc+(scale+1))/scale; } // Now clamp to the range of the destination we're going to draw. lWrapX = max(0, lWrapX); rWrapX = min(dw, rWrapX); BYTE *kptr; INT k_wrap; // Do the left wrapmode pixels. 
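// For reference, each tap in the edge loops below is handled roughly as in
// the following sketch (illustration only, kept disabled).  The real loops
// avoid the per-tap range test by splitting the kernel span into wrap and
// texture sub-ranges (kbt..ket), and the non-transparent WrapModeClamp case
// - which contributes the clamp color instead of a texture sample - is
// omitted here.

#if 0
    for(k = kb; k <= ke; k++)
    {
        // Weight (area) for this tap.
        ComputeKernel(pc, a, pa, pa_old, krn);

        k_wrap = k;
        if((UINT)k_wrap >= (UINT)sw)
        {
            // The tap falls outside the source scanline.
            if(WrapZeroClamp)
            {
                continue;       // transparent black contributes nothing
            }
            Apply1DWrapModeX(QWrapMode, k_wrap, sw);
        }

        // Accumulate weight * channel into ta, tr, tg, tb.
        kptr = (BYTE *)(src + k_wrap);
        AccumulateChannels(pc, kptr);
    }
#endif
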
/* --- | --- ^ --- | --- | --- | --- h --- | --- | --- | --- | --- | --- v -----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------ kb kc <-----------w---------->ke kbt ket <----wrap----><---------texture------><----wrap-----> +ve transform -ve transform or straddle case or straddle case The following loop handles the case where the wrap happens on the left of the kernel. There are three subloops - first to handle the pixels in the wrap segment on the left, then to handle the pixels in the texture. Normally the texture pixels will extend to the right edge of the kernel and we'll be done, but two cases make the right wrap essential at this point. First if the transform is negative, the sense is flipped and the texture extends from the left edge to the middle point and the wrap extends the rest of the kernel to the right edge. Also if the texture is sufficiently small and the shrink factor sufficiently large, the filter kernel could overlap both the left and right edge of the texture and require wrapping on both sides. */ for(x=0; x= minCenterWidthMMX) && ((srcLast_2Stretch == srcLast) || (srcLast_2Stretch == (srcLast - 1)))) { ASSERT(srcFirst >= 0); ASSERT(srcLast_2Stretch < sw); // Stretch the middle pixels by a factor of two using optimized MMX // code. FIX16 kc_center = kc + FIX16_HALF; StretchMiddleScanline2_MMX(d, src + srcFirst, dstCenterWidth, kc_center - (GpFix16Floor(kc_center) * FIX16_ONE)); d += dstCenterWidth; kc += scale * dstCenterWidth; x += dstCenterWidth; } else { // This is the MMX version of the general purpose bicubic scaling // code. for(x=lWrapX; x=0); ASSERT(ke> 1; bcl_count &= 0x1; _asm { // eax - krn // ebx - kptr // esi - LPIC // edi - a // // mm5 - pold // mm6 - green ; blue // mm7 - alpha ; red mov eax, krn mov ebx, kptr mov esi, CPIC mov edi, a pxor mm5, mm5 movq mm6, FIX14_HALF_MMX movq mm7, mm6 pxor mm0, mm0 dec bcl_half_count jl bicubic_center_loop_last_pixel bicubic_center_loop: // Read the next two pixels into mm2 and mm1 movd mm2, [ebx] // mm2 = pixel1 movd mm1, [ebx + 4] // mm1 = pixel2 add ebx, 8 // Compute the kernel values for these two pixels mov edx, eax sar edx, 8 punpcklbw mm2, mm0 movd mm3, [esi + 4 * edx] // mm3 = p1 lea edx, [eax + edi] sar edx, 8 punpcklbw mm1, mm0 movd mm4, [esi + 4 * edx] // mm4 = p2 punpckldq mm5, mm3 // mm5 = p1 | pold lea eax, [eax + 2 * edi] punpckldq mm3, mm4 // mm3 = p2 | p1 psrad mm5, 2 psrad mm3, 2 psubd mm3, mm5 // mm3 = kernel2 | kernel1 movq mm5, mm4 // mm5 = pold packssdw mm3, mm3 // mm3 = kernel2 | kernel1 | kernel2 | kernel1 // At this point: // mm3 = kernel2 | kernel1 | kernel2 | kernel1 // mm2, mm1 contain pixel1 and pixel2 respectively movq mm4, mm2 punpcklwd mm2, mm1 pmaddwd mm2, mm3 punpckhwd mm4, mm1 paddd mm6, mm2 dec bcl_half_count pmaddwd mm4, mm3 paddd mm7, mm4 jge bicubic_center_loop bicubic_center_loop_last_pixel: dec bcl_count jl bicubic_center_loop_done // Read the last pixel into mm2 movd mm2, [ebx] punpcklbw mm2, mm0 // mm2 = a | r | g | b movq mm3, mm2 punpcklwd mm2, mm0 // mm2 = 0 | g | 0 | b punpckhwd mm3, mm0 // mm3 = 0 | a | 0 | r // Compute the kernel value for this pixel sar eax, 8 psrad mm5, 2 movd mm4, [esi + 4 * eax] // mm4 = p psrad mm4, 2 psubd mm4, mm5 packssdw mm4, mm4 pmaddwd mm2, mm4 pmaddwd mm3, mm4 paddd mm6, mm2 paddd mm7, mm3 bicubic_center_loop_done: // At this point, mm6 and mm7 contain the output channels // for the pixel. We need to clamp the alpha and store it // in the destination buffer. 
psrad mm6, 14 psrad mm7, 14 packssdw mm6, mm7 // mm6 = a | r | g | b packuswb mm6, mm6 // mm6 = 00000000aarrggbb movq mm7, mm6 // mm7 = 00000000aarrggbb psrad mm6, 24 // mm6 = xxxxxxxxxxxxxxaa mov eax, 0xFFFFFFFF punpcklbw mm6, mm6 // mm6 = xxxxxxxxxxxxaaaa movd mm2, eax punpcklbw mm6, mm6 // mm6 = xxxxxxxxaaaaaaaa psubusb mm2, mm6 mov eax, d paddusb mm7, mm2 psubusb mm7, mm2 movd [eax], mm7 add eax, 4 mov d, eax } kc += scale; } } } else #endif // defined(_X86_) /* --- | --- ^ --- | --- | --- | --- h --- | --- | --- | --- | --- | --- v -----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------ kb kc <-----------w---------->ke <-----------------------texture---------------------> The following loop is guaranteed to only hit texture for every pixel under the kernel. This is the majority of the pixels in most normal stretch cases. We can simplify this loop because of this assumption and therefore get a performance win. Many of the degenerate wrap cases will simply skip this loop. */ { // no MMX for(x=lWrapX; x=0); ASSERT(ke=0); ASSERT(tr>=0); ASSERT(tg>=0); ASSERT(tb>=0); // Combine the channels, set the destination pixel and increment // to the next pixel *d++ = (ta << 24) | (tr << 16) | (tg << 8) | tb; kc += scale; } } // Need to use max() here to handle the case where lWrapX > rWrapX // which can happen if the filter spans both edges of the scanline. // Do the right wrapmode pixels. /* --- | --- ^ --- | --- | --- | --- h --- | --- | --- | --- | --- | --- v -----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------ kb kc <-----------w---------->ke kbt ket <----wrap----><---------texture------><----wrap-----> -ve transform +ve tranform case only case only The following loop handles the case where the wrap happens on the right of the kernel. There are three subloops - first to handle the pixels in the wrap segment on the left - if any, then to handle the pixels in the texture. After that handle the pixels in the right wrap. Normally the texture pixels will extend to the left edge of the kernel and the first subloop will simply be skipped, but the left wrap is essential if the transform is negative --- the sense is flipped and the texture extends from the right edge to the middle point and the wrap extends the rest of the kernel to the left edge. Note it's not possible at this point to have wrapping at both edges of the kernel the wrap is on the left iff the transform is negative. The wrap is on the right iff the transform is positive. The case where both wrapmodes is present has already been taken care of in the first loop. */ for(x=max(x, rWrapX); x GpStatus DpOutputSpanStretch::OutputSpan( INT y, INT xMin, INT xMax // xMax is exclusive ) { ASSERT(isValid); // This function assumes that it's called with a correctly ordered span. ASSERT((xMax-xMin)>=0); INT width = xMax-xMin; // We can't have someone draw outside our specified destination. // If this assert fires, we don't have enough buffer space to store the // destination xscale so we'd overrun the buffer. The caller set us up // with an incorrect destination rectangle or got their rounding wrong. ASSERT(width <= xbuffer_width); INT left = xMin; INT right = xMax; // If there's nothing to do, simply return. if(right < left) { return Ok; } ASSERT(right >= left); // Make sure the caller clipped correctly - we can't handle // being called to draw outside out destination rectangle. 
ASSERT(y >= iytop); // Compute the kernel center for this y coordinate relative to the first // y coordinate (y coordinate corresponding to DstRect.Y) and offset // by the source rectangle. FIX16 kc; if(yscale < 0) { kc = ykci - (y - iytop) * (-yscale); } else { kc = ykci + (y - iytop) * yscale; } // Center of the filter kernel. // Shift over to the left by half because we want to center the area of // contribution for each sample on the sample - rather than taking the // area between two point samples as the contribution for the sample on // the right. kc -= FIX16_HALF; // Compute the start and end of the filter kernel coverage FIX16 kb = GpFix16Ceiling(kc-yw); FIX16 ke = GpFix16Ceiling(kc+yw); // Get the source pointer. ARGB *srcPtr0 = static_cast (BmpData.Scan0); INT stride = BmpData.Stride/sizeof(ARGB); ARGB *src; ARGB *dst; FIX16 pc, kp, pa, pa_old; FIX16 ta, tr, tg, tb; ARGB pix; INT k, x, kmod; FIX16 krn = 0; // if there was a last_k before this iteration // compute the new xbuffer_start_scanline if(last_k != LAST_K_UNUSED) { // If there is no overlap in the rotational buffer from the // last time, initialize the rotational buffer to the start. if(yscale < 0) { // Negative y scale. if(ke-last_k < 0) { xbuffer_start_scanline = 0; } else { xbuffer_start_scanline -= last_k-kb; if(xbuffer_start_scanline < 0) { xbuffer_start_scanline += xbuffer_height; } } } else { // Positive y scale. if(last_k-kb < 0) { xbuffer_start_scanline = 0; } else { // Figure out where to start in the xbuffer so that we // can reuse the already scaled scanlines. xbuffer_start_scanline -= (last_k-kb)+1; if(xbuffer_start_scanline < 0) { xbuffer_start_scanline += xbuffer_height; } } } } else { // this should be the first time we're hitting this // routine. xbuffer_start_scanline should be properly // initialized. ASSERT(xbuffer_start_scanline == 0); } // make sure we're going to access valid memory in the xbuffer. ASSERT(xbuffer_start_scanline >= 0); ASSERT(xbuffer_start_scanline < xbuffer_height); // !!! [asecchia] if we thought about it some, we could probably // import the code in StretchScanline into this loop // and merge this and the next loop significantly reducing the memory // requirements for the xbuffer. // The xbuffer_height should be == (ke-kb)+1 for all cases except when // the center (kc) is exactly on an integer in which case the first and // last entries under the kernel have a contribution of zero so it doesn't // matter if we drop one scanline in that case. // Start at the position we left off from the previous scanline. Use the // rotational buffer to remember the data from the previous scanline work. // HighQualityBicubic needs to initialize the krn value. // It is used to do the kernel table lookup. // HighQualityBilinear doesn't use this as it works out it's // kernel by direct computation. // Note: this is a template class so the value of FilterMode // is defined at compile time. We're relying on the compiler // to perform dead code removal for each template instantiation // eliminating both the constant comparison and all the // code branches corresponding to other FilterMode values. // That way our inner loop is not impacted by extra code for // filter modes we're not using and extraneous conditional // statements. krn = Int32x32Mod16(ya, (kb << FIX16_SHIFT) - kc); pa_old = 0; for(k=0; k= xbuffer_height) kmod -= xbuffer_height; // We avoid using a mod (%) computation above because we // know that the xbuffer_start_scanline is always within // the range 0..xbuffer_height-1. 
// ASSERT that this assumption is true. ASSERT(kmod < xbuffer_height); ASSERT(kmod >= 0); // Compute the kernel response for this pixel based on the // positive value of kp if(kb+k>ke) { // The buffer could be larger than the actual kernel, // in that case, simply set the extra coefficients to // zero. ycoeff[kmod] = 0; } else { ComputeKernel(ycoeff[kmod], ya, pa, pa_old, krn); } // Compute the position in the destination buffer to draw to. dst = xbuffer + xbuffer_width * kmod; // This assert fires if the arithmetic for computing the size of the // xbuffer or the iteration over the kernel support has a bug. The // xbuffer_height should be the maximum width of the kernel support. ASSERT(k < xbuffer_height); ASSERT(kmod < xbuffer_height); INT k_wrap = kb+k; // NTRAID#NTBUG9-370168-2001/04/18-asecchia // This is an unsigned/signed comparison. // NOTE: the (INT) cast is the invalid one. BmpData.Height is UINT // and is always positive - casting it to int is irrelevant. // However, the k_wrap is signed and _can_ be negative. The unsigned // cast is by design - it allows us to figure out both sides of the // wrap using one comparison. // The unsigned comparison >= Height tells us if k_wrap does not fall // within the range 0..Height-1 and therefore needs wrapping because // negative numbers cast to huge positive numbers and succeed the // comparison too. // NOTE also that this kind of comparison limits the effective range // of Height to (max unsigned)/2 with the single caveat of k_wrap being // equal to -MAXINT. // For code that's executed once per scanline, this kind of subtlety // is probably not warranted. if((UINT)(k_wrap) >= (INT)BmpData.Height) { // Handle the wrap mode here. if(WrapZeroClamp) { // GpMemset(dst, 0, (right-left)*sizeof(ARGB)); // If we're filling with zero, we may as well optimize the kernel // contribution. ycoeff[kmod] = 0; // done this scan - go on to the next continue; } else { if(QWrapMode == WrapModeClamp) { INT i = right-left; ARGB *d = dst; while(i--) { *d++ = ClampColor; } // done this scan - go on to the next continue; } else { // Apply the general wrap code. Apply1DWrapModeY(QWrapMode, k_wrap, (INT)BmpData.Height); src = srcPtr0 + stride*k_wrap; // Not done yet - fall through and call StretchScanline. } } } else { // If the x2 and x1 are out of order, we failed to correctly // compute the span in the above logic. // Seek to the start of the scanline. // Note: whatever X coordinate we add to the src pointer // we need to subtract from the width passed to the // StretchScanline routine below. src = srcPtr0 + stride*(k_wrap); } // Only x-scale if we haven't already done this scanline on a previous // call and stored the result in the xbuffer. if((last_k==LAST_K_UNUSED) || ( (yscale >= 0) && (last_k-(kb+k) < 0) || (yscale < 0) && (last_k-(kb+k) > 0) ) ) { // Filter in the x-dimension. StretchScanline( dst, src, xbuffer_width, static_cast(BmpData.Width), xkci, xscale, xw, xa ); } } // set up the k_last for the next iteration. This represents the last // scanline for which we actually have x-scaled data. if(yscale < 0) { last_k = kb; } else { last_k = kb + xbuffer_height - 1; } // Get the final destination buffer ARGB *buffer = Scan->NextBuffer(left, y, width); // Now we have the entire buffer full with the x-dimension scaled data. // for every x coordinate, apply the y kernel. 
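// Conceptually, each destination pixel of this span is produced by the scalar
// sketch below (illustration only, kept disabled).  The shipping code that
// follows does the same work, but indexes the rotational xbuffer layout and
// uses the MMX path when it is available; the sketch also omits the
// rotational start offset and the left clip offset when addressing xbuffer.

#if 0
    for(x = 0; x < width; x++)
    {
        FIX16 sa = 0, sr = 0, sg = 0, sb = 0;

        for(k = 0; k < xbuffer_height; k++)
        {
            // Pixel x of the k-th x-filtered scanline, weighted by the
            // y-kernel coefficient computed above.
            BYTE *p = (BYTE *)(xbuffer + xbuffer_width * k + x);

            sb += ycoeff[k] * p[0];
            sg += ycoeff[k] * p[1];
            sr += ycoeff[k] * p[2];
            sa += ycoeff[k] * p[3];
        }

        sa = GpFix16Round(sa);
        sr = GpFix16Round(sr);
        sg = GpFix16Round(sg);
        sb = GpFix16Round(sb);

        // Clamp alpha to [0, 255] and the color channels to [0, alpha]
        // (the buffer holds premultiplied ARGB).
        if(sa > 255) { sa = 255; }   if(sa < 0) { sa = 0; }
        if(sr > sa)  { sr = sa;  }   if(sr < 0) { sr = 0; }
        if(sg > sa)  { sg = sa;  }   if(sg < 0) { sg = 0; }
        if(sb > sa)  { sb = sa;  }   if(sb < 0) { sb = 0; }

        buffer[x] = (sa << 24) | (sr << 16) | (sg << 8) | sb;
    }
#endif
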
#ifdef _X86_ if (OSInfo::HasMMX) { // MMX INT *ycoeffMMX = (INT *) ((((UINT_PTR) ycoeff) + 4) & ~0x7); INT n = (xbuffer_height + 1) >> 1; // Transform the kernel coeffecient array into a form that is // easily usable by MMX code. The loop must go backward so that // we don't erase kernel coefficients (MMX starting point could // be 4 bytes ahead of integer starting point). // ycoeff must be large enough to hold the MMX coefficients (2 extra // entries) for (INT i = n-1; i >= 0; i--) { INT kernel1 = ycoeff[i * 2] >> 2; INT kernel2 = ycoeff[i * 2 + 1] >> 2; INT kernelMMX = (kernel1 & 0xFFFF) | (kernel2 << 16); ycoeffMMX[i * 2] = kernelMMX; ycoeffMMX[i * 2 + 1] = kernelMMX; } for(x=0; x> 1; bos_count &= 0x1; _asm { // eax - kptr // ebx - kptr_inc // ecx - counter // esi - ycoeff current pointer pxor mm0, mm0 movq mm6, FIX14_HALF_MMX movq mm7, mm6 mov eax, kptr mov ebx, kptr_inc_MMX mov ecx, bos_half_count mov esi, ycoeffMMX dec ecx jl bicubic_output_span_loop_last_pixel bicubic_output_span_loop: movd mm2, [eax] // mm2 = 00000000a1r1b1g1 movd mm4, [eax + ebx] punpcklbw mm2, mm0 // mm2 = 00a100r100g100b1 movq mm1, [esi] // mm1 = kernel2 | kernel1 | kernel2 | kernel1 punpcklbw mm4, mm0 // mm4 = 00a200r200g200b2 movq mm3, mm2 // mm3 = 00a100r100g100b1 punpcklwd mm2, mm4 // mm2 = 00g200g100b200b1 add esi, 8 pmaddwd mm2, mm1 punpckhwd mm3, mm4 // mm3 = 00a200a100r200r1 paddd mm6, mm2 dec ecx pmaddwd mm3, mm1 lea eax, [eax + 2 * ebx] // does not affect flags paddd mm7, mm3 jge bicubic_output_span_loop bicubic_output_span_loop_last_pixel: dec bos_count jl bicubic_output_span_loop_done movd mm2, [eax] // mm2 = 00000000aarrggbb punpcklbw mm2, mm0 // mm2 = 00aa00rr00gg00bb movq mm3, mm2 punpcklwd mm2, mm0 // mm2 = 000000gg000000bb movq mm1, [esi] // mm1 = xxxx | kernel1 | xxxx |kernel1 punpckhwd mm3, mm0 // mm3 = 000000aa000000bb pmaddwd mm2, mm1 pmaddwd mm3, mm1 paddd mm6, mm2 paddd mm7, mm3 bicubic_output_span_loop_done: // At this point, mm6 and mm7 contain the output channels // for the pixel. We need to clamp the alpha and store it // in the destination buffer. psrad mm6, 14 psrad mm7, 14 packssdw mm6, mm7 // mm6 = a | r | g | b packuswb mm6, mm6 // mm6 = 00000000aarrggbb movq mm7, mm6 // mm7 = 00000000aarrggbb psrad mm6, 24 // mm6 = xxxxxxxxxxxxxxaa mov eax, 0xFFFFFFFF punpcklbw mm6, mm6 // mm6 = xxxxxxxxxxxxaaaa movd mm2, eax punpcklbw mm6, mm6 // mm6 = xxxxxxxxaaaaaaaa psubusb mm2, mm6 mov eax, buffer paddusb mm7, mm2 psubusb mm7, mm2 movd [eax], mm7 add eax, 4 mov buffer, eax } } } else #endif // _X86_ { // No MMX for(x=0; x