/************************************************************************** * * Copyright (c) 2000 Microsoft Corporation * * Module Name & Abstract * * Stretch. This module contains the code to do various stretching * by applying a kernel filter. The code correctly handles minification. * * Note: * This module is not compiled into an .obj file, rather it is included * directly into the header file stretch.hpp. * This is due to the use of template functions. * * * Notes: * * This code does not handle rotation or shear. * * Created: * * 04/17/2000 asecchia * Created it. * **************************************************************************/ #define LAST_K_UNUSED ((INT)0x7fffffff) const INT BicubicKernelShift = 7; const INT BicubicKernelSize = 1 << BicubicKernelShift; const FIX16 BicubicKernel[BicubicKernelSize+1] = { 65536, 65496, 65379, 65186, 64920, 64583, 64177, 63705, 63168, 62569, 61911, 61195, 60424, 59600, 58725, 57802, 56832, 55818, 54763, 53668, 52536, 51369, 50169, 48939, 47680, 46395, 45087, 43757, 42408, 41042, 39661, 38268, 36864, 35452, 34035, 32614, 31192, 29771, 28353, 26941, 25536, 24141, 22759, 21391, 20040, 18708, 17397, 16110, 14848, 13614, 12411, 11240, 10104, 9005, 7945, 6927, 5952, 5023, 4143, 3313, 2536, 1814, 1149, 544, 0, -496, -961, -1395, -1800, -2176, -2523, -2843, -3136, -3403, -3645, -3862, -4056, -4227, -4375, -4502, -4608, -4694, -4761, -4809, -4840, -4854, -4851, -4833, -4800, -4753, -4693, -4620, -4536, -4441, -4335, -4220, -4096, -3964, -3825, -3679, -3528, -3372, -3211, -3047, -2880, -2711, -2541, -2370, -2200, -2031, -1863, -1698, -1536, -1378, -1225, -1077, -936, -802, -675, -557, -448, -349, -261, -184, -120, -69, -31, -8, 0 }; const FIX16 SymmetricBicubicKernel[BicubicKernelSize * 2 + 1] = { 0, -8, -31, -69, -120, -184, -261,-349, -448, -557, -675, -802, -936, -1077, -1225, -1378, -1536, -1698, -1863, -2031, -2200, -2370, -2541, -2711, -2880, -3047, -3211, -3372, -3528, -3679, -3825, -3964, -4096, -4220, -4335, -4441, -4536, -4620, -4693, -4753, -4800, -4833, -4851, -4854, -4840, -4809, -4761, -4694, -4608, -4502, -4375, -4227, -4056, -3862, -3645, -3403, -3136, -2843, -2523, -2176, -1800, -1395, -961, -496, 0, 544, 1149, 1814, 2536, 3313, 4143, 5023, 5952, 6927, 7945,9005, 10104, 11240, 12411, 13614, 14848, 16110, 17397, 18708, 20040, 21391, 22759, 24141, 25536, 26941, 28353, 29771, 31192, 32614, 34035, 35452, 36864, 38268, 39661, 41042, 42408, 43757, 45087, 46395, 47680, 48939, 50169, 51369, 52536, 53668, 54763, 55818, 56832, 57802, 58725, 59600, 60424, 61195, 61911, 62569, 63168, 63705, 64177, 64583, 64920, 65186, 65379, 65496, 65536, 65496, 65379, 65186, 64920, 64583, 64177, 63705, 63168, 62569, 61911, 61195, 60424, 59600, 58725, 57802, 56832, 55818, 54763, 53668, 52536, 51369, 50169, 48939, 47680, 46395, 45087, 43757, 42408, 41042, 39661, 38268, 36864, 35452, 34035, 32614, 31192, 29771, 28353, 26941, 25536, 24141, 22759, 21391, 20040, 18708, 17397, 16110, 14848, 13614, 12411, 11240, 10104, 9005, 7945, 6927, 5952, 5023, 4143, 3313, 2536, 1814, 1149, 544, 0, -496, -961, -1395, -1800, -2176, -2523, -2843, -3136, -3403, -3645, -3862, -4056, -4227, -4375, -4502, -4608, -4694, -4761, -4809, -4840, -4854, -4851, -4833, -4800, -4753, -4693, -4620, -4536, -4441, -4335, -4220, -4096, -3964, -3825, -3679, -3528, -3372, -3211, -3047, -2880, -2711, -2541, -2370, -2200, -2031, -1863, -1698, -1536, -1378, -1225, -1077, -936, -802, -675, -557, -448, -349, -261, -184, -120, -69, -31, -8, 0 }; /* // Higher precision bicubic kernel - more data. 
// Commented out in case we eventually need it. const FIX16 BK[512+1] = { 0, -2, -8, -18, -31, -48, -69, -93, -120, -151, -184, -221, -261, -304, -349, -397, -448, -501, -557, -615, -675, -737, -802, -868, -936, -1006, -1077, -1150, -1225, -1301, -1378, -1457, -1536, -1616, -1698, -1780, -1863, -1947, -2031, -2115, -2200, -2285, -2370, -2456, -2541, -2626, -2711, -2796, -2880, -2964, -3047, -3129, -3211, -3292, -3372, -3450, -3528, -3604, -3679, -3753, -3825, -3895, -3964, -4031, -4096, -4159, -4220, -4279, -4335, -4389, -4441, -4490, -4536, -4580, -4620, -4658, -4693, -4725, -4753, -4778, -4800, -4818, -4833, -4844, -4851, -4854, -4854, -4849, -4840, -4827, -4809, -4787, -4761, -4730, -4694, -4654, -4608, -4557, -4502, -4441, -4375, -4304, -4227, -4144, -4056, -3962, -3862, -3757, -3645, -3527, -3403, -3273, -3136, -2993, -2843, -2686, -2523, -2353, -2176, -1991, -1800, -1601, -1395, -1182, -961, -732, -496, -252, 0, 264, 544, 839, 1149, 1474, 1814, 2168, 2536, 2918, 3313, 3722, 4143, 4577, 5023, 5482, 5952, 6434, 6927, 7430, 7945, 8470, 9005, 9550, 10104, 10668, 11240, 11821, 12411, 13009, 13614, 14228, 14848, 15475, 16110, 16750, 17397, 18050, 18708, 19371, 20040, 20713, 21391, 22073, 22759, 23449, 24141, 24837, 25536, 26237, 26941, 27646, 28353, 29061, 29771, 30481, 31192, 31903, 32614, 33325, 34035, 34744, 35452, 36159, 36864, 37567, 38268, 38966, 39661, 40353, 41042, 41727, 42408, 43085, 43757, 44425, 45087, 45744, 46395, 47041, 47680, 48313, 48939, 49557, 50169, 50773, 51369, 51957, 52536, 53107, 53668, 54220, 54763, 55296, 55818, 56331, 56832, 57322, 57802, 58269, 58725, 59169, 59600, 60018, 60424, 60816, 61195, 61560, 61911, 62248, 62569, 62876, 63168, 63444, 63705, 63949, 64177, 64388, 64583, 64760, 64920, 65062, 65186, 65292, 65379, 65447, 65496, 65526, 65536, 65526, 65496, 65447, 65379, 65292, 65186, 65062, 64920, 64760, 64583, 64388, 64177, 63949, 63705, 63444, 63168, 62876, 62569, 62248, 61911, 61560, 61195, 60816, 60424, 60018, 59600, 59169, 58725, 58269, 57802, 57322, 56832, 56331, 55818, 55296, 54763, 54220, 53668, 53107, 52536, 51957, 51369, 50773, 50169, 49557, 48939, 48313, 47680, 47041, 46395, 45744, 45087, 44425, 43757, 43085, 42408, 41727, 41042, 40353, 39661, 38966, 38268, 37567, 36864, 36159, 35452, 34744, 34035, 33325, 32614, 31903, 31192, 30481, 29771, 29061, 28353, 27646, 26941, 26237, 25536, 24837, 24141, 23449, 22759, 22073, 21391, 20713, 20040, 19371, 18708, 18050, 17397, 16750, 16110, 15475, 14848, 14228, 13614, 13009, 12411, 11821, 11240, 10668, 10104, 9550, 9005, 8470, 7945, 7430, 6927, 6434, 5952, 5482, 5023, 4577, 4143, 3722, 3313, 2918, 2536, 2168, 1814, 1474, 1149, 839, 544, 264, 0, -252, -496, -732, -961, -1182, -1395, -1601, -1800, -1991, -2176, -2353, -2523, -2686, -2843, -2993, -3136, -3273, -3403, -3527, -3645, -3757, -3862, -3962, -4056, -4144, -4227, -4304, -4375, -4441, -4502, -4557, -4608, -4654, -4694, -4730, -4761, -4787, -4809, -4827, -4840, -4849, -4854, -4854, -4851, -4844, -4833, -4818, -4800, -4778, -4753, -4725, -4693, -4658, -4620, -4580, -4536, -4490, -4441, -4389, -4335, -4279, -4220, -4159, -4096, -4031, -3964, -3895, -3825, -3753, -3679, -3604, -3528, -3450, -3372, -3292, -3211, -3129, -3047, -2964, -2880, -2796, -2711, -2626, -2541, -2456, -2370, -2285, -2200, -2115, -2031, -1947, -1863, -1780, -1698, -1616, -1536, -1457, -1378, -1301, -1225, -1150, -1077, -1006, -936, -868, -802, -737, -675, -615, -557, -501, -448, -397, -349, -304, -261, -221, -184, -151, -120, -93, -69, -48, -31, -18, -8, -2, 0 }; // Bicubic kernel with the 
'perceptual' coefficient tweaked // see Wolberg. Provides a slightly different experience. // Commented out in case we eventually need it. const FIX16 BK_V[512+1] = { 0, -4, -16, -35, -62, -96, -137, -185, -240, -301, -369, -442, -522, -607, -698, -795, -896, -1002, -1114, -1230, -1350, -1475, -1603, -1736, -1872, -2012, -2155, -2301, -2450, -2602, -2756, -2913, -3072, -3233, -3396, -3560, -3726, -3893, -4061, -4230, -4400, -4570, -4741, -4911, -5082, -5252, -5422, -5592, -5760, -5927, -6094, -6259, -6422, -6584, -6743, -6901, -7056, -7209, -7359, -7506, -7650, -7791, -7928, -8062, -8192, -8318, -8440, -8557, -8670, -8778, -8881, -8979, -9072, -9159, -9241, -9316, -9386, -9449, -9506, -9557, -9600, -9636, -9666, -9688, -9702, -9709, -9707, -9698, -9680, -9654, -9619, -9575, -9522, -9460, -9388, -9307, -9216, -9115, -9004, -8882, -8750, -8607, -8453, -8288, -8112, -7924, -7725, -7513, -7290, -7054, -6806, -6546, -6272, -5985, -5686, -5373, -5046, -4706, -4351, -3983, -3600, -3203, -2791, -2364, -1922, -1465, -992, -504, 0, 516, 1040, 1571, 2110, 2656, 3209, 3769, 4336, 4909, 5489, 6074, 6666, 7263, 7866, 8475, 9088, 9706, 10330, 10958, 11590, 12227, 12867, 13512, 14160, 14812, 15467, 16125, 16786, 17450, 18116, 18785, 19456, 20129, 20804, 21480, 22158, 22837, 23517, 24198, 24880, 25562, 26245, 26927, 27610, 28292, 28974, 29656, 30336, 31015, 31694, 32371, 33046, 33720, 34391, 35061, 35728, 36393, 37055, 37714, 38370, 39023, 39672, 40318, 40960, 41598, 42232, 42861, 43486, 44106, 44721, 45331, 45936, 46535, 47129, 47716, 48298, 48873, 49442, 50005, 50560, 51108, 51650, 52184, 52710, 53229, 53739, 54242, 54736, 55222, 55699, 56167, 56626, 57076, 57516, 57947, 58368, 58779, 59180, 59570, 59950, 60319, 60677, 61024, 61360, 61684, 61997, 62297, 62586, 62862, 63126, 63378, 63616, 63841, 64054, 64253, 64438, 64610, 64767, 64911, 65040, 65155, 65255, 65340, 65410, 65465, 65504, 65528, 65536, 65528, 65504, 65465, 65410, 65340, 65255, 65155, 65040, 64911, 64767, 64610, 64438, 64253, 64054, 63841, 63616, 63378, 63126, 62862, 62586, 62297, 61997, 61684, 61360, 61024, 60677, 60319, 59950, 59570, 59180, 58779, 58368, 57947, 57516, 57076, 56626, 56167, 55699, 55222, 54736, 54242, 53739, 53229, 52710, 52184, 51650, 51108, 50560, 50005, 49442, 48873, 48298, 47716, 47129, 46535, 45936, 45331, 44721, 44106, 43486, 42861, 42232, 41598, 40960, 40318, 39672, 39023, 38370, 37714, 37055, 36393, 35728, 35061, 34391, 33720, 33046, 32371, 31694, 31015, 30336, 29656, 28974, 28292, 27610, 26927, 26245, 25562, 24880, 24198, 23517, 22837, 22158, 21480, 20804, 20129, 19456, 18785, 18116, 17450, 16786, 16125, 15467, 14812, 14160, 13512, 12867, 12227, 11590, 10958, 10330, 9706, 9088, 8475, 7866, 7263, 6666, 6074, 5489, 4909, 4336, 3769, 3209, 2656, 2110, 1571, 1040, 516, 0, -504, -992, -1465, -1922, -2364, -2791, -3203, -3600, -3983, -4351, -4706, -5046, -5373, -5686, -5985, -6272, -6546, -6806, -7054, -7290, -7513, -7725, -7924, -8112, -8288, -8453, -8607, -8750, -8882, -9004, -9115, -9216, -9307, -9388, -9460, -9522, -9575, -9619, -9654, -9680, -9698, -9707, -9709, -9702, -9688, -9666, -9636, -9600, -9557, -9506, -9449, -9386, -9316, -9241, -9159, -9072, -8979, -8881, -8778, -8670, -8557, -8440, -8318, -8192, -8062, -7928, -7791, -7650, -7506, -7359, -7209, -7056, -6901, -6743, -6584, -6422, -6259, -6094, -5927, -5760, -5592, -5422, -5252, -5082, -4911, -4741, -4570, -4400, -4230, -4061, -3893, -3726, -3560, -3396, -3233, -3072, -2913, -2756, -2602, -2450, -2301, -2155, -2012, -1872, -1736, -1603, -1475, -1350, -1230, 
-1114, -1002, -896, -795, -698, -607, -522, -442, -369, -301, -240, -185, -137, -96, -62, -35, -16, -4, 0 }; */ // This is the table of partial sums of the bilinear kernel. // Simply put, each point in the array represents the integral // from -infinity to position x in the kernel function. // We can subtract two table lookups to get the integral // of the kernel (area) between the two points. // The table is padded with zeros and ones at the beginning and end // so we can consistently address areas outside of the actual kernel // Currently we don't make use of the zeros at the beginning but // we definitely sample past the end by at least one half-width // of the kernel. const FIX16 BilinearPartialIntegral[512+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,2, 8, 18, 32, 50, 72, 98, 128, 162, 200, 242, 288, 338, 392, 450, 512, 578, 648, 722, 800, 882, 968, 1058, 1152, 1250, 1352, 1458, 1568, 1682, 1800, 1922, 2048, 2178, 2312, 2450, 2592, 2738, 2888, 3042, 3200, 3362, 3528, 3698, 3872, 4050, 4232, 4418, 4608, 4802, 5000, 5202, 5408, 5618, 5832, 6050, 6272, 6498, 6728, 6962, 7200, 7442, 7688, 7938, 8192, 8450, 8712, 8978, 9248, 9522, 9800, 10082, 10368, 10658, 10952, 11250, 11552, 11858, 12168, 12482, 12800, 13122, 13448, 13778, 14112, 14450, 14792, 15138, 15488, 15842, 16200, 16562, 16928, 17298, 17672, 18050, 18432, 18818, 19208, 19602, 20000, 20402, 20808, 21218, 21632, 22050, 22472, 22898, 23328, 23762, 24200, 24642, 25088, 25538, 25992, 26450, 26912, 27378, 27848, 28322, 28800, 29282, 29768, 30258, 30752, 31250, 31752, 32258, 32768, // center of the kernel. 
Index 256 33278, 33784, 34286, 34784, 35278, 35768, 36254, 36736, 37214, 37688, 38158, 38624, 39086, 39544, 39998, 40448, 40894, 41336, 41774, 42208, 42638, 43064, 43486, 43904, 44318, 44728, 45134, 45536, 45934, 46328, 46718, 47104, 47486, 47864, 48238, 48608, 48974, 49336, 49694, 50048, 50398, 50744, 51086, 51424, 51758, 52088, 52414, 52736, 53054, 53368, 53678, 53984, 54286, 54584, 54878, 55168, 55454, 55736, 56014, 56288, 56558, 56824, 57086, 57344, 57598, 57848, 58094, 58336, 58574, 58808, 59038, 59264, 59486, 59704, 59918, 60128, 60334, 60536, 60734, 60928, 61118, 61304, 61486, 61664, 61838, 62008, 62174, 62336, 62494, 62648, 62798, 62944, 63086, 63224, 63358, 63488, 63614, 63736, 63854, 63968, 64078, 64184, 64286, 64384, 64478, 64568, 64654, 64736, 64814, 64888, 64958, 65024, 65086, 65144, 65198, 65248, 65294, 65336, 65374, 65408, 65438, 65464, 65486, 65504, 65518, 65528, 65534, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, }; // This is the table of partial sums of the bicubic kernel. // Simply put, each point in the array represents the integral // from -infinity to position x in the kernel function. // We can subtract two table lookups to get the integral // of the kernel (area) between the two points. // The table is padded with zeros and ones at the beginning and end // so we can consistently address areas outside of the actual kernel // Currently we don't make use of the zeros at the beginning but // we definitely sample past the end by at least one half-width // of the kernel. 
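// For illustration only - the helper below is not part of this module and is
// kept disabled.  It shows how the partial-integral tables are used: the
// weight of a single source pixel is the difference of two table lookups.
// x0 and x1 are kernel-relative FIX16 positions in the same normalized form
// as the krn variable used by the ComputeKernel macro further down, and
// >> 8 is the same index mapping that macro uses (CPIC is the biased center
// pointer declared after these tables).

#if 0
inline FIX16
IllustrativeBicubicWeight(
    FIX16 x0,    // left edge of the pixel's interval, relative to the center
    FIX16 x1     // right edge of the pixel's interval (x0 < x1)
    )
{
    // Area under the bicubic kernel between x0 and x1.
    return CPIC[x1 >> 8] - CPIC[x0 >> 8];
}
#endif
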
const FIX16 BicubicPartialIntegral[1024+1] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -2, -3, -4, -6, -8, -11, -15, -19, -24, -29, -35, -42, -50, -59, -68, -79, -90, -103, -117, -131, -147, -164, -182, -201, -221, -243, -265, -289, -315, -341, -369, -398, -429, -460, -493, -528, -563, -600, -639, -679, -720, -762, -806, -851, -897, -945, -993, -1044, -1095, -1148, -1202, -1257, -1313, -1371, -1429, -1489, -1550, -1612, -1675, -1739, -1804, -1870, -1937, -2004, -2073, -2142, -2212, -2283, -2355, -2427, -2500, -2573, -2647, -2721, -2796, -2871, -2946, -3022, -3097, -3173, -3249, -3325, -3401, -3476, -3552, -3627, -3702, -3776, -3850, -3923, -3996, -4068, -4139, -4209, -4279, -4347, -4414, -4481, -4545, -4609, -4671, -4731, -4790, -4847, -4902, -4955, -5006, -5055, -5102, -5146, -5188, -5228, -5264, -5298, -5329, -5358, -5383, -5404, -5423, -5438, -5449, -5457, -5461, -5461, -5457, -5449, -5437, -5420, -5399, -5374, -5345, -5311, -5273, -5230, -5182, -5130, -5073, -5012, -4946, -4875, -4799, -4718, -4633, -4542, -4447, -4346, -4240, -4130, -4014, -3893, -3767, -3636, -3500, -3358, -3212, -3060, -2902, -2740, -2572, -2399, -2220, -2037, -1848, -1653, -1454, -1249, -1038, -822, -601, -375, -143, 94, 336, 584, 836, 1095, 1358, 1627, 1901, 2180, 2464, 2754, 3048, 3348, 3653, 3963, 4278, 4598, 4923, 5253, 5588, 5927, 6272, 6621, 6975, 7334, 7698, 8066, 8439, 8816, 9198, 9584, 9975, 10370, 10769, 11173, 11580, 11992, 12408, 12828, 13252, 13679, 14111, 14546, 14985, 15427, 15873, 16322, 16775, 17231, 17690, 18152, 18618, 19086, 19557, 20032, 20508, 20988, 21470, 21954, 22441, 22930, 23421, 23914, 24409, 24906, 25405, 25905, 26407, 26911, 27415, 27921, 28428, 28937, 29446, 29955, 30466, 30977, 31488, 32000, 32512, 33024, // center of the kernel. 
Index 512 33536, 34048, 34559, 35070, 35581, 36090, 36599, 37108, 37615, 38121, 38625, 39129, 39631, 40131, 40630, 41127, 41622, 42115, 42606, 43095, 43582, 44066, 44548, 45028, 45504, 45979, 46450, 46918, 47384, 47846, 48305, 48761, 49214, 49663, 50109, 50551, 50990, 51425, 51857, 52284, 52708, 53128, 53544, 53956, 54363, 54767, 55166, 55561, 55952, 56338, 56720, 57097, 57470, 57838, 58202, 58561, 58915, 59264, 59609, 59948, 60283, 60613, 60938, 61258, 61573, 61883, 62188, 62488, 62782, 63072, 63356, 63635, 63909, 64178, 64441, 64700, 64952, 65200, 65442, 65679, 65911, 66137, 66358, 66574, 66785, 66990, 67189, 67384, 67573, 67756, 67935, 68108, 68276, 68438, 68596, 68748, 68894, 69036, 69172, 69303, 69429, 69550, 69666, 69776, 69882, 69983, 70078, 70169, 70254, 70335, 70411, 70482, 70548, 70609, 70666, 70718, 70766, 70809, 70847, 70881, 70910, 70935, 70956, 70973, 70985, 70993, 70997, 70997, 70993, 70985, 70974, 70959, 70940, 70919, 70894, 70865, 70834, 70800, 70764, 70724, 70682, 70638, 70591, 70542, 70491, 70438, 70383, 70326, 70267, 70207, 70145, 70081, 70017, 69950, 69883, 69815, 69745, 69675, 69604, 69532, 69459, 69386, 69312, 69238, 69163, 69088, 69012, 68937, 68861, 68785, 68709, 68633, 68558, 68482, 68407, 68332, 68257, 68183, 68109, 68036, 67963, 67891, 67819, 67748, 67678, 67609, 67540, 67473, 67406, 67340, 67275, 67211, 67148, 67086, 67025, 66965, 66907, 66849, 66793, 66738, 66684, 66631, 66580, 66529, 66481, 66433, 66387, 66342, 66298, 66256, 66215, 66175, 66136, 66099, 66064, 66029, 65996, 65965, 65934, 65905, 65877, 65851, 65825, 65801, 65779, 65757, 65737, 65718, 65700, 65683, 65667, 65653, 65639, 65626, 65615, 65604, 65595, 65586, 65578, 65571, 65565, 65560, 65555, 65551, 65547, 65544, 65542, 65540, 65539, 65538, 65537, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 65536, 
65536, 65536, 65536, 65536, 65536, 65536, }; // We use a biased pointer to the center of the array // so that we can look up the negative part of the kernel // without repositioning the index or using an absolute value // computation in the inner loop. // Linear Partial Integral Center. const FIX16 *LPIC = &BilinearPartialIntegral[256]; // Cubic Partial Integral Center. const FIX16 *CPIC = &BicubicPartialIntegral[512]; const FIX16 *SymmetricBicubicKernelCenter = &SymmetricBicubicKernel[128]; const ULONGLONG FIX14_HALF_MMX = 0x0000200000002000; /************************************************************************** * * Function Description: * * Constructor for the DpOutputSpanStretch class. * * Return Value: * * NONE * * Created: * * 04/17/2000 asecchia * Created it. * **************************************************************************/ #define FIX4TOFIX16_SHIFT (FIX16_SHIFT - FIX4_SHIFT) template void DpOutputSpanStretch::InitializeClass( DpBitmap* bitmap, DpScanBuffer * scan, DpContext* /*context*/, DpImageAttributes imgAttributes, const GpRectF *dstRect, const GpRectF *srcRect ) { isValid = true; // Make sure these get initialized up front before we can early out // otherwise we could end up freeing uninitialized pointers in our // destructor. ycoeff = NULL; xbuffer = NULL; Scan = scan; dBitmap = bitmap; QWrapMode = imgAttributes.wrapMode; ClampColor = imgAttributes.clampColor; ClampColorA = (BYTE)( (ClampColor >> 24) ); ClampColorR = (BYTE)( (ClampColor >> 16) & 0xff); ClampColorG = (BYTE)( (ClampColor >> 8) & 0xff); ClampColorB = (BYTE)( ClampColor & 0xff); // Accleration for clamp mode with zero clamp color (transparent) WrapZeroClamp = FALSE; if((QWrapMode == WrapModeClamp) && (imgAttributes.clampColor == 0)) { WrapZeroClamp = TRUE; } ASSERT(dBitmap != NULL); ASSERT(dBitmap->IsValid()); // on bad bitmap, we return with Valid = FALSE if (dBitmap == NULL || !dBitmap->IsValid() ) { dBitmap = NULL; isValid = false; return; } else { BmpData.Width = dBitmap->Width; BmpData.Height = dBitmap->Height; BmpData.PixelFormat = PIXFMT_32BPP_PARGB; BmpData.Stride = dBitmap->Delta; BmpData.Scan0 = dBitmap->Bits; } if(srcRect) SrcRect = *srcRect; else { SrcRect.X = 0.0f; SrcRect.Y = 0.0f; SrcRect.Width = (REAL)dBitmap->Width; SrcRect.Height = (REAL) dBitmap->Height; } // Set up the translation. if(dstRect) { DstRect = *dstRect; } else { DstRect.X = 0.0f; DstRect.Y = 0.0f; DstRect.Width = (REAL)SrcRect.Width; DstRect.Height = (REAL)SrcRect.Height; } if( !GpValidFixed16(SrcRect.X) || !GpValidFixed16(SrcRect.Y) || !GpValidFixed16(SrcRect.Width) || !GpValidFixed16(SrcRect.Height) || !GpValidFixed16(DstRect.X) || !GpValidFixed16(DstRect.Y) || !GpValidFixed16(DstRect.Width) || !GpValidFixed16(DstRect.Height) ) { // punt isValid = false; return; } // Initialize the state for the x-dimension scale. xscale = GpRealToFix16(SrcRect.Width/DstRect.Width); xscaleinv = GpRealToFix16(DstRect.Width/SrcRect.Width); // Initialize the state for the y-dimension scale. yscale = GpRealToFix16(SrcRect.Height/DstRect.Height); yscaleinv = GpRealToFix16(DstRect.Height/SrcRect.Height); // Compute the destination contribution. // Note: the actual pixels touched are the floor of // the top left to the ceiling of the bottom right. // (modulus the clipping) // Note: We want to be tracking our internal state in FIX16 so we have // the extra fractional precision, but when we compute our bounds for the // drawing, we use Ceiling and Floor on these FIX16 numbers below. 
We want // the rounding to match the rounding of the FIX4 numbers (i.e. we don't // want to track any extra fractional precision errors from the float // representation) because we use FIX4 in our DrawImage loop. // To accomplish this, we round to FIX4 dropping all error that is smaller // than the FIX4 precision and then upconvert to FIX16. Now when we use // Fix16Ceiling and Floor, we'll get the same results as Fix4Ceiling and // Floor. REAL xinv = DstRect.Width/SrcRect.Width; REAL yinv = DstRect.Height/SrcRect.Height; fixDLeft = GpRealToFix4(DstRect.X); fixDRight = GpRealToFix4(xinv * (SrcRect.Width) + DstRect.X); fixDTop = GpRealToFix4(DstRect.Y); fixDBottom = GpRealToFix4(yinv * (SrcRect.Height) + DstRect.Y); // Handle negative scale FIX16 fixTemp; if(fixDLeft > fixDRight) { // Swap the left and right x coordinates. fixTemp = fixDLeft; fixDLeft = fixDRight; fixDRight = fixTemp; } if(fixDTop > fixDBottom) { // Swap the top and bottom x coordinates. fixTemp = fixDTop; fixDTop = fixDBottom; fixDBottom = fixTemp; } // Compute the left edge using the rasterizer rounding rules. Used // for clipping in x. ixleft = GpFix4Ceiling(fixDLeft); // Convert up to FIX16. fixDLeft <<= FIX4TOFIX16_SHIFT; fixDRight <<= FIX4TOFIX16_SHIFT; fixDTop <<= FIX4TOFIX16_SHIFT; fixDBottom <<= FIX4TOFIX16_SHIFT; // Get the initial kernel center. This specifies the x-dimension // fractional pixel offset. if(xscale < 0) { xkci = GpRealToFix16( (((DstRect.X+DstRect.Width) - GpFix16Ceiling(fixDRight)) * (xscale)) / FIX16_ONE + SrcRect.X ); } else { xkci = GpRealToFix16( ((DstRect.X - GpFix16Floor(fixDLeft)) * xscale) / FIX16_ONE + SrcRect.X ); } // Get the width of the kernel. // Make sure to multiply by the actual width of the filter kernel in // normalized space (FilterWidth[i]) xw = GpRealToFix16( (SrcRect.Width*FilterWidth[FilterMode]) / DstRect.Width ); // convert to FIX16 // Handle the negative transform if(xscale < 0) { xw = -xw; } // the width of the kernel must be a positive quantity. ASSERT(xw >= 0); // if the width is less than one we're doing a stretch, not a shrink. // in this case we clamp the kernel size to one. if(xw < FIX16_ONE * FilterWidth[FilterMode]) { xw = FIX16_ONE * FilterWidth[FilterMode]; } // a is 1/w - used to work out the tent filter. xa = GpRealToFix16(65536.0f/xw); // Get the initial kernel center. This specifies the y-dimension // fractional pixel offset. if(yscale < 0) { ykci = GpRealToFix16( ((GpFix16Ceiling(fixDBottom) - (DstRect.Y+DstRect.Height)) * (-yscale)) / FIX16_ONE + SrcRect.Y ); } else { ykci = GpRealToFix16( ((GpFix16Floor(fixDTop) - DstRect.Y) * yscale) / FIX16_ONE + SrcRect.Y ); } // Get the width of the kernel. // Make sure to multiply by the actual width of the filter kernel in // normalized space (FilterWidth[i]) yw = GpRealToFix16( (SrcRect.Height * FilterWidth[FilterMode]) / DstRect.Height ); // Convert to FIX16 // Handle the negative transform if(yscale < 0) { yw = -yw; } // the width of the kernel must be a positive quantity. ASSERT(yw >= 0); // if the kernel width is less than one we're doing a stretch, not // a shrink. In this case we clamp the kernel size to one. if(yw < (FIX16_ONE * FilterWidth[FilterMode])) { yw = FIX16_ONE * FilterWidth[FilterMode]; } // a is 1/w - used to work out the tent filter. ya = GpRealToFix16(65536.0f/yw); // !!! [asecchia] The rounding used here should match the rounding used to compute // the parameters to StretchBitsMainLoop. iytop = GpFix16Floor(fixDTop); // Compute the width of one scanline in the destination. 
xbuffer_width = GpFix16Ceiling(fixDRight) - GpFix16Floor(fixDLeft); ASSERT(xbuffer_width >= 0); xbuffer_height = GpFix16Ceiling(yw)*2+1; ASSERT(xbuffer_height >= 0); // set the rotational array to start at the first scanline. xbuffer_start_scanline = 0; // allocate the xbuffer. // !!! PERF [asecchia]. Ouch this is ugly. // we should at least try use a stack buffer for small images. // Maybe a lookaside list or something. xbuffer = (ARGB *)GpMalloc(xbuffer_height*xbuffer_width*sizeof(ARGB)); // ycoeff needs to have 2 entries more than xbuffer_height because // it may be reused to store the MMX coefficients (see OutputSpan // routine for details). ycoeff = (FIX16 *)GpMalloc((xbuffer_height + 2) * sizeof(FIX16)); if((NULL == ycoeff) || (NULL == xbuffer)) { isValid = false; GpFree(xbuffer); GpFree(ycoeff); // Make sure these get initialized to NULL before we can early out // otherwise we could end up double freeing the pointers in our // destructor. xbuffer = NULL; ycoeff = NULL; return; } // set the initial value of last_k to maxint last_k = LAST_K_UNUSED; } /**************************************************************************\ * * Function Description: * * This function performs a 1d stretch using the tent filter * * Arguments: * * dst - destination buffer * src - source pixels * dw - destination width in pixels * sw - source width in pixels * kci - the initial kernel centering position (for fractional translate) * scale - the scale of the filter - sw/dw * w - the width of the filter kernel - typically the ceiling of sw/dw * a - 1/w * * History: * 04/16/2000 asecchia created it. * \**************************************************************************/ // !!! Perf [asecchia] For really complicated wrapmodes where many of the // pixels are outside of the source and hence need to be wrapped, it may // make more sense to copy the source into an extended buffer and pre-wrap // the end points (i.e. overallocate) for each scanline. // This could simplify the code for the complex wrap conditions. // However, for the simple codepath, this would give an extra copy per // pixel and might not be worth it. // Ick. Why does the compiler do a better job of optimizing macros? // These should really be inline function calls. #define ClampColors() \ if(FilterMode == HighQualityBilinear) \ { \ ta = GpFix16Round(ta); \ tr = GpFix16Round(tr); \ tg = GpFix16Round(tg); \ tb = GpFix16Round(tb); \ if(ta>255) ta = 255; \ if(tr>255) tr = 255; \ if(tg>255) tg = 255; \ if(tb>255) tb = 255; \ } \ if(FilterMode == HighQualityBicubic) \ { \ ta = GpFix16Round(ta); \ tr = GpFix16Round(tr); \ tg = GpFix16Round(tg); \ tb = GpFix16Round(tb); \ if(ta>255) ta = 255; \ if(tr>ta) tr = ta; \ if(tg>ta) tg = ta; \ if(tb>ta) tb = ta; \ if(ta<0) ta = 0; \ if(tr<0) tr = 0; \ if(tg<0) tg = 0; \ if(tb<0) tb = 0; \ } // Compute the kernel in the inner loop // Note: the If statements are compiled away in the final code // because they are template variable comparisons which can be // done at compile time. // This macro looks up the new kernel value, subtracts the old one // to get the area of contribution for this pixel, computes the // new kernel position and stores the current table lookup. 
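// A rough C equivalent for the bicubic case (illustration only, kept
// disabled - the macro below is what the inner loops actually expand to):

#if 0
inline void
IllustrativeComputeKernelBicubic(
    FIX16 &pc,       // out: weight (area) contributed by this pixel
    FIX16 a,         // per-pixel step in kernel-normalized units (1/w)
    FIX16 &pa,       // current partial-integral lookup
    FIX16 &pa_old,   // previous partial-integral lookup
    FIX16 &krn       // current kernel-relative position
    )
{
    pa = CPIC[krn >> 8];    // integral from -inf to the current position
    pc = pa - pa_old;       // area between the previous and current position
    krn += a;               // advance to the next source pixel
    pa_old = pa;            // remember this lookup for the next iteration
}
#endif
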
#define ComputeKernel(pc, a, pa, pa_old, krn) \ if(FilterMode == HighQualityBilinear) \ { \ pa = LPIC[krn >> 9]; \ pc = pa-pa_old; \ krn += (a); \ pa_old = pa; \ } \ if(FilterMode == HighQualityBicubic) \ { \ pa = CPIC[krn >> 8]; \ pc = pa-pa_old; \ krn += (a); \ pa_old = pa; \ } // This block of code accumulates the individual channels from // kptr into the accumulation buffers tb, tg, tr, and ta. #define AccumulateChannels(pc, kptr) \ { \ tb += pc * kptr[0]; \ tg += pc * kptr[1]; \ tr += pc * kptr[2]; \ ta += pc * kptr[3]; \ } inline void Apply1DWrapModeX(INT WrapMode, INT &x, INT w) { INT xm; switch(WrapMode) { case WrapModeTileFlipY: case WrapModeTile: x = RemainderI(x, w); break; case WrapModeTileFlipX: case WrapModeTileFlipXY: xm = RemainderI(x, w); if(((x-xm)/w) & 1) { x = w-1-xm; } else { x = xm; } break; default: // Caller should correctly anticipate other wrap modes. ASSERT(FALSE); break; } } inline void Apply1DWrapModeY(INT WrapMode, INT &y, INT h) { INT ym; switch(WrapMode) { case WrapModeTile: case WrapModeTileFlipX: y = RemainderI(y, h); break; case WrapModeTileFlipY: case WrapModeTileFlipXY: ym = RemainderI(y, h); if(((y-ym)/h) & 1) { y = h-1-ym; } else { y = ym; } break; default: // Caller should correctly anticipate other wrap modes. ASSERT(FALSE); break; } } #undef RemainderI /************************************************************************** * * Function Description: * * Outputs the middle pixels in a 2:1 stretched scanline. Note that * this function doesn't need to handle wrap modes. * * Note: this function must not use floating point values, because it could be * called with an invalid floating point state (prior to the call to emms) * * Arguments: * * dst - The first pixel to be output * src - The first pixel in the source that will affect the destination * pixel in a bicubic 2:1 stretch * dw - The number of pixels in the destination * kci - The subpixel shift in the position of the destination pixels * **************************************************************************/ void DpOutputSpanStretch::StretchMiddleScanline2_MMX( ARGB *dst, ARGB *src, INT dw, FIX16 kci ) { ASSERT(FALSE); } void DpOutputSpanStretch::StretchMiddleScanline2_MMX( ARGB *dst, ARGB *src, INT dw, FIX16 kci ) { #if defined(_X86_) // // In order to store the kernel multipliers in 16bit registers, we // will lose the bottom 3 precision bits (hence each k[i] must be // right shifted by three). The summation of the kernel multipliers // should come to 16K, hence KERNEL_SHIFT_AMOUNT is 14. 
// #define KERNEL_SHIFT_AMOUNT 14 FIX16 k[8]; FIX16 kernelIncrement = FIX16_ONE >> 2 ; FIX16 kCurrent = (kci >> 2) - FIX16_ONE; for (INT i = 0; i < 8; i++) { ASSERT(kCurrent >= -FIX16_ONE); ASSERT(kCurrent <= FIX16_ONE); k[i] = SymmetricBicubicKernelCenter[kCurrent >> (FIX16_SHIFT-BicubicKernelShift)]; k[i] >>= 3; kCurrent += kernelIncrement; } // // Setup 64bit aligned workspace for the MMX code // // 0 - zero // 8 - kernel multiplier 0 // 16 - kernel multiplier 1 // 24 - kernel multiplier 2 // 32 - kernel multiplier 3 // 40 - accumulator 3: g, b // 48 - accumulator 3: a, r // 56 - FIX14_HALF // #define BUFFER_SIZE 16 INT buffer[BUFFER_SIZE + 1]; INT *buffer_64bitAligned = (INT *) ((((UINT_PTR) buffer) + 4) & ~0x7); buffer_64bitAligned[0] = 0; // zero buffer_64bitAligned[1] = 0; buffer_64bitAligned[2] = (k[7] << 16) | (k[6] & 0xFFFF); // kernel multiplier 0 buffer_64bitAligned[3] = buffer_64bitAligned[2]; buffer_64bitAligned[4] = (k[5] << 16) | (k[4] & 0xFFFF); // kernel multiplier 1 buffer_64bitAligned[5] = buffer_64bitAligned[4]; buffer_64bitAligned[6] = (k[3] << 16) | (k[2] & 0xFFFF); // kernel multiplier 2 buffer_64bitAligned[7] = buffer_64bitAligned[6]; buffer_64bitAligned[8] = (k[1] << 16) | (k[0] & 0xFFFF); // kernel multiplier 3 buffer_64bitAligned[9] = buffer_64bitAligned[8]; buffer_64bitAligned[10] = 0; // Accumulator 3 buffer_64bitAligned[11] = 0; buffer_64bitAligned[12] = 0; buffer_64bitAligned[13] = 0; buffer_64bitAligned[14] = (1 << (14 - 1)); // FIX14_HALF buffer_64bitAligned[15] = (1 << (14 - 1)); // // eax - counter for the first loop // ebx - 0xffffffff // esi - source // edi - destination // ecx - counter // edx - 64it aligned workspace buffer // // mm6, mm7: accumulator 0 // mm4, mm5: accumulator 1 // _asm { mov ebx, 0xFFFFFFFF mov esi, src mov edi, dst mov ecx, dw mov edx, buffer_64bitAligned // // The first loop loads the initial values into the accumulators, but // doesn't write out any pixels. It executes exactly three times. // pxor mm4, mm4 pxor mm5, mm5 mov eax, 3 loop1: // // Read expanded pixel values into mm0 and mm1 // movd mm1, [esi] ; mm1 = 00000000a1r1g1b1 movd mm2, [esi + 4] ; mm2 = 00000000a2r2g2b2 add esi, 8 punpcklbw mm1, [edx] ; mm1 = 00a100r100g100b1 punpcklbw mm2, [edx] ; mm2 = 00a200r200g200b2 movq mm0, mm1 ; mm0 = 00a100r100g100b1 punpckhwd mm1, mm2 ; mm1 = 00a200a100r200r1 punpcklwd mm0, mm2 ; mm0 = 00g200g100b200b1 // // Add the contribution to accumulator 1 // movq mm6, [edx + 16] ; kernel multiplier 1 movq mm7, mm6 ; kernel multiplier 1 pmaddwd mm6, mm0 pmaddwd mm7, mm1 paddd mm6, mm4 paddd mm7, mm5 // // Add the contribution to accumulator 2 // movq mm4, [edx + 24] ; kernel multiplier 2 movq mm5, mm4 ; kernel multiplier 2 pmaddwd mm4, mm0 pmaddwd mm5, mm1 paddd mm4, [edx + 40] paddd mm5, [edx + 48] // // Compute the new third accumulator // pmaddwd mm0, [edx + 32] ; multiply by kernel multiplier 3 pmaddwd mm1, [edx + 32] movq [edx + 40], mm0 movq [edx + 48], mm1 dec eax jnz loop1 // // The second loop continues to compute the accumulators, but // also writes out destination pixels. 
// loop2: // // Read expanded pixel values into mm0 and mm1 // movd mm1, [esi] ; mm1 = 00000000a1r1g1b1 movd mm2, [esi + 4] ; mm2 = 00000000a2r2g2b2 add esi, 8 punpcklbw mm1, [edx] ; mm1 = 00a100r100g100b1 punpcklbw mm2, [edx] ; mm2 = 00a200r200g200b2 movq mm0, mm1 ; mm0 = 00a100r100g100b1 punpckhwd mm1, mm2 ; mm1 = 00a200a100r200r1 punpcklwd mm0, mm2 ; mm0 = 00g200g100b200b1 // // Add the contribution to accumulator 0 // movq mm2, [edx + 8] ; mm2 = kernel multiplier 0 movq mm3, mm2 ; mm3 = kernel multiplier 0 pmaddwd mm2, mm0 ; mm2 = 0000gggg0000bbbb pmaddwd mm3, mm1 ; mm3 = 0000aaaa0000rrrr paddd mm6, mm2 ; add contributions to accumulator 0 paddd mm7, mm3 // // Extract the pixel value from accumulator 0. // paddd mm6, [edx + 56] ; round psrad mm6, KERNEL_SHIFT_AMOUNT paddd mm7, [edx + 56] psrad mm7, KERNEL_SHIFT_AMOUNT packssdw mm6, mm7 ; mm6 = 00aa00rr00gg00bb packuswb mm6, mm6 ; mm6 = 00000000aarrggbb // // Clip all channels to alpha // movd mm2, ebx ; mm2 = 00000000ffffffff movq mm7, mm6 ; mm7 = 00000000aarrggbb psrad mm7, 24 ; mm7 = 00000000000000aa punpcklbw mm7, mm7 ; mm7 = 000000000000aaaa punpcklbw mm7, mm7 ; mm7 = 00000000aaaaaaaa psubusb mm2, mm7 paddusb mm6, mm2 psubusb mm6, mm2 movd [edi], mm6 add edi, 4 // // Add the contribution to accumulator 1 // movq mm6, [edx + 16] ; kernel multiplier 1 movq mm7, mm6 ; kernel multiplier 1 pmaddwd mm6, mm0 pmaddwd mm7, mm1 paddd mm6, mm4 paddd mm7, mm5 // // Add the contribution to accumulator 2 // movq mm4, [edx + 24] ; kernel multiplier 2 movq mm5, mm4 ; kernel multiplier 2 pmaddwd mm4, mm0 pmaddwd mm5, mm1 paddd mm4, [edx + 40] paddd mm5, [edx + 48] // // Compute the new third accumulator // pmaddwd mm0, [edx + 32] ; multiply by kernel multiplier 3 pmaddwd mm1, [edx + 32] movq [edx + 40], mm0 movq [edx + 48], mm1 dec ecx jnz loop2 emms } #undef KERNEL_SHIFT_AMOUNT #endif // defined(_X86_) } /**************************************************************************\ * * Function Description: * * DpOutputSpanStretch::StretchScanline * Stretches a single scanline (magnification or minification) using * the reconstruction/interpolation mode specified by the template * parameter. Currently this is used for bilinear and bicubic filters. * * Arguments: * * ARGB *dst, // destination pointer * ARGB *src, // source pointer * INT dw, // destination width (pixels) * INT sw, // source width (pixels) * FIX16 kci, // initial position of the kernel center * FIX16 scale, // scale factor * FIX16 w, // width from center of the kernel to the edge * FIX16 a, // 1/w * * Notes: * The following description is based on the bilinear (tent) filter but it is equally applicable to the bicubic - though the pictures and description would be slightly more complicated. The code below is significantly complicated by the fact that we want the inner kernel loop to be quick and therefore not handle the wrap modes. In order to make this work, we first compute the number of pixels on the left and right of the scanline that need to consider the wrap mode. We process the left first and then run the optimized loop for all the inner pixels (which ignores the wrap conditions). After that we run the right edge. Bilinear filter convolution kernel: Note that each kernel has an intrinsic width - bilinear = 1 and bicubic = 2. This width is scaled by the inverse of the stretch factor - i.e. a shrink that results in 1/3 of the size being output requires a width (w) of 3 for the bilinear and 6 for the bicubic. Also the height of the filter kernel is scaled by the scale factor - i.e. 
the height of 1 (for all kernels) becomes 1/3 in the above example. --- | --- ^ --- . | . --- | --- . | . .--- h --- . . | . . --- | --- . . . | . . . --- | ---. . . . | . . . .--- v -----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------ | kb kc ke <------------w-----------> The filter kernel is shifted so that kc is exactly at the position of the required destination pixel transformed into the source pixel array by the scale factor. This will in general fall somewhere between two pixel samples - in the above picture, between pixels 4 and 5. The goal is to get a color value for the position at kc and emit that into the destination pixel stream. The standard evaluation method is to compute the height of the filter kernel at each of the pixel samples under the filter convolution corresponding to pixels 0, 1, ... 9. These heights are used to weight each pixel sample and the result is summed giving the destination pixel at kc. The problem with the standard evaluation is that at non-integer shrinks the mathematical evaluation of the kernel produces ripples in the output - i.e. a solid field of pixels responds with a sine-wave-like ripple output. This is a theoretical problem with the discrete evaluation of the kernel integral. Our evaluation actually stores a table of partial integrals from -inf to x. We use this table to compute the area around each pixel and the area is used as the weight. This evaluation is guaranteed to respond with exactly one for any position and scale factor of the kernel. This property gives a stable field response allowing us to have non-ripple shrinks. ---.: --- ---.....: --- --- :.....: --- --- :.....: --- --- :.....: --- --- :.....: --- -----0-----1-----2-----3-----4-----5-----6-----7-----8-----9------------ To evaluate this properly, we lookup the integral from -inf to 4.5 ( actually we rescale so that the center of the kernel is at 0 ) and then subtract the table lookup for the integral from -inf to 3.5. This gives us an exact (within the error of the table) computation for the area from 3.5 to 4.5. This is what we use for the weight of pixel 4. Note that contrary to the standard evaluation pixel 9 does contribute even though 9 is outside of the kernel. 8.5 is inside the kernel so the area under the kernel from 8.5 to 9.5 is a small triangular area and is not equal to zero. Not accounting for this is the major source of error in the standard evaluation. Note that the lookup for the end point integral for pixel 4 of -inf to 4.5 can be reused as the start point for the next pixel (5). An important property of this is that any error (e) in the lookup for -inf to 4.5 is added in pixel 4's contribution and subtracted in pixel 5's contribution which results in the total error for the filter response -- due to table discretization -- being completely subtracted away --- the end points have an error of exactly zero because we sample from beyond the left (area of exactly 0) to beyond the right (area of exactly 1). This is not precisely true because the error is scaled by the pixel values, but it does help. Note that this integral method is equivalent to convolving the input pixels (comb) with the box filter of width 1 pixel and then convolving the result with the filter kernel. [analysis due to Jim Blinn - see documentation in the Specs directory.] 
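    A small worked example at unit scale (w = 1): put the kernel center at
    kc = 4.3, so the bilinear support is [3.3, 5.3].  Writing I(x) for the
    partial integral of the kernel from -inf to x (the BilinearPartialIntegral
    table, addressed through LPIC in normalized form), the pixel weights are

        weight(3) = I(3.5 - 4.3) - I(2.5 - 4.3) = I(-0.8) - I(-1.8)
        weight(4) = I(4.5 - 4.3) - I(3.5 - 4.3) = I( 0.2) - I(-0.8)
        weight(5) = I(5.5 - 4.3) - I(4.5 - 4.3) = I( 1.2) - I( 0.2)

    The sum telescopes to I(1.2) - I(-1.8) = 1 - 0 = 1 exactly, because both
    end points lie outside the kernel support, and any table error in the
    shared interior lookups cancels.  This is the property that gives the
    ripple-free field response described above.
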
Further documentation is available in the specs directory: gdiplus\specs\filter\convolution.doc * Note: this function must not use floating point values, because it could be * called with an invalid floating point state (prior to the call to emms) * * History: * * 04/16/2000 asecchia created it * \**************************************************************************/ template void DpOutputSpanStretch::StretchScanline( ARGB *dst, // destination pointer ARGB *src, // source pointer INT dw, // destination width (pixels) INT sw, // source width (pixels) FIX16 kci, // initial position of the kernel center FIX16 scale, // scale factor FIX16 w, // width from center of the kernel to the edge FIX16 a // 1/w ) { // Note: this is a template class so the value of FilterMode // is defined at compile time. We're relying on the compiler // to perform dead code removal for each template instantiation // eliminating both the constant comparison and all the // code branches corresponding to other FilterMode values. // That way our inner loop is not impacted by extra code for // filter modes we're not using and extraneous conditional // statements. // Center of the filter kernel. // Shift over to the left by half because we want to center the area of // contribution for each sample on the sample - rather than taking the // area between two point samples as the contribution for the sample on // the right. FIX16 kc = kci - FIX16_HALF; // Left and right extent of the kernel, intra-kernel position, // and pixel contribution. INT kb, ke; INT kbt, ket; FIX16 kp, pc, pa, pa_old; // Loop variables INT x, k; // Incremental loop state, intermediate computation. ARGB *d = dst; FIX16 krn = 0; // Color channel accumulators. FIX16 ta, tr, tg, tb; // Compute the first pixel along the destination scanline that doesn't // have any wrap contribution and then the last pixel (l & r). // Note that all the terms have a FIX16_ONE factor which cancel out. // !!! Perf: [asecchia] This stuff is computed every scanline - // and it's always the same. We could pass these coordinates to // this routine and have them precomputed. INT lWrapX; INT rWrapX; if(scale>=0) { // x==sw is considered outside of the source. FIX16 fix_sw = (sw-1) << FIX16_SHIFT; // add (scale-1) and use idiv to get a Ceiling() lWrapX = (w-kc+(scale-1))/scale; // idiv should give us Floor(). rWrapX = (fix_sw-w-kc)/scale; } else { // x==sw is considered outside of the source. FIX16 fix_sw = (sw-1) << FIX16_SHIFT; // note: in the -x scale transform, the sense of lWrapX and rWrapX // can be confusing. The l&r apply to the destination left and right // and are swapped here when we compute the initial position from // the inverted left and right source points. // As we traverse the destination from left to right we'll encounter // lWrapX first and then rWrapX, but the kc (kernel center) will be // moving through the source from right to left decrementing by // scale each time. // use idiv to get a Floor() rWrapX = (w-kc)/scale; // add scale+1 and use idiv for Ceiling(). lWrapX = (fix_sw-w-kc+(scale+1))/scale; } // Now clamp to the range of the destination we're going to draw. lWrapX = max(0, lWrapX); rWrapX = min(dw, rWrapX); BYTE *kptr; INT k_wrap; // Do the left wrapmode pixels. 
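// For reference, each tap in the edge loops below is handled roughly as in
// the following sketch (illustration only, kept disabled).  The real loops
// avoid the per-tap range test by splitting the kernel span into wrap and
// texture sub-ranges (kbt..ket), and the non-transparent WrapModeClamp case
// - which contributes the clamp color instead of a texture sample - is
// omitted here.

#if 0
    for(k = kb; k <= ke; k++)
    {
        // Weight (area) for this tap.
        ComputeKernel(pc, a, pa, pa_old, krn);

        k_wrap = k;
        if((UINT)k_wrap >= (UINT)sw)
        {
            // The tap falls outside the source scanline.
            if(WrapZeroClamp)
            {
                continue;       // transparent black contributes nothing
            }
            Apply1DWrapModeX(QWrapMode, k_wrap, sw);
        }

        // Accumulate weight * channel into ta, tr, tg, tb.
        kptr = (BYTE *)(src + k_wrap);
        AccumulateChannels(pc, kptr);
    }
#endif
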
/* --- | --- ^ --- | --- | --- | --- h --- | --- | --- | --- | --- | --- v -----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------ kb kc <-----------w---------->ke kbt ket <----wrap----><---------texture------><----wrap-----> +ve transform -ve transform or straddle case or straddle case The following loop handles the case where the wrap happens on the left of the kernel. There are three subloops - first to handle the pixels in the wrap segment on the left, then to handle the pixels in the texture. Normally the texture pixels will extend to the right edge of the kernel and we'll be done, but two cases make the right wrap essential at this point. First if the transform is negative, the sense is flipped and the texture extends from the left edge to the middle point and the wrap extends the rest of the kernel to the right edge. Also if the texture is sufficiently small and the shrink factor sufficiently large, the filter kernel could overlap both the left and right edge of the texture and require wrapping on both sides. */ for(x=0; x= minCenterWidthMMX) && ((srcLast_2Stretch == srcLast) || (srcLast_2Stretch == (srcLast - 1)))) { ASSERT(srcFirst >= 0); ASSERT(srcLast_2Stretch < sw); // Stretch the middle pixels by a factor of two using optimized MMX // code. FIX16 kc_center = kc + FIX16_HALF; StretchMiddleScanline2_MMX(d, src + srcFirst, dstCenterWidth, kc_center - (GpFix16Floor(kc_center) * FIX16_ONE)); d += dstCenterWidth; kc += scale * dstCenterWidth; x += dstCenterWidth; } else { // This is the MMX version of the general purpose bicubic scaling // code. for(x=lWrapX; x=0); ASSERT(ke> 1; bcl_count &= 0x1; _asm { // eax - krn // ebx - kptr // esi - LPIC // edi - a // // mm5 - pold // mm6 - green ; blue // mm7 - alpha ; red mov eax, krn mov ebx, kptr mov esi, CPIC mov edi, a pxor mm5, mm5 movq mm6, FIX14_HALF_MMX movq mm7, mm6 pxor mm0, mm0 dec bcl_half_count jl bicubic_center_loop_last_pixel bicubic_center_loop: // Read the next two pixels into mm2 and mm1 movd mm2, [ebx] // mm2 = pixel1 movd mm1, [ebx + 4] // mm1 = pixel2 add ebx, 8 // Compute the kernel values for these two pixels mov edx, eax sar edx, 8 punpcklbw mm2, mm0 movd mm3, [esi + 4 * edx] // mm3 = p1 lea edx, [eax + edi] sar edx, 8 punpcklbw mm1, mm0 movd mm4, [esi + 4 * edx] // mm4 = p2 punpckldq mm5, mm3 // mm5 = p1 | pold lea eax, [eax + 2 * edi] punpckldq mm3, mm4 // mm3 = p2 | p1 psrad mm5, 2 psrad mm3, 2 psubd mm3, mm5 // mm3 = kernel2 | kernel1 movq mm5, mm4 // mm5 = pold packssdw mm3, mm3 // mm3 = kernel2 | kernel1 | kernel2 | kernel1 // At this point: // mm3 = kernel2 | kernel1 | kernel2 | kernel1 // mm2, mm1 contain pixel1 and pixel2 respectively movq mm4, mm2 punpcklwd mm2, mm1 pmaddwd mm2, mm3 punpckhwd mm4, mm1 paddd mm6, mm2 dec bcl_half_count pmaddwd mm4, mm3 paddd mm7, mm4 jge bicubic_center_loop bicubic_center_loop_last_pixel: dec bcl_count jl bicubic_center_loop_done // Read the last pixel into mm2 movd mm2, [ebx] punpcklbw mm2, mm0 // mm2 = a | r | g | b movq mm3, mm2 punpcklwd mm2, mm0 // mm2 = 0 | g | 0 | b punpckhwd mm3, mm0 // mm3 = 0 | a | 0 | r // Compute the kernel value for this pixel sar eax, 8 psrad mm5, 2 movd mm4, [esi + 4 * eax] // mm4 = p psrad mm4, 2 psubd mm4, mm5 packssdw mm4, mm4 pmaddwd mm2, mm4 pmaddwd mm3, mm4 paddd mm6, mm2 paddd mm7, mm3 bicubic_center_loop_done: // At this point, mm6 and mm7 contain the output channels // for the pixel. We need to clamp the alpha and store it // in the destination buffer. 
psrad mm6, 14 psrad mm7, 14 packssdw mm6, mm7 // mm6 = a | r | g | b packuswb mm6, mm6 // mm6 = 00000000aarrggbb movq mm7, mm6 // mm7 = 00000000aarrggbb psrad mm6, 24 // mm6 = xxxxxxxxxxxxxxaa mov eax, 0xFFFFFFFF punpcklbw mm6, mm6 // mm6 = xxxxxxxxxxxxaaaa movd mm2, eax punpcklbw mm6, mm6 // mm6 = xxxxxxxxaaaaaaaa psubusb mm2, mm6 mov eax, d paddusb mm7, mm2 psubusb mm7, mm2 movd [eax], mm7 add eax, 4 mov d, eax } kc += scale; } } } else #endif // defined(_X86_) /* --- | --- ^ --- | --- | --- | --- h --- | --- | --- | --- | --- | --- v -----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------ kb kc <-----------w---------->ke <-----------------------texture---------------------> The following loop is guaranteed to only hit texture for every pixel under the kernel. This is the majority of the pixels in most normal stretch cases. We can simplify this loop because of this assumption and therefore get a performance win. Many of the degenerate wrap cases will simply skip this loop. */ { // no MMX for(x=lWrapX; x=0); ASSERT(ke=0); ASSERT(tr>=0); ASSERT(tg>=0); ASSERT(tb>=0); // Combine the channels, set the destination pixel and increment // to the next pixel *d++ = (ta << 24) | (tr << 16) | (tg << 8) | tb; kc += scale; } } // Need to use max() here to handle the case where lWrapX > rWrapX // which can happen if the filter spans both edges of the scanline. // Do the right wrapmode pixels. /* --- | --- ^ --- | --- | --- | --- h --- | --- | --- | --- | --- | --- v -----0-----1-----2-----3-----4--|--5-----6-----7-----8-----9------------ kb kc <-----------w---------->ke kbt ket <----wrap----><---------texture------><----wrap-----> -ve transform +ve tranform case only case only The following loop handles the case where the wrap happens on the right of the kernel. There are three subloops - first to handle the pixels in the wrap segment on the left - if any, then to handle the pixels in the texture. After that handle the pixels in the right wrap. Normally the texture pixels will extend to the left edge of the kernel and the first subloop will simply be skipped, but the left wrap is essential if the transform is negative --- the sense is flipped and the texture extends from the right edge to the middle point and the wrap extends the rest of the kernel to the left edge. Note it's not possible at this point to have wrapping at both edges of the kernel the wrap is on the left iff the transform is negative. The wrap is on the right iff the transform is positive. The case where both wrapmodes is present has already been taken care of in the first loop. */ for(x=max(x, rWrapX); x GpStatus DpOutputSpanStretch::OutputSpan( INT y, INT xMin, INT xMax // xMax is exclusive ) { ASSERT(isValid); // This function assumes that it's called with a correctly ordered span. ASSERT((xMax-xMin)>=0); INT width = xMax-xMin; // We can't have someone draw outside our specified destination. // If this assert fires, we don't have enough buffer space to store the // destination xscale so we'd overrun the buffer. The caller set us up // with an incorrect destination rectangle or got their rounding wrong. ASSERT(width <= xbuffer_width); INT left = xMin; INT right = xMax; // If there's nothing to do, simply return. if(right < left) { return Ok; } ASSERT(right >= left); // Make sure the caller clipped correctly - we can't handle // being called to draw outside out destination rectangle. 
ASSERT(y >= iytop); // Compute the kernel center for this y coordinate relative to the first // y coordinate (y coordinate corresponding to DstRect.Y) and offset // by the source rectangle. FIX16 kc; if(yscale < 0) { kc = ykci - (y - iytop) * (-yscale); } else { kc = ykci + (y - iytop) * yscale; } // Center of the filter kernel. // Shift over to the left by half because we want to center the area of // contribution for each sample on the sample - rather than taking the // area between two point samples as the contribution for the sample on // the right. kc -= FIX16_HALF; // Compute the start and end of the filter kernel coverage FIX16 kb = GpFix16Ceiling(kc-yw); FIX16 ke = GpFix16Ceiling(kc+yw); // Get the source pointer. ARGB *srcPtr0 = static_cast (BmpData.Scan0); INT stride = BmpData.Stride/sizeof(ARGB); ARGB *src; ARGB *dst; FIX16 pc, kp, pa, pa_old; FIX16 ta, tr, tg, tb; ARGB pix; INT k, x, kmod; FIX16 krn = 0; // if there was a last_k before this iteration // compute the new xbuffer_start_scanline if(last_k != LAST_K_UNUSED) { // If there is no overlap in the rotational buffer from the // last time, initialize the rotational buffer to the start. if(yscale < 0) { // Negative y scale. if(ke-last_k < 0) { xbuffer_start_scanline = 0; } else { xbuffer_start_scanline -= last_k-kb; if(xbuffer_start_scanline < 0) { xbuffer_start_scanline += xbuffer_height; } } } else { // Positive y scale. if(last_k-kb < 0) { xbuffer_start_scanline = 0; } else { // Figure out where to start in the xbuffer so that we // can reuse the already scaled scanlines. xbuffer_start_scanline -= (last_k-kb)+1; if(xbuffer_start_scanline < 0) { xbuffer_start_scanline += xbuffer_height; } } } } else { // this should be the first time we're hitting this // routine. xbuffer_start_scanline should be properly // initialized. ASSERT(xbuffer_start_scanline == 0); } // make sure we're going to access valid memory in the xbuffer. ASSERT(xbuffer_start_scanline >= 0); ASSERT(xbuffer_start_scanline < xbuffer_height); // !!! [asecchia] if we thought about it some, we could probably // import the code in StretchScanline into this loop // and merge this and the next loop significantly reducing the memory // requirements for the xbuffer. // The xbuffer_height should be == (ke-kb)+1 for all cases except when // the center (kc) is exactly on an integer in which case the first and // last entries under the kernel have a contribution of zero so it doesn't // matter if we drop one scanline in that case. // Start at the position we left off from the previous scanline. Use the // rotational buffer to remember the data from the previous scanline work. // HighQualityBicubic needs to initialize the krn value. // It is used to do the kernel table lookup. // HighQualityBilinear doesn't use this as it works out it's // kernel by direct computation. // Note: this is a template class so the value of FilterMode // is defined at compile time. We're relying on the compiler // to perform dead code removal for each template instantiation // eliminating both the constant comparison and all the // code branches corresponding to other FilterMode values. // That way our inner loop is not impacted by extra code for // filter modes we're not using and extraneous conditional // statements. krn = Int32x32Mod16(ya, (kb << FIX16_SHIFT) - kc); pa_old = 0; for(k=0; k= xbuffer_height) kmod -= xbuffer_height; // We avoid using a mod (%) computation above because we // know that the xbuffer_start_scanline is always within // the range 0..xbuffer_height-1. 
// ASSERT that this assumption is true. ASSERT(kmod < xbuffer_height); ASSERT(kmod >= 0); // Compute the kernel response for this pixel based on the // positive value of kp if(kb+k>ke) { // The buffer could be larger than the actual kernel, // in that case, simply set the extra coefficients to // zero. ycoeff[kmod] = 0; } else { ComputeKernel(ycoeff[kmod], ya, pa, pa_old, krn); } // Compute the position in the destination buffer to draw to. dst = xbuffer + xbuffer_width * kmod; // This assert fires if the arithmetic for computing the size of the // xbuffer or the iteration over the kernel support has a bug. The // xbuffer_height should be the maximum width of the kernel support. ASSERT(k < xbuffer_height); ASSERT(kmod < xbuffer_height); INT k_wrap = kb+k; // NTRAID#NTBUG9-370168-2001/04/18-asecchia // This is an unsigned/signed comparison. // NOTE: the (INT) cast is the invalid one. BmpData.Height is UINT // and is always positive - casting it to int is irrelevant. // However, the k_wrap is signed and _can_ be negative. The unsigned // cast is by design - it allows us to figure out both sides of the // wrap using one comparison. // The unsigned comparison >= Height tells us if k_wrap does not fall // within the range 0..Height-1 and therefore needs wrapping because // negative numbers cast to huge positive numbers and succeed the // comparison too. // NOTE also that this kind of comparison limits the effective range // of Height to (max unsigned)/2 with the single caveat of k_wrap being // equal to -MAXINT. // For code that's executed once per scanline, this kind of subtlety // is probably not warranted. if((UINT)(k_wrap) >= (INT)BmpData.Height) { // Handle the wrap mode here. if(WrapZeroClamp) { // GpMemset(dst, 0, (right-left)*sizeof(ARGB)); // If we're filling with zero, we may as well optimize the kernel // contribution. ycoeff[kmod] = 0; // done this scan - go on to the next continue; } else { if(QWrapMode == WrapModeClamp) { INT i = right-left; ARGB *d = dst; while(i--) { *d++ = ClampColor; } // done this scan - go on to the next continue; } else { // Apply the general wrap code. Apply1DWrapModeY(QWrapMode, k_wrap, (INT)BmpData.Height); src = srcPtr0 + stride*k_wrap; // Not done yet - fall through and call StretchScanline. } } } else { // If the x2 and x1 are out of order, we failed to correctly // compute the span in the above logic. // Seek to the start of the scanline. // Note: whatever X coordinate we add to the src pointer // we need to subtract from the width passed to the // StretchScanline routine below. src = srcPtr0 + stride*(k_wrap); } // Only x-scale if we haven't already done this scanline on a previous // call and stored the result in the xbuffer. if((last_k==LAST_K_UNUSED) || ( (yscale >= 0) && (last_k-(kb+k) < 0) || (yscale < 0) && (last_k-(kb+k) > 0) ) ) { // Filter in the x-dimension. StretchScanline( dst, src, xbuffer_width, static_cast(BmpData.Width), xkci, xscale, xw, xa ); } } // set up the k_last for the next iteration. This represents the last // scanline for which we actually have x-scaled data. if(yscale < 0) { last_k = kb; } else { last_k = kb + xbuffer_height - 1; } // Get the final destination buffer ARGB *buffer = Scan->NextBuffer(left, y, width); // Now we have the entire buffer full with the x-dimension scaled data. // for every x coordinate, apply the y kernel. 
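// Conceptually, each destination pixel of this span is produced by the scalar
// sketch below (illustration only, kept disabled).  The shipping code that
// follows does the same work, but indexes the rotational xbuffer layout and
// uses the MMX path when it is available; the sketch also omits the
// rotational start offset and the left clip offset when addressing xbuffer.

#if 0
    for(x = 0; x < width; x++)
    {
        FIX16 sa = 0, sr = 0, sg = 0, sb = 0;

        for(k = 0; k < xbuffer_height; k++)
        {
            // Pixel x of the k-th x-filtered scanline, weighted by the
            // y-kernel coefficient computed above.
            BYTE *p = (BYTE *)(xbuffer + xbuffer_width * k + x);

            sb += ycoeff[k] * p[0];
            sg += ycoeff[k] * p[1];
            sr += ycoeff[k] * p[2];
            sa += ycoeff[k] * p[3];
        }

        sa = GpFix16Round(sa);
        sr = GpFix16Round(sr);
        sg = GpFix16Round(sg);
        sb = GpFix16Round(sb);

        // Clamp alpha to [0, 255] and the color channels to [0, alpha]
        // (the buffer holds premultiplied ARGB).
        if(sa > 255) { sa = 255; }   if(sa < 0) { sa = 0; }
        if(sr > sa)  { sr = sa;  }   if(sr < 0) { sr = 0; }
        if(sg > sa)  { sg = sa;  }   if(sg < 0) { sg = 0; }
        if(sb > sa)  { sb = sa;  }   if(sb < 0) { sb = 0; }

        buffer[x] = (sa << 24) | (sr << 16) | (sg << 8) | sb;
    }
#endif
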
#ifdef _X86_ if (OSInfo::HasMMX) { // MMX INT *ycoeffMMX = (INT *) ((((UINT_PTR) ycoeff) + 4) & ~0x7); INT n = (xbuffer_height + 1) >> 1; // Transform the kernel coeffecient array into a form that is // easily usable by MMX code. The loop must go backward so that // we don't erase kernel coefficients (MMX starting point could // be 4 bytes ahead of integer starting point). // ycoeff must be large enough to hold the MMX coefficients (2 extra // entries) for (INT i = n-1; i >= 0; i--) { INT kernel1 = ycoeff[i * 2] >> 2; INT kernel2 = ycoeff[i * 2 + 1] >> 2; INT kernelMMX = (kernel1 & 0xFFFF) | (kernel2 << 16); ycoeffMMX[i * 2] = kernelMMX; ycoeffMMX[i * 2 + 1] = kernelMMX; } for(x=0; x> 1; bos_count &= 0x1; _asm { // eax - kptr // ebx - kptr_inc // ecx - counter // esi - ycoeff current pointer pxor mm0, mm0 movq mm6, FIX14_HALF_MMX movq mm7, mm6 mov eax, kptr mov ebx, kptr_inc_MMX mov ecx, bos_half_count mov esi, ycoeffMMX dec ecx jl bicubic_output_span_loop_last_pixel bicubic_output_span_loop: movd mm2, [eax] // mm2 = 00000000a1r1b1g1 movd mm4, [eax + ebx] punpcklbw mm2, mm0 // mm2 = 00a100r100g100b1 movq mm1, [esi] // mm1 = kernel2 | kernel1 | kernel2 | kernel1 punpcklbw mm4, mm0 // mm4 = 00a200r200g200b2 movq mm3, mm2 // mm3 = 00a100r100g100b1 punpcklwd mm2, mm4 // mm2 = 00g200g100b200b1 add esi, 8 pmaddwd mm2, mm1 punpckhwd mm3, mm4 // mm3 = 00a200a100r200r1 paddd mm6, mm2 dec ecx pmaddwd mm3, mm1 lea eax, [eax + 2 * ebx] // does not affect flags paddd mm7, mm3 jge bicubic_output_span_loop bicubic_output_span_loop_last_pixel: dec bos_count jl bicubic_output_span_loop_done movd mm2, [eax] // mm2 = 00000000aarrggbb punpcklbw mm2, mm0 // mm2 = 00aa00rr00gg00bb movq mm3, mm2 punpcklwd mm2, mm0 // mm2 = 000000gg000000bb movq mm1, [esi] // mm1 = xxxx | kernel1 | xxxx |kernel1 punpckhwd mm3, mm0 // mm3 = 000000aa000000bb pmaddwd mm2, mm1 pmaddwd mm3, mm1 paddd mm6, mm2 paddd mm7, mm3 bicubic_output_span_loop_done: // At this point, mm6 and mm7 contain the output channels // for the pixel. We need to clamp the alpha and store it // in the destination buffer. psrad mm6, 14 psrad mm7, 14 packssdw mm6, mm7 // mm6 = a | r | g | b packuswb mm6, mm6 // mm6 = 00000000aarrggbb movq mm7, mm6 // mm7 = 00000000aarrggbb psrad mm6, 24 // mm6 = xxxxxxxxxxxxxxaa mov eax, 0xFFFFFFFF punpcklbw mm6, mm6 // mm6 = xxxxxxxxxxxxaaaa movd mm2, eax punpcklbw mm6, mm6 // mm6 = xxxxxxxxaaaaaaaa psubusb mm2, mm6 mov eax, buffer paddusb mm7, mm2 psubusb mm7, mm2 movd [eax], mm7 add eax, 4 mov buffer, eax } } } else #endif // _X86_ { // No MMX for(x=0; x