windows-server-2003/windows/advcore/gdiplus/engine/render/bicubic.cpp


								/**************************************************************************\

								*

								* Copyright (c) 1999  Microsoft Corporation

								*

								* Module Name:

								*

								*   bicubic.cpp

								*

								* Abstract:

								*

								*   Bicubic Resampling code

								*

								* Created:

								*

								*   11/03/1999 ASecchia

								\**************************************************************************/


								#include "precomp.hpp"


								/**************************************************************************\
								*
								* Function Description:
								*
								*   Captures the source bitmap description, the wrap/clamp attributes and
								*   the device<->source transforms needed to resample the bitmap.
								*
								* Arguments:
								*
								*   bitmap          - source bitmap; if NULL or not valid, dBitmap is set
								*                     to NULL and the constructor returns early.
								*   scan            - scan buffer stored in Scan for later output.
								*   context         - supplies WorldToDevice and the fallback DeviceToWorld.
								*   imageAttributes - supplies wrapMode, clampColor and srcRectClamp.
								*   numPoints       - 0, 1, 3 or 4 destination points; any other count
								*                     means no destination transform is inferred.
								*   dstPoints       - destination points, read when numPoints is 1, 3 or 4.
								*   srcRect         - source rectangle, or NULL for the whole bitmap.
								*
								\**************************************************************************/
								DpOutputBicubicImageSpan::DpOutputBicubicImageSpan(
								    DpBitmap* bitmap,
								    DpScanBuffer * scan,
								    DpContext* context,
								    DpImageAttributes imageAttributes,
								    INT numPoints,
								    const GpPointF *dstPoints,
								    const GpRectF *srcRect
								    )
								{
								    Scan         = scan;
								    BWrapMode    = imageAttributes.wrapMode;
								    ClampColor   = imageAttributes.clampColor;
								    SrcRectClamp = imageAttributes.srcRectClamp;
								    dBitmap      = bitmap;

								    ASSERT(dBitmap != NULL);
								    ASSERT(dBitmap->IsValid());

								    // On a bad bitmap, clear dBitmap and bail out (original note: "we
								    // return with Valid = FALSE"). None of the members below are
								    // initialized on this path.
								    if (dBitmap == NULL || !dBitmap->IsValid())
								    {
								        dBitmap = NULL;
								        return;
								    }

								    // Describe the source bits; the pixel format is recorded as
								    // premultiplied 32bpp ARGB.
								    BmpData.Width       = dBitmap->Width;
								    BmpData.Height      = dBitmap->Height;
								    BmpData.PixelFormat = PIXFMT_32BPP_PARGB;
								    BmpData.Stride      = dBitmap->Delta;
								    BmpData.Scan0       = dBitmap->Bits;

								    // Fallback inverse: this value is kept only if the combined matrix
								    // built at the end of the constructor is not invertible.
								    context->GetDeviceToWorld(&DeviceToWorld);

								    if(srcRect)
								    {
								        SrcRect = *srcRect;
								    }
								    else
								    {
								        // No source rectangle given: use the whole bitmap.
								        SrcRect.X = 0;
								        SrcRect.Y = 0;
								        SrcRect.Width  = (REAL) dBitmap->Width;
								        SrcRect.Height = (REAL) dBitmap->Height;
								    }

								    GpPointF points[4];

								    // xForm stays default-constructed when no transform is inferred
								    // (presumably the identity - confirm against GpMatrix).
								    GpMatrix xForm;
								    BOOL existsTransform = TRUE;

								    // Cases 1, 3 and 4 read from dstPoints. Without an array to read,
								    // route the request to the default (no inferred transform) case
								    // instead of dereferencing NULL.
								    const INT pointCount =
								        ((numPoints == 0) || (dstPoints != NULL)) ? numPoints : -1;

								    switch(pointCount)
								    {
								    case 0:
								        // No destination given: SrcRect-sized parallelogram at the origin.
								        points[0].X = 0;
								        points[0].Y = 0;
								        points[1].X = (REAL) SrcRect.Width;
								        points[1].Y = 0;
								        points[2].X = 0;
								        points[2].Y = (REAL) SrcRect.Height;
								        break;

								    case 1:
								        // Single point: SrcRect-sized parallelogram at that point.
								        points[0] = dstPoints[0];
								        points[1].X = (REAL) (points[0].X + SrcRect.Width);
								        points[1].Y = points[0].Y;
								        points[2].X = points[0].X;
								        points[2].Y = (REAL) (points[0].Y + SrcRect.Height);
								        break;

								    case 3:
								    case 4:
								        GpMemcpy(&points[0], dstPoints, pointCount*sizeof(GpPointF));
								        break;

								    default:
								        existsTransform = FALSE;
								        break;
								    }

								    if(existsTransform)
								    {
								        xForm.InferAffineMatrix(points, SrcRect);
								    }

								    // Combined transform: xForm prepended to the context's
								    // world-to-device matrix.
								    WorldToDevice = context->WorldToDevice;
								    WorldToDevice.Prepend(xForm);

								    // Only replace the fallback DeviceToWorld when an inverse exists.
								    if(WorldToDevice.IsInvertible())
								    {
								        DeviceToWorld = WorldToDevice;
								        DeviceToWorld.Invert();
								    }
								}


								namespace DpOutputBicubicImageSpanNS {

								// Lookup table for the bicubic convolution kernel used by Do1DBicubic.
								//
								// KernShift  - number of fractional position bits used to index the
								//              table; OutputSpanIncremental derives the index with
								//              (pos & FIX16_MASK) >> (FIX16_SHIFT-KernShift).
								// Oversample - number of table samples per source pixel (1 << KernShift).
								// kern       - 2*Oversample+1 samples of the kernel, entry i holding the
								//              weight at distance i/Oversample (0 .. 2 pixels), scaled
								//              so that 65536 represents 1.0. The weight is 65536 at
								//              distance 0 and 0 at distances 1 and 2. The values agree
								//              with the cubic convolution kernel for a = -0.5, e.g.
								//              kern[32] = 36864 (0.5625) and kern[96] = -4096 (-0.0625).
								const INT KernShift = 6;

								const INT Oversample = 1 << KernShift;

								const FIX16 kern[2*Oversample+1] =

								{

								    65536, 65496, 65379, 65186, 64920, 64583, 64177, 63705,

								    63168, 62569, 61911, 61195, 60424, 59600, 58725, 57802,

								    56832, 55818, 54763, 53668, 52536, 51369, 50169, 48939,

								    47680, 46395, 45087, 43757, 42408, 41042, 39661, 38268,

								    36864, 35452, 34035, 32614, 31192, 29771, 28353, 26941,

								    25536, 24141, 22759, 21391, 20040, 18708, 17397, 16110,

								    14848, 13614, 12411, 11240, 10104,  9005,  7945,  6927,

								     5952,  5023,  4143,  3313, 2536,  1814,  1149,   544,

								        0,  -496,  -961, -1395, -1800, -2176, -2523, -2843,

								    -3136, -3403, -3645, -3862, -4056, -4227, -4375, -4502,

								    -4608, -4694, -4761, -4809, -4840, -4854, -4851, -4833,

								    -4800, -4753, -4693, -4620, -4536, -4441, -4335, -4220,

								    -4096, -3964, -3825, -3679, -3528, -3372, -3211, -3047,

								    -2880, -2711, -2541, -2370, -2200, -2031, -1863, -1698,

								    -1536, -1378, -1225, -1077, -936,  -802,  -675,  -557,

								     -448,  -349,  -261,  -184, -120,   -69,   -31,    -8,

								        0

								};


								#ifdef _X86_


								// The same kernel samples as kern, scaled so that 16384 represents 1.0
								// (each entry is the matching kern entry divided by 4, e.g. 16384 for
								// 65536, 9216 for 36864, -1024 for -4096). Stored as short so four
								// weights fit in one 64-bit MMX register; OutputSpanIncremental looks the
								// weights up here when it calls Do1DBicubicMMX.
								const short kern14[2*Oversample+1] =

								{

								    16384, 16374, 16345, 16297, 16230, 16146, 16044, 15926,

								    15792, 15642, 15478, 15299, 15106, 14900, 14681, 14451,

								    14208, 13955, 13691, 13417, 13134, 12842, 12542, 12235,

								    11920, 11599, 11272, 10939, 10602, 10261,  9915,  9567,

								     9216,  8863,  8509,  8154,  7798,  7443,  7088,  6735,

								     6384,  6035,  5690,  5348,  5010,  4677,  4349,  4028,

								     3712,  3404,  3103,  2810,  2526,  2251,  1986,  1732,

								     1488,  1256,  1036,   828,   634,   454,   287,   136,

								        0,  -124,  -240,  -349,  -450,  -544,  -631,  -711,

								     -784,  -851,  -911,  -966, -1014, -1057, -1094, -1126,

								    -1152, -1174, -1190, -1202, -1210, -1214, -1213, -1208,

								    -1200, -1188, -1173, -1155, -1134, -1110, -1084, -1055,

								    -1024,  -991,  -956,  -920,  -882,  -843,  -803,  -762,

								     -720,  -678,  -635,  -593,  -550,  -508,  -466,  -425,

								     -384,  -345,  -306,  -269,  -234,  -201,  -169,  -139,

								     -112,   -87,   -65,   -46,   -30,   -17,    -8,    -2,

								        0

								};


								// Warning 4799 is MSVC's "function has no EMMS instruction" diagnostic.
								// Do1DBicubicMMX deliberately omits emms (see the end of its asm block).
								#pragma warning(disable : 4799)


								// Do1DBicubicMMX
								//
								// Computes one weighted sum of four ARGB pixels using MMX, handling all
								// four 8-bit channels in parallel.
								//
								//   filter - the four pixels to combine.
								//   w      - the four signed 16-bit weights, one per pixel (the caller in
								//            this file takes them from kern14, where 16384 is 1.0).
								//
								// Returns the combined pixel: every channel is saturated to 0..255 and
								// then limited to the resulting alpha value.
								//
								// The function leaves the MMX state active; the caller must execute emms
								// before any floating point code runs.
								ARGB FASTCALL Do1DBicubicMMX(ARGB filter[4], short w[4])

								{

								    ARGB result;


								    // 4 in each 16-bit lane: one half in the 3 fractional bits that the
								    // accumulated sum carries, added before the final shift to round.
								    static ULONGLONG HalfFix3 = 0x0004000400040004;


								    // really should do this function without any preamble.

								    _asm

								    {

								        mov        eax, filter     ;

								        mov        ebx, w          ;

								        pxor       mm0, mm0        ; zero


								        movq       mm1, [ebx]      ; w


								        movd       mm4, [eax]      ; filter[0]

								        movd       mm5, [eax+4]    ; filter[1]

								        movd       mm6, [eax+8]    ; filter[2]

								        movd       mm7, [eax+0xc]  ; filter[3]


								        // Widen each pixel's four bytes to four 16-bit lanes.
								        punpcklbw  mm4, mm0        ; 0a0r0g0b (interleave zeros)

								        punpcklbw  mm5, mm0        ;

								        punpcklbw  mm6, mm0        ;

								        punpcklbw  mm7, mm0        ;


								        // pmulhw below keeps only the high 16 bits of each product, so the
								        // channels are pre-shifted left by 5 to retain 3 fractional bits.
								        psllw      mm4, 5          ; 2 to compensate for the kernel resolution +

								        psllw      mm5, 5          ; 3 to support some fractional bits for the add.

								        psllw      mm6, 5          ;

								        psllw      mm7, 5          ;


								        // Broadcast w[0] and w[1] across all four lanes of mm2 and mm3.
								        movq       mm2, mm1        ;

								        punpcklwd  mm2, mm2        ; w1 w1 w0 w0

								        movq       mm3, mm2        ;

								        punpckldq  mm2, mm2        ; w0

								        punpckhdq  mm3, mm3        ; w1


								        pmulhw     mm4, mm2        ; filter[0]*w0

								        pmulhw     mm5, mm3        ; filter[1]*w1


								        // Broadcast w[2] and w[3] across all four lanes of mm1 and mm2.
								        punpckhwd  mm1, mm1        ; w3 w3 w2 w2

								        movq       mm2, mm1        ;

								        punpckldq  mm1, mm1        ; w2

								        punpckhdq  mm2, mm2        ; w3


								        pmulhw     mm6, mm1        ; filter[2]*w2

								        pmulhw     mm7, mm2        ; filter[3]*w3


								        // Sum the four weighted pixels with signed saturation.
								        paddsw     mm4, mm5        ; add

								        paddsw     mm6, mm7        ; add

								        paddsw     mm4, mm6        ; add


								        movq       mm3, HalfFix3   ;

								        paddsw     mm4, mm3        ; add half

								        psraw      mm4, 3          ; round the fractional bits away.


								        packuswb   mm4, mm4        ; saturate between [0, 0xff]


								        ; need to saturate the r, g, b components to range 0..a


								        // Replicate the alpha byte into all four bytes of the low dword,
								        // then compute min(channel, a) per byte as
								        // (channel + (255-a), saturated at 255) - (255-a).
								        movq       mm0, mm4        ;

								        punpcklbw  mm0, mm0        ; aarrggbb

								        punpckhwd  mm0, mm0        ; aaaarrrr

								        psrlq      mm0, 32         ; 0000aaaa

								        mov        eax, 0xffffffff ;

								        movd       mm1, eax        ;

								        psubb      mm1, mm0        ; 255-a

								        paddusb    mm4, mm1        ; saturate against 255

								        psubusb    mm4, mm1        ; drop it back to the right range


								        movd       result, mm4     ;

								        //emms; this instruction is done by the caller.

								    }

								    return result;

								}

								#endif


								// Do1DBicubic
								//
								// Computes one weighted sum of four premultiplied ARGB pixels using the
								// kern lookup table.
								//
								//   filter - the four pixels to combine.
								//   x      - kernel table offset for the sample position. It must lie in
								//            [0, Oversample] so that every kern index below stays inside
								//            the 2*Oversample+1 entry table.
								//
								// Returns the combined pixel: alpha is clamped to 0..255 and red, green
								// and blue are clamped to 0..alpha.
								inline ARGB Do1DBicubic(ARGB filter[4], const FIX16 x)
								{
								    // Lookup the convolution kernel. The four taps sit at table offsets
								    // Oversample+x, x, Oversample-x and 2*Oversample-x.
								    FIX16 w0 = kern[Oversample+x];
								    FIX16 w1 = kern[x];
								    FIX16 w2 = kern[Oversample-x];
								    FIX16 w3 = kern[2*Oversample-x];

								    // The sums are accumulated in LONG so that the sign is preserved
								    // when shifting - the bicubic filter will often have negative
								    // intermediate color components.
								    ULONG *p = (ULONG *)filter;
								    LONG a, r, g, b;

								    // Casting of p to ULONG and then having the LONG casts in the expressions
								    // below is to work around a compiler sign extension bug.
								    // In this particular case, the bug was dropping the '& 0xff' from the
								    // green component expression causing it to become negative
								    // which gets clamped to zero.
								    // When the bug is fixed, p should be reverted to LONG and casted to LONG
								    // and the LONG casts should be removed from the expressions below.
								    // Do not restructure the channel expressions without re-checking this.

								    // Alpha component
								    a = (w0 * (LONG)((p[0] >> 24) & 0xff) +
								         w1 * (LONG)((p[1] >> 24) & 0xff) +
								         w2 * (LONG)((p[2] >> 24) & 0xff) +
								         w3 * (LONG)((p[3] >> 24) & 0xff)) >> FIX16_SHIFT;
								    a = (a < 0) ? 0 : (a > 255) ? 255 : a;

								    // We have premultiplied alpha values - clamp R, G, B to alpha
								    // Red component
								    r = (w0 * (LONG)((p[0] >> 16) & 0xff) +
								         w1 * (LONG)((p[1] >> 16) & 0xff) +
								         w2 * (LONG)((p[2] >> 16) & 0xff) +
								         w3 * (LONG)((p[3] >> 16) & 0xff)) >> FIX16_SHIFT;
								    r = (r < 0) ? 0 : (r > a) ? a : r;

								    // Green component
								    g = (w0 * (LONG)((p[0] >> 8) & 0xff) +
								         w1 * (LONG)((p[1] >> 8) & 0xff) +
								         w2 * (LONG)((p[2] >> 8) & 0xff) +
								         w3 * (LONG)((p[3] >> 8) & 0xff)) >> FIX16_SHIFT;
								    g = (g < 0) ? 0 : (g > a) ? a : g;

								    // Blue component
								    b = (w0 * (LONG)(p[0] & 0xff) +
								         w1 * (LONG)(p[1] & 0xff) +
								         w2 * (LONG)(p[2] & 0xff) +
								         w3 * (LONG)(p[3] & 0xff)) >> FIX16_SHIFT;
								    b = (b < 0) ? 0 : (b > a) ? a : b;

								    // All four channels are now in 0..255. Pack them as unsigned values:
								    // shifting a signed LONG alpha of 128 or more left by 24 would shift
								    // into the sign bit.
								    return (((ULONG)a << 24) |
								            ((ULONG)r << 16) |
								            ((ULONG)g << 8)  |
								             (ULONG)b);
								}

								} // end DpOutputBicubicImageSpanNS


								/**************************************************************************\
								*
								* Function Description:
								*
								*   Outputs one destination span. Both span end points are transformed by
								*   DeviceToWorld, converted to 16.16 fixed point, and turned into a start
								*   position plus a per-pixel step for OutputSpanIncremental.
								*
								* Arguments:
								*
								*   y    - destination scanline.
								*   xMin - first destination pixel of the span (inclusive).
								*   xMax - end of the span (exclusive).
								*
								* Return Value:
								*
								*   Ok for an empty span, otherwise the status returned by
								*   OutputSpanIncremental.
								*
								\**************************************************************************/
								GpStatus
								DpOutputBicubicImageSpan::OutputSpan(
								  INT y,
								  INT xMin,
								  INT xMax     // xMax is exclusive
								)
								{
								    // Callers are expected to pass xMin <= xMax.
								    ASSERT(xMin <= xMax);

								    // Nothing to do for an empty span. An inverted span is rejected here
								    // as well, so that a negative width is never passed on in builds
								    // where the ASSERT above compiles away.
								    if(xMin >= xMax)
								    {
								        return Ok;
								    }

								    // Map both ends of the span back through DeviceToWorld.
								    GpPointF p1, p2;
								    p1.X = (REAL) xMin;
								    p1.Y = p2.Y = (REAL) y;
								    p2.X = (REAL) xMax;

								    DeviceToWorld.Transform(&p1);
								    DeviceToWorld.Transform(&p2);

								    // Convert to Fixed point notation - 16 bits of fractional precision.
								    FIX16 dx, dy, x0, y0;
								    x0 = GpRound(p1.X*FIX16_ONE);
								    y0 = GpRound(p1.Y*FIX16_ONE);

								    // Per-destination-pixel step; xMax-xMin is positive here.
								    dx = GpRound(((p2.X - p1.X)*FIX16_ONE)/(xMax-xMin));
								    dy = GpRound(((p2.Y - p1.Y)*FIX16_ONE)/(xMax-xMin));

								    return OutputSpanIncremental(y, xMin, xMax, x0, y0, dx, dy);
								}


								/**************************************************************************\
								*
								* Function Description:
								*
								*   Fills one destination span by bicubic resampling. For each destination
								*   pixel the 4x4 neighbourhood around the current source position is
								*   gathered, filtered vertically and then horizontally, and the source
								*   position is advanced by (dx, dy).
								*
								* Arguments:
								*
								*   y, xMin, xMax - destination scanline and span (xMax exclusive); passed
								*                   to Scan->NextBuffer to obtain the output buffer.
								*   x0, y0        - source position of the first pixel, 16.16 fixed point.
								*   dx, dy        - source position step per destination pixel.
								*
								* Return Value:
								*
								*   Ok.
								*
								\**************************************************************************/
								GpStatus
								DpOutputBicubicImageSpan::OutputSpanIncremental(
								    INT      y,
								    INT      xMin,
								    INT      xMax,
								    FIX16    x0,
								    FIX16    y0,
								    FIX16    dx,
								    FIX16    dy
								    )
								{
								    using namespace DpOutputBicubicImageSpanNS;

								    INT width  = xMax - xMin;
								    ARGB *buffer = Scan->NextBuffer(xMin, y, width);
								    ARGB *srcPtr0 = static_cast<ARGB*> (BmpData.Scan0);

								    // Stride in pixels rather than bytes. The divisor is made signed on
								    // purpose: sizeof() yields an unsigned type, and dividing by it would
								    // convert a negative stride to unsigned before the division.
								    // NOTE(review): whether a negative stride reaches this code is not
								    // visible here - confirm against the callers that fill DpBitmap::Delta.
								    INT stride = BmpData.Stride/static_cast<INT>(sizeof(ARGB));

								    INT ix;
								    INT iy;
								    FIX16 fracx;        // kernel table index for the x fraction
								    FIX16 fracy;        // kernel table index for the y fraction

								    ARGB filter[4][4];  // 4x4 filter array, indexed [column][row].
								    INT xstep, ystep;   // loop variables in x and y
								    INT wx[4];
								    INT wy[4];          // wrapped coordinates

								    // For all pixels in the destination span...
								    for(int i=0; i<width; i++)
								    {
								        // .. compute the position in source space.

								        // floor
								        ix = x0 >> FIX16_SHIFT;
								        iy = y0 >> FIX16_SHIFT;

								        // The kernel covers source columns ix-1 .. ix+2 and rows
								        // iy-1 .. iy+2.
								        for(xstep=0;xstep<4;xstep++) {
								            wx[xstep] = ix+xstep-1;
								            wy[xstep] = iy+xstep-1;
								        }

								        // Apply the wrapmode to all possible kernel combinations, but
								        // only when the window may touch an edge. The unsigned compare
								        // is true both when ix-1 (or iy-1) is negative and when it is
								        // at or beyond Width-4 (or Height-4).
								        if(BWrapMode != WrapModeClamp) {
								            if( ( (UINT)(ix-1) >= (UINT)( max(((INT)BmpData.Width)-4,0))) ||
								                ( (UINT)(iy-1) >= (UINT)( max(((INT)BmpData.Height)-4,0))) )
								            {
								                for(xstep=0;xstep<4;xstep++) {
								                    ApplyWrapMode(BWrapMode, wx[xstep], wy[xstep], BmpData.Width, BmpData.Height);
								                }
								            }
								        }

								        // Reduce the 16-bit fractions to KernShift bits: these are the
								        // offsets into the kernel lookup tables.
								        fracx = (x0  & FIX16_MASK) >> (FIX16_SHIFT-KernShift);
								        fracy = (y0  & FIX16_MASK) >> (FIX16_SHIFT-KernShift);

								        // Build up the filter domain surrounding the current pixel.
								        // Technically the loops below should go from -2 to 2 to correctly
								        // handle the case of fracx or fracy == 0, but our convolution kernel
								        // has zero at that point anyway, so we optimize it away.

								        for(ystep=0;ystep<4;ystep++) for(xstep=0;xstep<4;xstep++)
								        {
								            // !!! PERF: check the y step outside
								            //       of the x loop and use memset to fill the entire line.
								            //       This should reduce the complexity of the inner loop
								            //       comparison.

								            // Make sure the pixel is within the bounds of the source before
								            // accessing it.

								            if( ((wx[xstep]) >=0) &&
								                ((wy[ystep]) >=0) &&
								                ((wx[xstep]) < (INT)(BmpData.Width)) &&
								                ((wy[ystep]) < (INT)(BmpData.Height)) )
								            {
								                filter[xstep][ystep] =
								                  *(srcPtr0+stride*(wy[ystep])+(wx[xstep]));
								            } else {
								                // This means that this source pixel is outside of the valid
								                // bits in the source. (edge condition)
								                filter[xstep][ystep] = (ARGB) ClampColor;
								            }
								        }

								        #ifdef _X86_
								        if(OSInfo::HasMMX)
								        {
								            // Lookup the convolution kernel.
								            short w[4];

								            w[0] = kern14[Oversample+fracy];
								            w[1] = kern14[fracy];
								            w[2] = kern14[Oversample-fracy];
								            w[3] = kern14[2*Oversample-fracy];

								            // Filter the 4 vertical pixel columns
								            // Reuse filter[0] to store the intermediate result. This is
								            // safe because column xstep is fully read before
								            // filter[0][xstep] is written.
								            for(xstep=0;xstep<4;xstep++)
								            {
								                filter[0][xstep] = Do1DBicubicMMX(filter[xstep], w);
								            }

								            // Lookup the convolution kernel.
								            w[0] = kern14[Oversample+fracx];
								            w[1] = kern14[fracx];
								            w[2] = kern14[Oversample-fracx];
								            w[3] = kern14[2*Oversample-fracx];

								            // Filter horizontally.
								            *buffer++ = Do1DBicubicMMX(filter[0], w);
								        }
								        else
								        #endif
								        {
								            // Filter the 4 vertical pixel columns
								            // Reuse filter[0] to store the intermediate result. This is
								            // safe because column xstep is fully read before
								            // filter[0][xstep] is written.
								            for(xstep=0;xstep<4;xstep++)
								            {
								                filter[0][xstep] = Do1DBicubic(filter[xstep], fracy);
								            }

								            // Filter horizontally.
								            *buffer++ = Do1DBicubic(filter[0], fracx);
								        }

								        // Update source position (common to both filter paths).
								        x0 += dx;
								        y0 += dy;
								    }

								    // Clear the MMX state; Do1DBicubicMMX leaves emms to its caller.

								    #ifdef _X86_
								    if(OSInfo::HasMMX)
								    {
								        _asm emms;
								    }
								    #endif

								    return Ok;
								}