windows-server-2003/windows/advcore/gdiplus/engine/render/bicubic.cpp

/**************************************************************************\
*
* Copyright (c) 1999  Microsoft Corporation
*
* Module Name:
*
*   bicubic.cpp
*
* Abstract:
*
*   Bicubic Resampling code
*
* Created:
*
*   11/03/1999 ASecchia
\**************************************************************************/

#include "precomp.hpp"

DpOutputBicubicImageSpan::DpOutputBicubicImageSpan(
    DpBitmap* bitmap,
    DpScanBuffer * scan,
    DpContext* context,
    DpImageAttributes imageAttributes,
    INT numPoints,
    const GpPointF *dstPoints,
    const GpRectF *srcRect
    )
{
    Scan     = scan;
    BWrapMode = imageAttributes.wrapMode;
    ClampColor = imageAttributes.clampColor;
    SrcRectClamp = imageAttributes.srcRectClamp;
    dBitmap   = bitmap;

    ASSERT(dBitmap != NULL);
    ASSERT(dBitmap->IsValid());

    // on bad bitmap, we return with Valid = FALSE
    if (dBitmap == NULL ||
        !dBitmap->IsValid() )
    {
        dBitmap = NULL;
        return;
    } else {
        BmpData.Width = dBitmap->Width;
        BmpData.Height = dBitmap->Height;
        BmpData.PixelFormat = PIXFMT_32BPP_PARGB;
        BmpData.Stride = dBitmap->Delta;
        BmpData.Scan0 = dBitmap->Bits;
    }

    WorldToDevice = context->WorldToDevice;
    context->GetDeviceToWorld(&DeviceToWorld);

    if(srcRect)
        SrcRect = *srcRect;
    else
    {
        SrcRect.X = 0;
        SrcRect.Y = 0;
        SrcRect.Width  = (REAL) dBitmap->Width;
        SrcRect.Height = (REAL) dBitmap->Height;
    }

    GpPointF points[4];

    GpMatrix xForm;
    BOOL existsTransform = TRUE;

    switch(numPoints)
    {
    case 0:
        points[0].X = 0;
        points[0].Y = 0;
        points[1].X = (REAL) SrcRect.Width;
        points[1].Y = 0;
        points[2].X = 0;
        points[2].Y = (REAL) SrcRect.Height;
        break;

    case 1:
        points[0] = dstPoints[0];
        points[1].X = (REAL) (points[0].X + SrcRect.Width);
        points[1].Y = points[0].Y;
        points[2].X = points[0].X;
        points[2].Y = (REAL) (points[0].Y + SrcRect.Height);
        break;

    case 3:
    case 4:
        GpMemcpy(&points[0], dstPoints, numPoints*sizeof(GpPointF));
        break;

    default:
        existsTransform = FALSE;
    }

    if(existsTransform)
    {
        xForm.InferAffineMatrix(points, SrcRect);
    }

    WorldToDevice = context->WorldToDevice;
    WorldToDevice.Prepend(xForm);
    if(WorldToDevice.IsInvertible())
    {
        DeviceToWorld = WorldToDevice;
        DeviceToWorld.Invert();
    }
}

namespace DpOutputBicubicImageSpanNS {
const INT KernShift = 6;
const INT Oversample = 1 << KernShift;
const FIX16 kern[2*Oversample+1] =
{
    65536, 65496, 65379, 65186, 64920, 64583, 64177, 63705,
    63168, 62569, 61911, 61195, 60424, 59600, 58725, 57802,
    56832, 55818, 54763, 53668, 52536, 51369, 50169, 48939,
    47680, 46395, 45087, 43757, 42408, 41042, 39661, 38268,
    36864, 35452, 34035, 32614, 31192, 29771, 28353, 26941,
    25536, 24141, 22759, 21391, 20040, 18708, 17397, 16110,
    14848, 13614, 12411, 11240, 10104,  9005,  7945,  6927,
     5952,  5023,  4143,  3313, 2536,  1814,  1149,   544,
        0,  -496,  -961, -1395, -1800, -2176, -2523, -2843,
    -3136, -3403, -3645, -3862, -4056, -4227, -4375, -4502,
    -4608, -4694, -4761, -4809, -4840, -4854, -4851, -4833,
    -4800, -4753, -4693, -4620, -4536, -4441, -4335, -4220,
    -4096, -3964, -3825, -3679, -3528, -3372, -3211, -3047,
    -2880, -2711, -2541, -2370, -2200, -2031, -1863, -1698,
    -1536, -1378, -1225, -1077, -936,  -802,  -675,  -557,
     -448,  -349,  -261,  -184, -120,   -69,   -31,    -8,
        0
};


#ifdef _X86_

const short kern14[2*Oversample+1] =
{
    16384, 16374, 16345, 16297, 16230, 16146, 16044, 15926,
    15792, 15642, 15478, 15299, 15106, 14900, 14681, 14451,
    14208, 13955, 13691, 13417, 13134, 12842, 12542, 12235,
    11920, 11599, 11272, 10939, 10602, 10261,  9915,  9567,
     9216,  8863,  8509,  8154,  7798,  7443,  7088,  6735,
     6384,  6035,  5690,  5348,  5010,  4677,  4349,  4028,
     3712,  3404,  3103,  2810,  2526,  2251,  1986,  1732,
     1488,  1256,  1036,   828,   634,   454,   287,   136,
        0,  -124,  -240,  -349,  -450,  -544,  -631,  -711,
     -784,  -851,  -911,  -966, -1014, -1057, -1094, -1126,
    -1152, -1174, -1190, -1202, -1210, -1214, -1213, -1208,
    -1200, -1188, -1173, -1155, -1134, -1110, -1084, -1055,
    -1024,  -991,  -956,  -920,  -882,  -843,  -803,  -762,
     -720,  -678,  -635,  -593,  -550,  -508,  -466,  -425,
     -384,  -345,  -306,  -269,  -234,  -201,  -169,  -139,
     -112,   -87,   -65,   -46,   -30,   -17,    -8,    -2,
        0
};

#pragma warning(disable : 4799)

ARGB FASTCALL Do1DBicubicMMX(ARGB filter[4], short w[4])
{
    ARGB result;
    
    static ULONGLONG HalfFix3 = 0x0004000400040004;

    // really should do this function without any preamble.
    _asm
    {
        mov        eax, filter     ;
        mov        ebx, w          ;
        pxor       mm0, mm0        ; zero

        movq       mm1, [ebx]      ; w

        movd       mm4, [eax]      ; filter[0]
        movd       mm5, [eax+4]    ; filter[1]
        movd       mm6, [eax+8]    ; filter[2]
        movd       mm7, [eax+0xc]  ; filter[3]

        punpcklbw  mm4, mm0        ; 0a0r0g0b (interleave zeros)
        punpcklbw  mm5, mm0        ;
        punpcklbw  mm6, mm0        ;
        punpcklbw  mm7, mm0        ;

        psllw      mm4, 5          ; 2 to compensate for the kernel resolution +
        psllw      mm5, 5          ; 3 to support some fractional bits for the add.
        psllw      mm6, 5          ;
        psllw      mm7, 5          ;

        movq       mm2, mm1        ;
        punpcklwd  mm2, mm2        ; w1 w1 w0 w0
        movq       mm3, mm2        ;
        punpckldq  mm2, mm2        ; w0
        punpckhdq  mm3, mm3        ; w1

        pmulhw     mm4, mm2        ; filter[0]*w0
        pmulhw     mm5, mm3        ; filter[1]*w1

        punpckhwd  mm1, mm1        ; w3 w3 w2 w2
        movq       mm2, mm1        ;
        punpckldq  mm1, mm1        ; w2
        punpckhdq  mm2, mm2        ; w3

        pmulhw     mm6, mm1        ; filter[2]*w2
        pmulhw     mm7, mm2        ; filter[3]*w3

        paddsw     mm4, mm5        ; add
        paddsw     mm6, mm7        ; add
        paddsw     mm4, mm6        ; add

        movq       mm3, HalfFix3   ; 
        paddsw     mm4, mm3        ; add half
        psraw      mm4, 3          ; round the fractional bits away.
        
        packuswb   mm4, mm4        ; saturate between [0, 0xff]

        ; need to saturate the r, g, b components to range 0..a

        movq       mm0, mm4        ;
        punpcklbw  mm0, mm0        ; aarrggbb
        punpckhwd  mm0, mm0        ; aaaarrrr
        psrlq      mm0, 32         ; 0000aaaa
        mov        eax, 0xffffffff ;
        movd       mm1, eax        ;
        psubb      mm1, mm0        ; 255-a
        paddusb    mm4, mm1        ; saturate against 255
        psubusb    mm4, mm1        ; drop it back to the right range

        movd       result, mm4     ;
        //emms; this instruction is done by the caller.
    }
    return result;
}
#endif

inline ARGB Do1DBicubic(ARGB filter[4], const FIX16 x)
{
    // Lookup the convolution kernel.
    FIX16 w0 = kern[Oversample+x];
    FIX16 w1 = kern[x];
    FIX16 w2 = kern[Oversample-x];
    FIX16 w3 = kern[2*Oversample-x];

    // Cast to LONG so that we preserve the sign when we start
    // shifting values around - the bicubic filter will often
    // have negative intermediate color components.
    ULONG *p = (ULONG *)filter;
    LONG a, r, g, b;

    // Casting of p to ULONG and then having the LONG casts in the expressions
    // below is to work around a compiler sign extension bug.
    // In this particular case, the bug was dropping the '& 0xff' from the
    // green component expression causing it to become negative
    // which gets clamped to zero.
    // When the bug is fixed, p should be reverted to LONG and casted to LONG
    // and the LONG casts should be removed from the expressions below.

    // Alpha component
    a = (w0 * (LONG)((p[0] >> 24) & 0xff) +
         w1 * (LONG)((p[1] >> 24) & 0xff) +
         w2 * (LONG)((p[2] >> 24) & 0xff) +
         w3 * (LONG)((p[3] >> 24) & 0xff)) >> FIX16_SHIFT;
    a = (a < 0) ? 0 : (a > 255) ? 255 : a;

    // We have premultiplied alpha values - clamp R, G, B to alpha
    // Red component
    r = (w0 * (LONG)((p[0] >> 16) & 0xff) +
         w1 * (LONG)((p[1] >> 16) & 0xff) +
         w2 * (LONG)((p[2] >> 16) & 0xff) +
         w3 * (LONG)((p[3] >> 16) & 0xff)) >> FIX16_SHIFT;
    r = (r < 0) ? 0 : (r > a) ? a : r;

    // Green component
    g = (w0 * (LONG)((p[0] >> 8) & 0xff) +
         w1 * (LONG)((p[1] >> 8) & 0xff) +
         w2 * (LONG)((p[2] >> 8) & 0xff) +
         w3 * (LONG)((p[3] >> 8) & 0xff)) >> FIX16_SHIFT;
    g = (g < 0) ? 0 : (g > a) ? a : g;

    // Blue component
    b = (w0 * (LONG)(p[0] & 0xff) +
         w1 * (LONG)(p[1] & 0xff) +
         w2 * (LONG)(p[2] & 0xff) +
         w3 * (LONG)(p[3] & 0xff)) >> FIX16_SHIFT;
    b = (b < 0) ? 0 : (b > a) ? a : b;

    return ((a << 24) | (r << 16) | (g << 8) | b);
}
} // end DpOutputBicubicImageSpanNS


GpStatus
DpOutputBicubicImageSpan::OutputSpan(
  INT y,
  INT xMin,
  INT xMax     // xMax is exclusive
)
{
    // Nothing to do.

    if(xMin==xMax)
    {
        return Ok;
    }

    ASSERT(xMin < xMax);

    GpPointF p1, p2;
    p1.X = (REAL) xMin;
    p1.Y = p2.Y = (REAL) y;
    p2.X = (REAL) xMax;

    DeviceToWorld.Transform(&p1);
    DeviceToWorld.Transform(&p2);

    // Convert to Fixed point notation - 16 bits of fractional precision.
    FIX16 dx, dy, x0, y0;
    x0 = GpRound(p1.X*FIX16_ONE);
    y0 = GpRound(p1.Y*FIX16_ONE);

    ASSERT(xMin < xMax);
    dx = GpRound(((p2.X - p1.X)*FIX16_ONE)/(xMax-xMin));
    dy = GpRound(((p2.Y - p1.Y)*FIX16_ONE)/(xMax-xMin));

    return OutputSpanIncremental(y, xMin, xMax, x0, y0, dx, dy);
}

GpStatus
DpOutputBicubicImageSpan::OutputSpanIncremental(
    INT      y,
    INT      xMin,
    INT      xMax,
    FIX16    x0,
    FIX16    y0,
    FIX16    dx,
    FIX16    dy
    )
{
    using namespace DpOutputBicubicImageSpanNS;
    INT width  = xMax - xMin;
    ARGB *buffer = Scan->NextBuffer(xMin, y, width);
    ARGB *srcPtr0 = static_cast<ARGB*> (BmpData.Scan0);
    INT stride = BmpData.Stride/sizeof(ARGB);

    INT ix;
    INT iy;
    FIX16 fracx;        // hold the fractional increment for ix
    FIX16 fracy;        // hold the fractional increment for iy

    ARGB filter[4][4];  // 4x4 filter array.
    INT xstep, ystep;   // loop variables in x and y
    INT wx[4];
    INT wy[4];          // wrapped coordinates

    // For all pixels in the destination span...
    for(int i=0; i<width; i++)
    {
        // .. compute the position in source space.

        // floor
        ix = x0 >> FIX16_SHIFT;
        iy = y0 >> FIX16_SHIFT;

        // Apply the wrapmode to all possible kernel combinations.
        for(xstep=0;xstep<4;xstep++) {
            wx[xstep] = ix+xstep-1;
            wy[xstep] = iy+xstep-1;
        }


        if(BWrapMode != WrapModeClamp) {
            if( ( (UINT)(ix-1) >= (UINT)( max(((INT)BmpData.Width)-4,0))) ||
                ( (UINT)(iy-1) >= (UINT)( max(((INT)BmpData.Height)-4,0))) )
            {
                for(xstep=0;xstep<4;xstep++) {
                    ApplyWrapMode(BWrapMode, wx[xstep], wy[xstep], BmpData.Width, BmpData.Height);
                }
            }
        }

        // Check to see if we're outside of the valid drawing range specified
        // in the DpBitmap.

        fracx = (x0  & FIX16_MASK) >> (FIX16_SHIFT-KernShift);
        fracy = (y0  & FIX16_MASK) >> (FIX16_SHIFT-KernShift);

        // Build up the filter domain surrounding the current pixel.
        // Technically the loops below should go from -2 to 2 to correctly
        // handle the case of fracx or fracy == 0, but our convolution kernel
        // has zero at that point anyway, so we optimize it away.
        
        for(ystep=0;ystep<4;ystep++) for(xstep=0;xstep<4;xstep++)
        {
            // !!! PERF: check the y step outside
            //       of the x loop and use memset to fill the entire line.
            //       This should reduce the complexity of the inner loop
            //       comparison.

            // Make sure the pixel is within the bounds of the source before
            // accessing it.

            if( ((wx[xstep]) >=0) &&
                ((wy[ystep]) >=0) &&
                ((wx[xstep]) < (INT)(BmpData.Width)) &&
                ((wy[ystep]) < (INT)(BmpData.Height)) )
            {
                filter[xstep][ystep] =
                  *(srcPtr0+stride*(wy[ystep])+(wx[xstep]));
            } else {
                // This means that this source pixel is outside of the valid
                // bits in the source. (edge condition)
                filter[xstep][ystep] = (ARGB) ClampColor;
            }
        }

        #ifdef _X86_
        if(OSInfo::HasMMX)
        {
            // Lookup the convolution kernel.
            short w[4];

            w[0] = kern14[Oversample+fracy];
            w[1] = kern14[fracy];
            w[2] = kern14[Oversample-fracy];
            w[3] = kern14[2*Oversample-fracy];

            // Filter the 4 vertical pixel columns
            // Reuse filter[0] to store the intermediate result
            for(xstep=0;xstep<4;xstep++)
            {
                filter[0][xstep] = Do1DBicubicMMX(filter[xstep], w);
            }

                // Lookup the convolution kernel.

            w[0] = kern14[Oversample+fracx];
            w[1] = kern14[fracx];
            w[2] = kern14[Oversample-fracx];
            w[3] = kern14[2*Oversample-fracx];

            // Filter horizontally.
            *buffer++ = Do1DBicubicMMX(filter[0], w);

            // Update source position
            x0 += dx;
            y0 += dy;
        }
        else
        #endif
        {
            // Filter the 4 vertical pixel columns
            // Reuse filter[0] to store the intermediate result
            for(xstep=0;xstep<4;xstep++)
            {
                filter[0][xstep] = Do1DBicubic(filter[xstep], fracy);
            }

            // Filter horizontally.
            *buffer++ = Do1DBicubic(filter[0], fracx);

            // Update source position
            x0 += dx;
            y0 += dy;
        }
    }

    // Clear the MMX state

    #ifdef _X86_
    if(OSInfo::HasMMX)
    {
        _asm emms;
    }
    #endif

    return Ok;
}