/*
 * Copyright (c) 1993-1994      Microsoft Corporation
 * Copyright (c) 1994-1995      Digital Equipment Corporation
 *
 * Module Name: fastfill.c
 *
 * This module uses a quick breakup algorithm to draw unclipped, non-complex
 * rectangles.  The original version of this module was included in the
 * Daytona Beta-1 DDK S3 sample code.  All of the S3-specific portions were
 * removed and replaced by TGA-specific code.
 *
 * History:
 *
 * 30-May-1994  Barry Tannenbaum
 *      Initial TGA version.
 *
 *  1-Jun-1994  Barry Tannenbaum
 *      Fixed bug in masked fill which caused server to accvio.  "i" is *not*
 *      the same as "index" ;-)
 *
 * 10-Jun-1994  Barry Tannenbaum
 *      Fixed bug which resulted in an unpainted line across the bottom of the
 *      straight portion of a filled rounded rectangle.
 *
 * 30-Jun-1994  Barry Tannenbaum
 *      Fixed bug with Copy mode not setting the color properly
 *
 * 21-Jul-1994	Bob Seitsinger
 *	Write the plane mask register when using block fill mode.
 *
 *  9-Aug-1994  Barry Tannenbaum
 *      Setup for 24 plane support:
 *      - TGAMODE and TGAROP now take simple ULONGs instead of structures
 *      - Use default values from ppdev->ulModeTemplate & ppdev->ulRopTemplate
 *
 * 22-Sep-1994  Bob Seitsinger
 *      Make use of ppdev->ulPlanemaskTemplate. Also, alignment is 1-pixel for
 *      all 'Fill' modes, so no need to 'if' on bmf, just align to 4 bytes.
 *
 * 25-Oct-1994  Bob Seitsinger
 *      Write plane mask with ppdev->ulPlanemaskTemplate all the
 *      time.
 *
 *      For 24 plane boards we don't want to blow away the
 *      windows ids for 3d windows. The GL driver removes the
 *      window ids when it relinquishes a rectangular area.
 *
 *  2-Mar-1995  Barry Tannenbaum
 *      EV5 changes
 */

#include "driver.h"
#include "fill.h"

#define RIGHT 0
#define LEFT  1
#define SWAP(a, b, tmp) { tmp = a; a = b; b = tmp; }

typedef struct _EDGEDATA {
LONG      x;                // Current x position
LONG      dx;               // # pixels to advance x on each scan
LONG      lError;           // Current DDA error
LONG      lErrorUp;         // DDA error increment on each scan
LONG      lErrorDown;       // DDA error adjustment
POINTFIX* pptfx;            // Points to start of current edge
LONG      dptfx;            // Delta (in bytes) from pptfx to next point
LONG      cy;               // Number of scans to go for this edge
} EDGEDATA;                         /* ed, ped */

/////////////////////////////////////////////////////////////////////////
// The x86 C compiler insists on making a divide and modulus operation
// into two DIVs, when it can in fact be done in one.  So we use this
// macro.
//
// Note: QUOTIENT_REMAINDER implicitly takes unsigned arguments.

#if defined(i386)

#define QUOTIENT_REMAINDER(ulNumerator, ulDenominator, ulQuotient, ulRemainder) \
{                                                               \
    __asm mov eax, ulNumerator                                  \
    __asm sub edx, edx                                          \
    __asm div ulDenominator                                     \
    __asm mov ulQuotient, eax                                   \
    __asm mov ulRemainder, edx                                  \
}

#else

#define QUOTIENT_REMAINDER(ulNumerator, ulDenominator, ulQuotient, ulRemainder) \
{                                                               \
    ulQuotient  = (ULONG) ulNumerator / (ULONG) ulDenominator;  \
    ulRemainder = (ULONG) ulNumerator % (ULONG) ulDenominator;  \
}

#endif

/*
 * BOOL bMmFastFill
 *
 * Draws a non-complex, unclipped polygon.  'Non-complex' is defined as
 * having only two edges that are monotonic increasing in 'y'.  That is,
 * the polygon cannot have more than one disconnected segment on any given
 * scan.  Note that the edges of the polygon can self-intersect, so hourglass
 * shapes are permissible.  This restriction permits this routine to run two
 * simultaneous DDAs, and no sorting of the edges is required.
 *
 * Note that NT's fill convention is different from that of Win 3.1 or 4.0.
 * With the additional complication of fractional end-points, our convention
 * is the same as in 'X-Windows'.  But a DDA is a DDA is a DDA, so once you
 * figure out how we compute the DDA terms for NT, you're golden.
 *
 * This routine handles patterns only when the S3 hardware patterns can be
 * used.  The reason for this is that once the S3 pattern initialization is
 * done, pattern fills appear to the programmer exactly the same as solid
 * fills (with the slight difference that different registers and commands
 * are used).  Handling 'vIoFillPatSlow' style patterns in this routine
 * would be non-trivial...
 *
 * We take advantage of the fact that the S3 automatically advances the
 * current 'y' to the following scan whenever a rectangle is output so that
 * we have to write to the accelerator three times for every scan: one for
 * the new 'x', one for the new 'width', and one for the drawing command.
 *
 * This routine is in no way the ultimate convex polygon drawing routine
 * (what can I say, I was pressed for time when I wrote this :-).  Some
 * obvious things that would make it faster:
 *
 *    1) Write it in Asm and amortize the FIFO checking costs (check out
 *       i386\fastfill.asm for a version that does this).
 *
 *    2) Take advantage of any hardware such as the ATI's SCAN_TO_X
 *       command, or any built-in trapezoid support (note that with NT
 *       you may get non-integer end-points, so you must be able to
 *       program the trapezoid DDA terms directly).
 *
 *    3) Do some rectangle coalescing when both edges are y-major.  This
 *       could permit removal of my vertical-edges special case.  I
 *       was also thinking of special casing y-major left edges on the
 *       S3, because the S3 leaves the current 'x' unchanged on every blt,
 *       so a scan that starts on the same 'x' as the one above it
 *       would require only two commands to the accelerator (obviously,
 *       this only helps when we're not overdriving the accelerator).
 *
 *    4) Make the non-complex polygon detection faster.  If I could have
 *       modified memory before the start of after the end of the buffer,
 *       I could have simplified the detection code.  But since I expect
 *       this buffer to come from GDI, I can't do that.  Another thing
 *       would be to have GDI give a flag on calls that are guaranteed
 *       to be convex, such as 'Ellipses' and 'RoundRects'.  Note that
 *       the buffer would still have to be scanned to find the top-most
 *       point.
 *
 *    5) Special case integer points.  Unfortunately, for this to be
 *       worth-while would require GDI to give us a flag when all the
 *       end-points of a path are integers, which it doesn't do.
 *
 *    6) Add rectangular clipping support.
 *
 *    7) Implement support for a single sub-path that spans multiple
 *       path data records, so that we don't have to copy all the points
 *       to a single buffer like we do in 'fillpath.c'.
 *
 *    8) Use 'ebp' and/or 'esp' as a general register in the inner loops
 *       of the Asm loops, and also Pentium-optimize the code.  It's safe
 *       to use 'esp' on NT because it's guaranteed that no interrupts
 *       will be taken in our thread context, and nobody else looks at the
 *       stack pointer from our context.
 *
 *    9) Do the fill bottom-up instead of top-down.  With the S3, we have
 *       to only set 'cur_y' once because each drawing command automatically
 *       advances 'cur_y' (unless the polygon has zero pels lit on a scan),
 *       so we set this right at the beginning.  But for an integer end-point
 *       polygon, unless the top edge is horizontal, no pixels are lit on
 *       that first scan (so at the beginning of almost every integer
 *       polygon, we go through the 'zero width' logic and again set
 *       'cur_y').  We could avoid this extra work by building the polygon
 *       from bottom to top: for the bottom-most point B in a polygon, it
 *       is guaranteed that any scan with lit pixels will be no lower than
 *       'ceiling(B.y) - 1'.  Unfortunately, building bottom-up makes the
 *       fractional-DDA calculations a little more complex, so I didn't do it.
 *
 *       Building bottom-up would also improve the polygon score in version
 *       3.11 of a certain benchmark, because it has a big rectangle at the
 *       top of every polygon -- we would get better processing overlap
 *       because we wouldn't have to wait around for the accelerator to
 *       finish drawing the big rectangle.
 *
 *   10) Make a better guess in the initialization as to which edge is the
 *       'left' edge, and which is the 'right'.  As it is, we immediately
 *       go through the swap-edges logic for half of all polygons when we
 *       start to run the DDA.  The reason why I didn't implement better-guess
 *       code is because it would have to look at the end-point of the top
 *       edges, and to get at the end-points we have to watch that we don't
 *       wrap around the ends of the points buffer.
 *
 *   11) Lots of other things I haven't thought of.
 *
 * NOTE: Unlike the x86 Asm version, this routine does NOT assume that it
 *       has 16 FIFO entries available.
 *
 * Returns TRUE if the polygon was drawn; FALSE if the polygon was complex.
 *
 */

BOOL bMmFastFill (PPDEV        ppdev,
                  LONG         cEdges,         // Includes close figure edge
                  POINTFIX    *pptfxFirst,
                  fill_data_t *fill_data)
{
    LONG      yTrapezoid;   // Top scan for next trapezoid
    LONG      cyTrapezoid;  // Number of scans in current trapezoid
    LONG      yStart;       // y-position of start point in current edge
    LONG      dM;           // Edge delta in FIX units in x direction
    LONG      dN;           // Edge delta in FIX units in y direction
    LONG      i;
    POINTFIX* pptfxLast;    // Points to the last point in the polygon array
    POINTFIX* pptfxTop;     // Points to the top-most point in the polygon
    POINTFIX* pptfxOld;     // Start point in current edge
    POINTFIX* pptfxScan;    // Current edge pointer for finding pptfxTop
    LONG      cScanEdges;   // Number of edges scanned to find pptfxTop
                            //  (doesn't include the closefigure edge)
    LONG      iEdge;
    LONG      lQuotient;
    LONG      lRemainder;

    EDGEDATA  aed[2];       // DDA terms and stuff
    EDGEDATA* ped;

    ULONG       mode;               // TGA mode to use
    ULONG       color;
    BOOL        block_fill;

    /////////////////////////////////////////////////////////////////
    // See if the polygon is 'non-complex'

    pptfxScan = pptfxFirst;
    pptfxTop  = pptfxFirst;                 // Assume for now that the first
                                            //  point in path is the topmost
    pptfxLast = pptfxFirst + cEdges - 1;

    // 'pptfxScan' will always point to the first point in the current
    // edge, and 'cScanEdges' will the number of edges remaining, including
    // the current one:

    cScanEdges = cEdges - 1;     // The number of edges, not counting close figure

    if ((pptfxScan + 1)->y > pptfxScan->y)
    {
        // Collect all downs:

        do {
            if (--cScanEdges == 0)
                goto SetUpForFilling;
            pptfxScan++;
        } while ((pptfxScan + 1)->y >= pptfxScan->y);

        // Collect all ups:

        do {
            if (--cScanEdges == 0)
                goto SetUpForFillingCheck;
            pptfxScan++;
        } while ((pptfxScan + 1)->y <= pptfxScan->y);

        // Collect all downs:

        pptfxTop = pptfxScan;

        do {
            if ((pptfxScan + 1)->y > pptfxFirst->y)
                break;

            if (--cScanEdges == 0)
                goto SetUpForFilling;
            pptfxScan++;
        } while ((pptfxScan + 1)->y >= pptfxScan->y);

        return(FALSE);
    }
    else
    {
        // Collect all ups:

        do {
            pptfxTop++;                 // We increment this now because we
                                        //  want it to point to the very last
                                        //  point if we early out in the next
                                        //  statement...
            if (--cScanEdges == 0)
                goto SetUpForFilling;
        } while ((pptfxTop + 1)->y <= pptfxTop->y);

        // Collect all downs:

        pptfxScan = pptfxTop;
        do {
            if (--cScanEdges == 0)
                goto SetUpForFilling;
            pptfxScan++;
        } while ((pptfxScan + 1)->y >= pptfxScan->y);

        // Collect all ups:

        do {
            if ((pptfxScan + 1)->y < pptfxFirst->y)
                break;

            if (--cScanEdges == 0)
                goto SetUpForFilling;
            pptfxScan++;
        } while ((pptfxScan + 1)->y <= pptfxScan->y);

        return(FALSE);
    }

SetUpForFillingCheck:

    // We check to see if the end of the current edge is higher
    // than the top edge we've found so far:

    if ((pptfxScan + 1)->y < pptfxTop->y)
        pptfxTop = pptfxScan + 1;

SetUpForFilling:

    /////////////////////////////////////////////////////////////////
    // Some Initialization

    yTrapezoid = (pptfxTop->y + 15) >> 4;

    // Make sure we initialize the DDAs appropriately:

    aed[LEFT].cy  = 0;
    aed[RIGHT].cy = 0;

    // For now, guess as to which is the left and which is the right edge:

    aed[LEFT].dptfx  = -(LONG) sizeof(POINTFIX);
    aed[RIGHT].dptfx = sizeof(POINTFIX);
    aed[LEFT].pptfx  = pptfxTop;
    aed[RIGHT].pptfx = pptfxTop;


    // Note that the chip is no longer in simple mode

    ppdev->bSimpleMode = FALSE;
    WBFLUSH (ppdev);

    if (fill_data->iSolidColor != -1)
    {
        // If the ROP is COPY (simply set the pixmap to the solid color) we can
        // use BLOCK_FILL mode.  Unfortunately, BLOCK_FILL mode doesn't pay any
        // attention to the ROP, so if we want to do anything fancy (say, XOR
        // the data onto the screen) we have to use TRANSPARENT_FILL.
        // BLOCK_FILL is faster (about 4x) but TRANSPARENT_FILL is more
        // flexible

        switch (fill_data->tga_rop)
        {
            case TGA_ROP_COPY:
                mode = TGA_MODE_BLOCK_FILL;
                if (BMF_8BPP == ppdev->iFormat)
                {
                    color =  fill_data->iSolidColor |
                            (fill_data->iSolidColor <<  8);
                    color |= color << 16;
                }
                else
                    color =  fill_data->iSolidColor;
                TGAPLANEMASK (ppdev, ppdev->ulPlanemaskTemplate);
                break;

            case TGA_ROP_INVERT:
                mode = TGA_MODE_TRANSPARENT_FILL;
                TGAPLANEMASK (ppdev, ppdev->ulPlanemaskTemplate);
                break;

            default:
                mode = TGA_MODE_TRANSPARENT_FILL;
                if (BMF_8BPP == ppdev->iFormat)
                {
                    color =  fill_data->iSolidColor |
                            (fill_data->iSolidColor <<  8);
                    color |= 16;
		}
                else
                    color =  fill_data->iSolidColor;
                TGAFOREGROUND (ppdev, color);
                TGAPLANEMASK (ppdev, ppdev->ulPlanemaskTemplate);
                break;
        }
    }
    else
    {
        if (TGA_ROP_COPY == fill_data->tga_rop)
        {
            mode = TGA_MODE_BLOCK_FILL;
            if (BMF_8BPP == ppdev->iFormat)
            {
                color =  fill_data->iSolidColor |
                        (fill_data->iSolidColor <<  8);
                color |= color << 16;
            }
            else
                color =  fill_data->iSolidColor;
            TGAPLANEMASK (ppdev, ppdev->ulPlanemaskTemplate);
       }
       else
       {
            if (! fill_data->mask)
                mode = TGA_MODE_OPAQUE_FILL;

                ///// !!!! WE'RE NOT SETTING THE FOREGROUND AND BACKGROUND REGISTERS !!!!

            else
            {
                mode = TGA_MODE_TRANSPARENT_FILL;
                if (BMF_8BPP == ppdev->iFormat)
                {
                    color =  fill_data->iSolidColor |
                            (fill_data->iSolidColor <<  8);
                    color |= color << 16;
                }
                else
                    color =  fill_data->iSolidColor;
                TGAFOREGROUND (ppdev, color);
	    }
            TGAPLANEMASK (ppdev, ppdev->ulPlanemaskTemplate);
       }
    }

    // Set the mode and raster op registers

    block_fill = (TGA_MODE_BLOCK_FILL == mode);
    mode |= ppdev->ulModeTemplate;
    TGAMODE (ppdev, mode);

    TGAROP (ppdev, fill_data->tga_rop | ppdev->ulRopTemplate);

    if (NULL == fill_data->mask)
        TGADATA (ppdev, 0xffffffff);         // Write to all 32 pixels

    // If we're using BLOCK_FILL mode, load the BLK_COLOR registers

    if (block_fill && (NULL == fill_data->pattern))
        TGALOADCOLORREGS (ppdev, color, ppdev->ulBitCount);

    CYCLE_REGS (ppdev);

NewTrapezoid:

    /////////////////////////////////////////////////////////////////
    // DDA initialization

    for (iEdge = 1; iEdge >= 0; iEdge--)
    {
        ped = &aed[iEdge];
        if (ped->cy == 0)
        {
            // Need a new DDA:

            do {
                cEdges--;
                if (cEdges < 0)
                    return(TRUE);

                // Find the next left edge, accounting for wrapping:

                pptfxOld = ped->pptfx;
                ped->pptfx = (POINTFIX*) ((BYTE*) ped->pptfx + ped->dptfx);

                if (ped->pptfx < pptfxFirst)
                    ped->pptfx = pptfxLast;
                else if (ped->pptfx > pptfxLast)
                    ped->pptfx = pptfxFirst;

                // Have to find the edge that spans yTrapezoid:

                ped->cy = ((ped->pptfx->y + 15) >> 4) - yTrapezoid;

                // With fractional coordinate end points, we may get edges
                // that don't cross any scans, in which case we try the
                // next one:

            } while (ped->cy <= 0);

            // 'pptfx' now points to the end point of the edge spanning
            // the scan 'yTrapezoid'.

            dN = ped->pptfx->y - pptfxOld->y;
            dM = ped->pptfx->x - pptfxOld->x;

            ASSERT_TGA(dN > 0, "Should be going down only");

            // Compute the DDA increment terms:

            if (dM < 0)
            {
                dM = -dM;
                if (dM < dN)                // Can't be '<='
                {
                    ped->dx       = -1;
                    ped->lErrorUp = dN - dM;
                }
                else
                {
                    QUOTIENT_REMAINDER(dM, dN, lQuotient, lRemainder);

                    ped->dx       = -lQuotient;     // - dM / dN
                    ped->lErrorUp = lRemainder;     // dM % dN
                    if (ped->lErrorUp > 0)
                    {
                        ped->dx--;
                        ped->lErrorUp = dN - ped->lErrorUp;
                    }
                }
            }
            else
            {
                if (dM < dN)                // Can't be '<='
                {
                    ped->dx       = 0;
                    ped->lErrorUp = dM;
                }
                else
                {
                    QUOTIENT_REMAINDER(dM, dN, lQuotient, lRemainder);

                    ped->dx       = lQuotient;      // dM / dN
                    ped->lErrorUp = lRemainder;     // dM % dN
                }
            }

            ped->lErrorDown = dN; // DDA limit
            ped->lError     = -1; // Error is initially zero (add dN - 1 for
                                  //  the ceiling, but subtract off dN so that
                                  //  we can check the sign instead of comparing
                                  //  to dN)

            ped->x = pptfxOld->x;
            yStart = pptfxOld->y;

            if ((yStart & 15) != 0)
            {
                // Advance to the next integer y coordinate

                for (i = 16 - (yStart & 15); i != 0; i--)
                {
                    ped->x      += ped->dx;
                    ped->lError += ped->lErrorUp;
                    if (ped->lError >= 0)
                    {
                        ped->lError -= ped->lErrorDown;
                        ped->x++;
                    }
                }
            }

            if ((ped->x & 15) != 0)
            {
                ped->lError -= ped->lErrorDown * (16 - (ped->x & 15));
                ped->x += 15;       // We'll want the ceiling in just a bit...
            }

            // Chop off those fractional bits:

            ped->x      >>= 4;
            ped->lError >>= 4;
        }
    }

    cyTrapezoid = min(aed[LEFT].cy, aed[RIGHT].cy); // # of scans in this trap
    aed[LEFT].cy  -= cyTrapezoid;
    aed[RIGHT].cy -= cyTrapezoid;
    yTrapezoid    += cyTrapezoid;                   // Top scan in next trap

    // If the left and right edges are vertical, simply output as
    // a rectangle:

    if (((aed[LEFT].lErrorUp | aed[RIGHT].lErrorUp) == 0) &&
        ((aed[LEFT].dx       | aed[RIGHT].dx) == 0) &&
        (cyTrapezoid > 1))
    {
        LONG lWidth;
        PBYTE left_edge;
        ULONG align_bytes;
        LONG y;

        left_edge = ppdev->pjFrameBuffer +
                    ((yTrapezoid - cyTrapezoid) * ppdev->lScreenStride);

        /////////////////////////////////////////////////////////////////
        // Vertical-edge special case

    ContinueVertical:

        lWidth = aed[RIGHT].x - aed[LEFT].x - 1;
        if (lWidth >= 0)
        {
            left_edge += aed[LEFT].x;
            align_bytes = (unsigned int)left_edge & 0x03;
            left_edge = left_edge - align_bytes;
            lWidth |= (align_bytes << 16);

            if (0 == ((ULONG)fill_data->pattern | (ULONG)fill_data->mask))
            {
                for (y = 0; y < cyTrapezoid; y++)
                {
                    TGAWRITE (ppdev, left_edge, lWidth);
                    left_edge += ppdev->lScreenStride;
                }
            }
            else
            {
                LONG    index;
                LONG    j, max_j;
                PBYTE   base_address;
                LONG    stride_8;

                if (8 < cyTrapezoid)
                    max_j = 8;
                else
                    max_j = cyTrapezoid;

                index = ((yTrapezoid - cyTrapezoid) - fill_data->yOffset) % 8;
                if (index < 0)
                    index += 8;

                base_address = left_edge;
                stride_8 = ppdev->lScreenStride * 8;

                if (fill_data->pattern)
                {
                    index *= 2;

                    for (j = 0; j < max_j; j++)
                    {
                        CYCLE_REGS (ppdev);

                        TGACOLOR0 (ppdev, fill_data->pattern[index]);
                        ++index;
                        TGACOLOR1 (ppdev, fill_data->pattern[index]);
                        if (++index >= 16)
                            index = 0;

                        left_edge = base_address;
                        base_address += ppdev->lScreenStride;

                        for (y = j; y < cyTrapezoid; y += 8)
                        {
                            TGAWRITE (ppdev, left_edge, lWidth);
                            left_edge += stride_8;
                        }
		    }
                }
                else
                {
                    ULONG *mask_ptr;

                    if ((ULONG)base_address & 0x04)
                        mask_ptr = fill_data->mask + 8;
                    else
                        mask_ptr = fill_data->mask;

                    for (j = 0; j < max_j; j++)
                    {
                        CYCLE_REGS (ppdev);

                        TGADATA (ppdev, mask_ptr[index]);
                        if (++index >= 8)
                            index = 0;

                        left_edge = base_address;
                        base_address += ppdev->lScreenStride;

                        for (y = j; y < cyTrapezoid; y += 8)
                        {
                            TGAWRITE (ppdev, left_edge, lWidth);
                            left_edge += stride_8;
                        }
		    }
                }
            }

        }
        else if (lWidth != -1)
        {
            LONG      lTmp;
            POINTFIX* pptfxTmp;

            SWAP(aed[LEFT].x,          aed[RIGHT].x,          lTmp);
            SWAP(aed[LEFT].cy,         aed[RIGHT].cy,         lTmp);
            SWAP(aed[LEFT].dptfx,      aed[RIGHT].dptfx,      lTmp);
            SWAP(aed[LEFT].pptfx,      aed[RIGHT].pptfx,      pptfxTmp);
            goto ContinueVertical;
        }

        goto NewTrapezoid;
    }

    while (TRUE)
    {
        LONG lWidth;
        PBYTE left_edge;
        PBYTE y_address;
        ULONG align_bytes;
        LONG    index;
        ULONG  *mask_ptr;

        /////////////////////////////////////////////////////////////////
        // Run the DDAs

        // The very first time through, make sure we set x:

        y_address = ppdev->pjFrameBuffer +
                        ((yTrapezoid - cyTrapezoid) * ppdev->lScreenStride);
        index = ((yTrapezoid - cyTrapezoid) - fill_data->yOffset) % 8;
        if (index < 0)
            index += 8;
        if (fill_data->pattern)
            index *= 2;

        lWidth = aed[RIGHT].x - aed[LEFT].x - 1;
        if (lWidth >= 0)
        {
            left_edge = y_address + aed[LEFT].x;
            align_bytes = (unsigned int)left_edge & 0x03;
            left_edge = left_edge - align_bytes;
            lWidth |= (align_bytes << 16);

            if (fill_data->pattern)
            {
                CYCLE_REGS (ppdev);

                TGACOLOR0 (ppdev, fill_data->pattern[index]);
                ++index;
                TGACOLOR1 (ppdev, fill_data->pattern[index]);
                if (++index >= 16)
                    index = 0;
            }
            else if (fill_data->mask)
            {
                CYCLE_REGS (ppdev);

                if ((ULONG)left_edge & 0x04)
                    mask_ptr = fill_data->mask + 8;
                else
                    mask_ptr = fill_data->mask;

                TGADATA (ppdev, mask_ptr[index]);
            }

            TGAWRITE (ppdev, left_edge, lWidth);
            y_address -= ppdev->lScreenStride;

    ContinueAfterZero:

            // Advance the right wall:

            aed[RIGHT].x      += aed[RIGHT].dx;
            aed[RIGHT].lError += aed[RIGHT].lErrorUp;

            if (aed[RIGHT].lError >= 0)
            {
                aed[RIGHT].lError -= aed[RIGHT].lErrorDown;
                aed[RIGHT].x++;
            }

            // Advance the left wall:

            aed[LEFT].x      += aed[LEFT].dx;
            aed[LEFT].lError += aed[LEFT].lErrorUp;

            if (aed[LEFT].lError >= 0)
            {
                aed[LEFT].lError -= aed[LEFT].lErrorDown;
                aed[LEFT].x++;
            }

            cyTrapezoid--;
            if (cyTrapezoid == 0)
                goto NewTrapezoid;
        }
        else if (lWidth == -1)
        {
            goto ContinueAfterZero;
        }
        else
        {
            // We certainly don't want to optimize for this case because we
            // should rarely get self-intersecting polygons (if we're slow,
            // the app gets what it deserves):

            LONG      lTmp;
            POINTFIX* pptfxTmp;

            SWAP(aed[LEFT].x,          aed[RIGHT].x,          lTmp);
            SWAP(aed[LEFT].dx,         aed[RIGHT].dx,         lTmp);
            SWAP(aed[LEFT].lError,     aed[RIGHT].lError,     lTmp);
            SWAP(aed[LEFT].lErrorUp,   aed[RIGHT].lErrorUp,   lTmp);
            SWAP(aed[LEFT].lErrorDown, aed[RIGHT].lErrorDown, lTmp);
            SWAP(aed[LEFT].cy,         aed[RIGHT].cy,         lTmp);
            SWAP(aed[LEFT].dptfx,      aed[RIGHT].dptfx,      lTmp);
            SWAP(aed[LEFT].pptfx,      aed[RIGHT].pptfx,      pptfxTmp);

            continue;
        }
    }
}