/******************************Module*Header*******************************\
* Module Name: bltmil.c
*
* Contains the low-level blt functions for the Millennium.
*
* Hopefully, if you're basing your display driver on this code, to
* support all of DrvBitBlt and DrvCopyBits, you'll only have to implement
* the following routines.  You shouldn't have to modify much in
* 'bitblt.c'.  I've tried to make these routines as few, modular, simple,
* and efficient as I could, while still accelerating as many calls as
* possible that would be cost-effective in terms of performance wins
* versus size and effort.
*
* Note: In the following, 'relative' coordinates refers to coordinates
*       that haven't yet had the offscreen bitmap (DFB) offset applied.
*       'Absolute' coordinates have had the offset applied.  For example,
*       we may be told to blt to (1, 1) of the bitmap, but the bitmap may
*       be sitting in offscreen memory starting at coordinate (0, 768) --
*       (1, 1) would be the 'relative' start coordinate, and (1, 769)
*       would be the 'absolute' start coordinate.
*
* Copyright (c) 1992-1996 Microsoft Corporation
* Copyright (c) 1993-1996 Matrox Electronic Systems, Ltd.
\**************************************************************************/

#include "precomp.h"

/******************************Public*Routine******************************\
* VOID vMilFillSolid
*
* Fills a list of rectangles with a solid colour using the trapezoid
* (opcode_TRAP) drawing operation.
*
* PATCOPY, WHITENESS and BLACKNESS are done in block mode.  The one
* exception is PATCOPY at 24bpp with a colour that is not a gray level,
* which has to fall back to a non-block replace.  Every other rop (a
* combination of P and D only) is done as a non-block raster operation
* whose hardware mix is derived from the rop.
*
* The rectangles are in relative coordinates; the offsets stored in the
* PDEV are applied here.  'pptlBrush' is not referenced by this routine.
*
\**************************************************************************/

VOID vMilFillSolid(
    PDEV*           ppdev,      // pdev
    LONG            c,          // Number of rectangles to be filled,
                                // can't be zero
    RECTL*          prcl,       // List of rectangles to be filled
    ULONG           rop4,       // Rop4
    RBRUSH_COLOR    rbc,        // rbc.iSolidColor is the fill colour
    POINTL*         pptlBrush)  // Pattern alignment (not used here)
{
    BYTE*   pjBase  = ppdev->pjBase;
    LONG    xOffset = ppdev->xOffset;
    LONG    yOffset = ppdev->yOffset;
    ULONG   ulDwg;
    ULONG   ulHwMix;
    ULONG   ulLowByte;
    LONG    xLeft;
    LONG    xRight;
    LONG    yTop;
    LONG    cyRect;

    // Room for DWGCTL and FCOL plus the first rectangle's FXBNDRY and
    // YDSTLEN.
    CHECK_FIFO_SPACE(pjBase, 4);

    ppdev->HopeFlags = (SIGN_CACHE | ARX_CACHE | PATTERN_CACHE);

    // Start out assuming the block-mode solid fill; the cases below that
    // can't use it replace this value.
    ulDwg = (opcode_TRAP + blockm_ON + solid_SOLID +
             arzero_ZERO + sgnzero_ZERO + shftzero_ZERO +
             bop_SRCCOPY + pattern_OFF + transc_BG_OPAQUE);

    if (rop4 != 0xf0f0)
    {
        // Not PATCOPY.  The ROP3 is a combination of P and D only, and
        // maps onto the hardware mix like this:
        //
        //      ROP3  Mga   ROP3  Mga   ROP3  Mga   ROP3  Mga
        //
        //      0x00  0     0x50  4     0xa0  8     0xf0  c
        //      0x05  1     0x55  5     0xa5  9     0xf5  d
        //      0x0a  2     0x5a  6     0xaa  a     0xfa  e
        //      0x0f  3     0x5f  7     0xaf  b     0xff  f

        ulHwMix = (rop4 & 0x03) + ((rop4 & 0x30) >> 2);

        if (ulHwMix == MGA_WHITENESS)
        {
            // All ones, written with the block-mode fill.
            rbc.iSolidColor = 0xffffffff;
        }
        else if (ulHwMix == MGA_BLACKNESS)
        {
            // All zeroes, written with the block-mode fill.
            rbc.iSolidColor = 0x00000000;
        }
        else
        {
            // A genuine raster operation: no block mode, and the mix goes
            // into the bop field of DWGCTL.
            ulDwg = (opcode_TRAP + blockm_OFF + atype_RSTR + solid_SOLID +
                     arzero_ZERO + sgnzero_ZERO + shftzero_ZERO +
                     pattern_OFF + transc_BG_OPAQUE +
                     (ulHwMix << 16));
        }
    }
    else if (ppdev->iBitmapFormat == BMF_24BPP)
    {
        // PATCOPY at 24bpp: block mode is only usable when the three
        // colour bytes are equal, i.e. the colour is a gray level.
        ulLowByte = rbc.iSolidColor & 0x000000ff;

        if ((ulLowByte == ((rbc.iSolidColor >> 8)  & 0x000000ff)) &&
            (ulLowByte == ((rbc.iSolidColor >> 16) & 0x000000ff)))
        {
            // Gray level: keep block mode, but prepare the colour by
            // shifting it up a byte and repeating the low byte.
            rbc.iSolidColor = (rbc.iSolidColor << 8) | ulLowByte;
        }
        else
        {
            // Not a gray level, so we can't use block mode.
            ulDwg = (opcode_TRAP + blockm_OFF + atype_RPL + solid_SOLID +
                     arzero_ZERO + sgnzero_ZERO + shftzero_ZERO +
                     bop_SRCCOPY + pattern_OFF + transc_BG_OPAQUE);
        }
    }

    CP_WRITE(pjBase, DWG_DWGCTL, ulDwg);
    CP_WRITE(pjBase, DWG_FCOL, COLOR_REPLICATE(ppdev, rbc.iSolidColor));

    for (;;)
    {
        // Convert the rectangle to absolute coordinates.
        xLeft  = prcl->left  + xOffset;
        xRight = prcl->right + xOffset;
        yTop   = prcl->top   + yOffset;
        cyRect = prcl->bottom - prcl->top;

        CP_WRITE(pjBase, DWG_FXBNDRY,
                        ((xRight << bfxright_SHIFT) |
                         (xLeft & bfxleft_MASK)));

        // ylength_MASK is not needed since coordinates are within range.
        // Writing YDSTLEN through CP_START kicks off the fill.

        CP_START(pjBase, DWG_YDSTLEN,
                        ((yTop << yval_SHIFT) | cyRect));

        if (--c == 0)
            break;

        // Two more FIFO entries for the next rectangle's FXBNDRY and
        // YDSTLEN.
        CHECK_FIFO_SPACE(pjBase, 2);
        prcl++;
    }
}

/******************************Public*Routine******************************\
* VOID vMilPatRealize
*
* Download the Color Brush to the Color brush cache in the Storm offscreen
* memory.  For 8, 16, and 32 bpp, we download an 8x8 brush;  a special
* routine, vPatRealize24bpp, is used for 24bpp brushes.  We'll use direct
* frame buffer access whenever possible.
*
* There are some hardware restrictions concerning the way that a pattern
* must be stored in memory:
* - the first pixel of the pattern must be stored so that the first pixel
*   address mod 256 is 0, 8, 16, or 24;
* - each line of 8 pixels is stored continuously, but there must be a
*   difference of 32 in the pixel addresses of successive pattern lines.
* This means that we will store patterns in the following way:
*
* +----+---------------+---------------+---------------+---------------+
* |    |   Pattern 0   |   Pattern 1   |   Pattern 2   |   Pattern 3   |
* |Line|               |               |1 1 1 1 1 1 1 1|1 1 1 1 1 1 1 1|
* |    |0 1 2 3 4 5 6 7|8 9 a b c d e f|0 1 2 3 4 5 6 7|8 9 a b c d e f|
* +----+---------------+---------------+---------------+---------------+
* |  0 |*   *   *   *  |        X      |      o       o|x       x      |
* |  1 |  *   *   *   *|        X      |    o       o  |  x       x    |
* |  2 |*   *   *   *  |        X      |  o       o    |    x       x  |
* |  3 |  *   *   *   *|        X      |o       o      |      x       x|
* |  4 |*   *   *   *  |X X X X X X X X|      o       o|x       x      |
* |  5 |  *   *   *   *|        X      |    o       o  |  x       x    |
* |  6 |*   *   *   *  |        X      |  o       o    |    x       x  |
* |  7 |  *   *   *   *|        X      |o       o      |      x       x|
* +----+---------------+---------------+---------------+---------------+
*
* where a given pixel address is
*  FirstPixelAddress + Line*0x20 + Pattern*0x08 + xPat.
*
\**************************************************************************/

VOID vMilPatRealize(
    PDEV*   ppdev,
    RBRUSH* prb)
{
    BYTE*       pjBase;
    BRUSHENTRY* pbe;
    LONG        iNextSlot;
    ULONG       cdPerRow;
    ULONG       cdRowStride;
    ULONG       iRow;
    ULONG       iDword;
    ULONG*      pulSrc;
    ULONG*      pulDstRow;

    pjBase = ppdev->pjBase;

    // Take the current off-screen brush cache entry for this brush, and
    // advance the cache index (wrapping back to the first entry) so that
    // the next realization uses the following slot.
    pbe       = &ppdev->pbe[ppdev->iBrushCache];
    iNextSlot = ppdev->iBrushCache + 1;
    if (iNextSlot >= ppdev->cBrushCache)
        iNextSlot = 0;

    ppdev->iBrushCache = iNextSlot;

    // Link the cache entry and the realized brush to each other, so that
    // a later fill can detect whether the entry still belongs to the brush.
    pbe->prbVerify           = prb;
    prb->apbe[IBOARD(ppdev)] = pbe;

    // Both quantities below are counts of DWORDs, because they are applied
    // to ULONG pointers.  Assuming cjHwPel is the size of a hardware pixel
    // in bytes:
    //
    //   one pattern row  = 8 pixels  = 8 * cjHwPel bytes  = 2 * cjHwPel dwords
    //   row-to-row pitch = 32 pixels = 32 * cjHwPel bytes = 8 * cjHwPel dwords
    //
    // The pitch is measured from the start of one pattern row to the start
    // of the next.
    cdPerRow    = 2 * ppdev->cjHwPel;
    cdRowStride = 8 * ppdev->cjHwPel;

    // Source is the packed pattern held in the brush realization;
    // destination is the cache entry's location.
    pulSrc    = prb->aulPattern;
    pulDstRow = (ULONG*) (pbe->pvScan0);

    START_DIRECT_ACCESS_STORM(ppdev, pjBase);

    // Copy the eight pattern rows, spacing them out by the row pitch.
    for (iRow = 0; iRow < 8; iRow++)
    {
        for (iDword = 0; iDword < cdPerRow; iDword++)
        {
            pulDstRow[iDword] = pulSrc[iDword];
        }

        pulSrc    += cdPerRow;
        pulDstRow += cdRowStride;
    }

    END_DIRECT_ACCESS_STORM(ppdev, pjBase);
}

/*****************************************************************************
 * VOID vMilFillPat
 *
 * 8, 16, and 32bpp patterned color fills for Storm.
 *
 * Fills a list of rectangles with the cached colour pattern belonging to
 * rbc.prb, realizing the pattern into the off-screen cache first if its
 * cache entry has been taken over by another brush.  The fill is done as
 * a BITBLT with pattern_ON, with the pattern as the source.
 ****************************************************************************/

VOID vMilFillPat(
    PDEV*           ppdev,
    LONG            c,          // Can't be zero
    RECTL*          prcl,       // List of rectangles to be filled, in relative
                                //   coordinates
    ULONG           rop4,       // Rop4
    RBRUSH_COLOR    rbc,        // rbc.prb points to brush realization structure
    POINTL*         pptlBrush)  // Pattern alignment
{
    BYTE*       pjBase;
    RBRUSH*     prb;
    BRUSHENTRY* pbe;
    LONG        xOffset;
    LONG        yOffset;
    LONG        lSrcAdd;
    LONG        xLeft;
    LONG        xRight;
    LONG        yTop;
    LONG        xBrush;
    LONG        yBrush;
    ULONG       ulLinear;
    ULONG       ulDwg;

    prb = rbc.prb;

    ASSERTDD(!(prb->fl & RBRUSH_2COLOR), "Can't do 2 colour brushes here");

    // If the cache entry no longer points back at this brush, another
    // brush has taken the spot (or this brush was never realized), so
    // download the pattern and pick up the entry it was given.
    pbe = prb->apbe[IBOARD(ppdev)];
    if (pbe->prbVerify != prb)
    {
        vMilPatRealize(ppdev, prb);
        pbe = prb->apbe[IBOARD(ppdev)];
    }

    pjBase  = ppdev->pjBase;
    xOffset = ppdev->xOffset;
    yOffset = ppdev->yOffset;
    lSrcAdd = ppdev->lPatSrcAdd;

    // AR5 and DWGCTL, plus AR3, AR0, FXBNDRY and YDSTLEN for the first
    // rectangle.
    CHECK_FIFO_SPACE(pjBase, 6);

    CP_WRITE(pjBase, DWG_AR5, 32);   // Source (pattern) pitch.

    ppdev->HopeFlags = SIGN_CACHE;

    if ((rop4 & 0x000000FF) != 0x000000F0)
    {
        // Some other rop: raster operation with the hardware mix taken
        // from the rop bits.
        ulDwg = (opcode_BITBLT + atype_RSTR + sgnzero_ZERO +
                 shftzero_ZERO + bltmod_BFCOL + pattern_ON +
                 transc_BG_OPAQUE +
                 (((rop4 & 0x03) + ((rop4 & 0x30) >> 2)) << 16));
    }
    else
    {
        // The rop is PATCOPY: plain replace.
        ulDwg = (opcode_BITBLT + atype_RPL + sgnzero_ZERO +
                 shftzero_ZERO + bop_SRCCOPY +
                 bltmod_BFCOL + pattern_ON +
                 transc_BG_OPAQUE);
    }

    CP_WRITE(pjBase, DWG_DWGCTL, ulDwg);

    // The pattern setup is complete; now do each rectangle.
    for (;;)
    {
        yTop   = prcl->top;
        xLeft  = prcl->left;
        xRight = prcl->right + xOffset - 1;

        // Take into account the brush origin.  The upper left pel of the
        // brush should be aligned here in the destination surface.  Each
        // pattern row is 32 pixel addresses from the previous one, hence
        // the shift by 5.
        xBrush   = (xLeft - pptlBrush->x) & 7;
        yBrush   = (yTop  - pptlBrush->y) & 7;
        ulLinear = pbe->ulLinear + (yBrush << 5) + xBrush;

        // There is a problem with Storm.  We have to program:
        //  AR3: ssa
        //  AR0: sea, where sea<18:3> = ssa<18:3> and
        //                  sea< 2:0> = ssa< 2:0> + 2 for 8bpp;
        //                  sea< 2:0> = ssa< 2:0> + 4 for 16bpp;
        //                  sea< 2:0> = ssa< 2:0> + 6 for 32bpp.
        // The per-depth addend comes from ppdev->lPatSrcAdd.

        CP_WRITE(pjBase, DWG_AR3, ulLinear);
        CP_WRITE(pjBase, DWG_AR0, ((ulLinear & 0xfffffff8) |
                                   ((ulLinear+lSrcAdd) & 7)));

        CP_WRITE(pjBase, DWG_FXBNDRY,
                    ((xRight << bfxright_SHIFT) |
                     ((xLeft + xOffset) & bfxleft_MASK)));

        // ylength_MASK is not needed since coordinates are within range.

        CP_START(pjBase, DWG_YDSTLEN,
                    (((yTop + yOffset) << yval_SHIFT) |
                     (prcl->bottom - yTop)));

        if (--c == 0)
            break;

        // AR3, AR0, FXBNDRY and YDSTLEN for the next rectangle.
        CHECK_FIFO_SPACE(pjBase, 4);
        prcl++;
    }
}

/******************************Public*Routine******************************\
* vMilXfer1bpp
*
* This routine colour expands a monochrome bitmap.
*
* The source bits are read straight from the source surface and streamed
* as dwords through the DMA window (pjBase + DMAWND) to an ILOAD operation
* in monochrome mode (bltmod_BMONOWF).  pxlo->pulXlate[1] supplies the
* foreground colour and pxlo->pulXlate[0] the background colour.
*
* Each rectangle is downloaded starting on a source dword boundary; when
* the source x is not a multiple of 32, the extra leading pixels are
* removed with the left clip register (CXLEFT), which is set back to 0
* afterwards.
*
\**************************************************************************/

VOID vMilXfer1bpp(             // Type FNXFER
    PDEV*       ppdev,
    LONG        c,          // Count of rectangles, can't be zero
    RECTL*      prcl,       // List of destination rectangles, in relative
                            //   coordinates
    ULONG       rop4,       // Foreground and background hardware mix
    SURFOBJ*    psoSrc,     // Source surface
    POINTL*     pptlSrc,    // Original unclipped source point
    RECTL*      prclDst,    // Original unclipped destination rectangle
    XLATEOBJ*   pxlo)       // Translate that provides colour-expansion information
{
    // Note: 'ul', 'cFifo', 'xAlign', 'cFullLoops' and 'cRemLoops' are
    // declared but never referenced in this routine.
    LONG    xOffset;
    LONG    yOffset;
    LONG    dx;
    LONG    dy;
    LONG    xSrc;
    LONG    ySrc;
    LONG    xDst;
    LONG    yDst;
    LONG    cxDst;
    LONG    cyDst;
    LONG    xSrcAlign;
    LONG    lSrcDelta;
    LONG    lSrcSkip;
    LONG    i;
    LONG    k;
    LONG    cdSrc;
    LONG    cdSrcPerScan;
    ULONG   FCol;
    ULONG   BCol;
    ULONG   ul;
    BYTE*   pjDma;
    ULONG*  pulXlate;
    ULONG*  pulSrc;
    ULONG*  pulDst;
    BYTE*   pjSrcScan0;
    BYTE*   pjBase;
    LONG    cFifo;
    LONG    xAlign;
    ULONG   cFullLoops;
    ULONG   cRemLoops;

    // The foreground and background halves of the rop4 must be the same.
    ASSERTDD(((rop4 & 0xff00) >> 8) == (rop4 & 0xff),
             "Expect only an opaquing rop");

    pjBase = ppdev->pjBase;
    xOffset = ppdev->xOffset;
    yOffset = ppdev->yOffset;

    dx = pptlSrc->x - prclDst->left;
    dy = pptlSrc->y - prclDst->top;     // Add to destination to get source

    pjSrcScan0 = psoSrc->pvScan0;
    lSrcDelta  = psoSrc->lDelta;

    // All of the source data is written through this window.
    pjDma = pjBase + DMAWND;

    ppdev->HopeFlags = SIGN_CACHE;

    // Get the foreground and background colors.
    pulXlate = pxlo->pulXlate;
    FCol = COLOR_REPLICATE(ppdev, pulXlate[1]);
    BCol = COLOR_REPLICATE(ppdev, pulXlate[0]);

    // DWGCTL, BCOL, FCOL and AR5 here, plus up to five writes for the first
    // rectangle (CXLEFT, AR0, AR3, FXBNDRY and YDSTLEN).
    CHECK_FIFO_SPACE(pjBase, 9);

    if (rop4 == 0x0000CCCC)     // SRCCOPY
    {
        if (ppdev->iBitmapFormat == BMF_24BPP)
        {
            // We're in 24bpp.  Block mode is only used when both colours
            // have three equal bytes.
            if (((FCol & 0x000000ff) != ((FCol >>  8) & 0x000000ff)) ||
                ((FCol & 0x000000ff) != ((FCol >> 16) & 0x000000ff)) ||
                ((BCol & 0x000000ff) != ((BCol >>  8) & 0x000000ff)) ||
                ((BCol & 0x000000ff) != ((BCol >> 16) & 0x000000ff)))
            {
                // Colors are not gray levels.
                CP_WRITE(pjBase, DWG_DWGCTL, (opcode_ILOAD + atype_RPL +
                                           sgnzero_ZERO + shftzero_ZERO +
                                           bop_SRCCOPY + bltmod_BMONOWF));
            }
            else
            {
                // Colors are gray levels.  Prepare them for block mode.
                CP_WRITE(pjBase, DWG_DWGCTL, (opcode_ILOAD + blockm_ON +
                                           sgnzero_ZERO + shftzero_ZERO +
                                           bop_SRCCOPY + bltmod_BMONOWF));
                BCol = (BCol << 8) | (BCol & 0x000000ff);
                FCol = (FCol << 8) | (FCol & 0x000000ff);
            }
        }
        else
        {
            // We're not in 24bpp.
            CP_WRITE(pjBase, DWG_DWGCTL, (opcode_ILOAD + blockm_ON +
                                       sgnzero_ZERO + shftzero_ZERO +
                                       bop_SRCCOPY + bltmod_BMONOWF));
        }
    }
    else
    {
        // Any other rop: raster operation, with the low four bits of the
        // rop used as the hardware mix.
        CP_WRITE(pjBase, DWG_DWGCTL, (opcode_ILOAD + atype_RSTR +
                                   sgnzero_ZERO + shftzero_ZERO +
                                   ((rop4 & 0xf) << 16) +
                                   bltmod_BMONOWF));
    }

    CP_WRITE(pjBase, DWG_BCOL, BCol);
    CP_WRITE(pjBase, DWG_FCOL, FCol);

    CP_WRITE(pjBase, DWG_AR5, 0);

    while (TRUE)
    {
        // Extents.
        cxDst = prcl->right - prcl->left;
        cyDst = prcl->bottom - prcl->top;

        // Starting (x,y) on screen.
        xDst  = prcl->left + xOffset;
        yDst  = prcl->top  + yOffset;

        // Starting (x,y) within the source bitmap.
        ySrc  = prcl->top + dy;
        xSrc  = prcl->left + dx;

        // Since SSA (AR3) is always zero, we may have to clip the expanded
        // ILOAD using CXLEFT, and we'll have to modify FXLEFT accordingly.
        // xSrcAlign is the number of unwanted pixels at the start of the
        // first source dword.
        xSrcAlign = xSrc & 0x1F;
        if (xSrcAlign)
        {
            // We'll have to use clipping.
            CP_WRITE(pjBase, DWG_CXLEFT, xDst);
        }

        // Number of pixels per line, including the clipped leading pixels.
        CP_WRITE(pjBase, DWG_AR0, (cxDst - 1 + xSrcAlign));
        CP_WRITE(pjBase, DWG_AR3, 0);
        CP_WRITE(pjBase, DWG_FXBNDRY, (((xDst + cxDst - 1) << bfxright_SHIFT) |
                                    ((xDst - xSrcAlign) & bfxleft_MASK)));

        // ylength_MASK is not needed since coordinates are within range

        CP_START(pjBase, DWG_YDSTLEN, ((yDst << yval_SHIFT) | cyDst));

        // Calculate the location of the source rectangle.  This points to the
        // first dword to be downloaded.  It is aligned on a dword boundary.
        // The first bit of interest in the first dword is at (xSrc & 0x1f).
        pulSrc = (ULONG*)(pjSrcScan0 +
                            (ySrc * lSrcDelta) +
                            ((xSrc & 0xFFFFFFE0) >> 3));

        CHECK_FIFO_SPACE(pjBase, FIFOSIZE);
        BLT_WRITE_ON(ppdev, pjBase);

        // Number of bytes, padded to the next dword, to be moved per
        // scanline.  Since we align the starting dword on a dword boundary,
        // we know that we cannot overflow the end of the bitmap.
        cdSrc = ((xSrcAlign + cxDst + 0x1F) & 0xFFFFFFE0) >> 3;

        // Bytes to step over after each downloaded scan to reach the start
        // of the next one.
        lSrcSkip = lSrcDelta - cdSrc;

        if (lSrcSkip == 0)
        {
            // There is no line-to-line increment, we can go full speed.

            // Total number of dwords to be sent.
            cdSrc = cyDst * (cdSrc >> 2);

            // Send full FIFOSIZE batches while more than FIFOSIZE dwords
            // remain.  Each batch starts again at the beginning of the DMA
            // window.
            while ((cdSrc -= FIFOSIZE) > 0)
            {
                pulDst = (ULONG*)pjDma;

                CHECK_FIFO_SPACE(pjBase, FIFOSIZE);

                for (i = FIFOSIZE; i != 0; i--)
                {
                    CP_WRITE_DMA(ppdev, pulDst++, *pulSrc++);
                }
            }

            // Undo the last subtraction to get the size of the final
            // batch, which is at most FIFOSIZE dwords.
            pulDst = (ULONG*)pjDma;
            cdSrc += FIFOSIZE;

            CHECK_FIFO_SPACE(pjBase, cdSrc);

            for (i = cdSrc; i != 0; i--)
            {
                CP_WRITE_DMA(ppdev, pulDst++, *pulSrc++);
            }
        }
        else
        {
            // We can't go full speed.
            // Number of full dwords to be moved on each scan.  We know that
            // we won't overflow the end of the bitmap with this.
            cdSrc >>= 2;
            cdSrcPerScan = cdSrc;

            for (k = cyDst; k != 0; k--)
            {
                // Here the DMA window pointer is reset once per scan, not
                // once per batch; it keeps advancing across the batches of
                // a single scan.
                pulDst = (ULONG*)pjDma;
                cdSrc = cdSrcPerScan;

                while ((cdSrc -= FIFOSIZE) > 0)
                {
                    CHECK_FIFO_SPACE(pjBase, FIFOSIZE);

                    for (i = FIFOSIZE; i != 0; i--)
                    {
                        CP_WRITE_DMA(ppdev, pulDst++, *pulSrc++);
                    }
                }

                // Final batch of this scan, at most FIFOSIZE dwords.
                cdSrc += FIFOSIZE;

                CHECK_FIFO_SPACE(pjBase, cdSrc);

                for (i = cdSrc; i != 0; i--)
                {
                    CP_WRITE_DMA(ppdev, pulDst++, *pulSrc++);
                }

                // We're done with the current scan, go to the next one.
                pulSrc = (ULONG*) ((BYTE*) pulSrc + lSrcSkip);
            }
        }

        BLT_WRITE_OFF(ppdev, pjBase);

        if (xSrcAlign)
        {
            // Restore the clipping:

            CHECK_FIFO_SPACE(pjBase, 1);
            CP_WRITE(pjBase, DWG_CXLEFT, 0);
        }
        if (--c == 0)
            break;

        prcl++;

        // Up to five writes for the next rectangle: CXLEFT, AR0, AR3,
        // FXBNDRY and YDSTLEN.
        CHECK_FIFO_SPACE(pjBase, 5);
    }
}

/******************************Public*Routine******************************\
* LONG lSplitRcl
*
* WRAM-WRAM blts can't span banks, and this routine does the tough work
* of figuring out how much of the blt can be done via WRAM-WRAM in one bank,
* then a regular blt over the bank boundary, and again WRAM-WRAM in the
* next bank.
*
* Parameters:
*
*   arclDst   - In: arclDst[0] is the destination rectangle to split.
*               Out: the resulting pieces, in order.  The array is split
*               in place, each break adding at most two rectangles, so
*               it must have room for 1 + (2 * cyBreak) entries.
*   ayBreak   - Scans at which the blt has to be broken.
*   cyBreak   - Number of entries in ayBreak; may be zero.
*   dy        - Added to a destination scan to get its source scan.
*   flDirCode - Drawing direction; not referenced by this routine.
*   aiCmd     - Out: one entry per returned rectangle; 0 means the piece
*               can be done with the fast blt, 1 means it needs the
*               regular blt.
*
* Returns the number of rectangles left in arclDst (and entries in aiCmd).
* With no breaks, the rectangle is returned untouched as one fast piece.
*
* For each break that falls inside the combined span of the source and
* destination scans, the rectangle is cut into up to three pieces: the
* scans above (break - dy), the scans from (break - dy) through the break,
* and the scans below the break.  Only the middle piece is marked as
* needing the regular blt.  The last piece is then split again against the
* next break, unless it was the regular-blt piece.
*
\**************************************************************************/

LONG lSplitRcl(
RECTL   *arclDst,
LONG    *ayBreak,
LONG    cyBreak,
LONG    dy,
ULONG   flDirCode,
LONG    *aiCmd)
{
    LONG    iBreak = 0;
    LONG    iSrc = 0;
    LONG    iDst = 0;
    RECTL   rcl;
    LONG    lBoundsTop;
    LONG    lBoundsBottom;

    // Command of the most recently emitted rectangle.  This has to start
    // out as 0, matching aiCmd[0] below: if the very first break misses
    // the rectangle, nothing in the loop assigns it before it is tested
    // at 'next_break', and it would otherwise be read uninitialized.
    LONG    iCmdLast = 0;

    ///////////////////////////////////////////////////////////////////////////////
    // See [WRN] comment below before changing this macro.  This macro is
    // particular to this function.

    #define NON_EMPTY_RECT(rcl) ((rcl.right > rcl.left) && (rcl.bottom > rcl.top))

    aiCmd[0] = 0;

    if (cyBreak == 0)
    {
        // Nothing to split against: one rectangle, done the fast way.
        return 1;
    }

    while (TRUE)
    {
        // Work from a copy, because arclDst[iSrc] is overwritten by the
        // first piece written below (iDst == iSrc at this point).
        rcl = arclDst[iSrc];

        // Find the bounding scans of the union of the source and destination.

        lBoundsTop = min(rcl.top, rcl.top + dy);
        lBoundsBottom = max(rcl.bottom, rcl.bottom + dy);

        if ((ayBreak[iBreak] < lBoundsTop) ||
            (ayBreak[iBreak] >= lBoundsBottom))
        {
            // This break doesn't touch the blt.  Keep the rectangle, and
            // its existing command, exactly as they are.
            iDst++;
            goto next_break;
        }

        // [WRN]  For the following, bottom could be less than top and
        //        right could be less than left.  These should be considered
        //        empty rectangles, and the macro above reflects this.

        // Piece 1: the scans above (break - dy).  Fast blt.
        arclDst[iDst].left     = rcl.left;
        arclDst[iDst].right    = rcl.right;
        arclDst[iDst].top      = rcl.top;
        arclDst[iDst].bottom   = min(rcl.bottom, (ayBreak[iBreak] - dy));
        if (NON_EMPTY_RECT(arclDst[iDst]))
        {
            aiCmd[iDst++] = 0;
            iCmdLast = 0;
        }

        // Piece 2: the scans from (break - dy) through the break itself.
        // Regular blt.
        arclDst[iDst].left     = rcl.left;
        arclDst[iDst].right    = rcl.right;
        arclDst[iDst].top      = max(rcl.top, (ayBreak[iBreak] - dy));
        arclDst[iDst].bottom   = min(rcl.bottom, (ayBreak[iBreak] + 1));
        if (NON_EMPTY_RECT(arclDst[iDst]))
        {
            aiCmd[iDst++] = 1;
            iCmdLast = 1;
        }

        // Piece 3: the scans below the break.  Fast blt.
        arclDst[iDst].left     = rcl.left;
        arclDst[iDst].right    = rcl.right;
        arclDst[iDst].top      = max(rcl.top, (ayBreak[iBreak] + 1));
        arclDst[iDst].bottom   = rcl.bottom;
        if (NON_EMPTY_RECT(arclDst[iDst]))
        {
            aiCmd[iDst++] = 0;
            iCmdLast = 0;
        }

next_break:

        if ((--cyBreak == 0) ||
            (iCmdLast == 1))
        {
            // If we have run out of breaks, we're done.
            // Once the last rectangle is marked slow, it stays slow.

            break;
        }

        // Split the last rectangle produced so far against the next break.
        iSrc = --iDst;
        iBreak++;
    }

    return iDst;
}

/******************************Public*Routine******************************\
* VOID vMilCopyBlt
*
* Does a screen-to-screen blt of a list of rectangles.
*
* When the source and destination overlap, the copy direction is chosen
* from the relative position of the two.  A SRCCOPY that moves straight
* up (dx == 0, dy > 0) takes the fast path: each rectangle is split with
* lSplitRcl and every piece is done with either the fast blt (FBITBLT)
* or the regular one.  Everything else goes through the regular blt.
*
\**************************************************************************/

VOID vMilCopyBlt(   // Type FNCOPY
PDEV*   ppdev,
LONG    c,          // Can't be zero
RECTL*  prcl,       // Array of relative coordinates destination rectangles
ULONG   rop4,       // Rop4
POINTL* pptlSrc,    // Original unclipped source point
RECTL*  prclDst)    // Original unclipped destination rectangle
{
    BYTE*   pjBase       = ppdev->pjBase;
    LONG    xOffset      = ppdev->xOffset;
    LONG    yOffset      = ppdev->yOffset;
    LONG    cjPelSize    = ppdev->cjPelSize;
    LONG    dx           = pptlSrc->x - prclDst->left;
    LONG    dy           = pptlSrc->y - prclDst->top;   // Add to destination
                                                        //   to get source
    FLONG   flDirCode    = DRAWING_DIR_TBLR;
    LONG    lSignedPitch = ppdev->cxMemory;
    ULONG   ulDwg;
    ULONG   ulDwgFast    = 0;       // Stays zero unless the fast path applies
    LONG    yDst;
    LONG    ySrc;
    LONG    xSrc;
    LONG    cyRect;
    LONG    lSignedWidth;
    LONG    lSrcStart;

    // If the destination and source rectangles overlap, the accelerator
    // has to be told in which direction to copy.

    if (OVERLAP(prclDst, pptlSrc))
    {
        if (prclDst->left > pptlSrc->x)
        {
            flDirCode |= scanleft_RIGHT_TO_LEFT;
        }
        if (prclDst->top > pptlSrc->y)
        {
            // Going bottom to top means stepping backwards through memory.
            flDirCode |= sdy_BOTTOM_TO_TOP;
            lSignedPitch = -lSignedPitch;
        }
    }

    if (rop4 != 0xcccc)
    {
        // Not SRCCOPY: raster operation with the low four bits of the rop
        // as the hardware mix.
        ulDwg = opcode_BITBLT + atype_RSTR + blockm_OFF + bltmod_BFCOL +
                pattern_OFF + transc_BG_OPAQUE + ((rop4 & 0xf) << 16);
    }
    else
    {
        ulDwg = opcode_BITBLT   | atype_RPL     | blockm_OFF        |
                bltmod_BFCOL    | pattern_OFF   | transc_BG_OPAQUE  |
                bop_SRCCOPY     | shftzero_ZERO | sgnzero_NO_ZERO;

        if ((dy > 0) && (dx == 0))
        {
            // We enable fast WRAM to WRAM blts only for upward scrolls.
            // We could enable it for more blts, but it has stringent
            // alignment requirements which aren't likely to be met unless
            // it's a vertical scroll.

            ulDwgFast = opcode_FBITBLT  | atype_RPL     | blockm_OFF        |
                        bltmod_BFCOL    | pattern_OFF   | transc_BG_OPAQUE  |
                        bop_NOP         | shftzero_ZERO | sgnzero_NO_ZERO;
        }
    }

    // The SRC0 to SRC3 registers are probably trashed by the blt, and we
    // may be using a different SGN, so nothing cached can be relied on.

    ppdev->HopeFlags = 0;

    // SGN and AR5, plus up to six writes for the first rectangle.
    CHECK_FIFO_SPACE(pjBase, 8);

    CP_WRITE(pjBase, DWG_SGN, flDirCode);
    CP_WRITE(pjBase, DWG_AR5, lSignedPitch);

    // If the overhead for setting up the fast blt is too high, then we should
    // have a minimum size for prclDst.

    if (ulDwgFast)
    {
        RECTL   arclPiece[1+(MAX_WRAM_BARRIERS*2)];
        LONG    aiCmd[1+(MAX_WRAM_BARRIERS*2)];
        ULONG   aulCmd[2];
        LONG*   ayBreak;
        LONG    cyBreak;
        RECTL*  prclPiece;
        LONG    cPieces;
        LONG    iPiece;
        LONG    xRight;

        // aiCmd[] entries index this table: 0 is the fast blt, 1 the
        // regular one.
        aulCmd[0] = ulDwgFast;
        aulCmd[1] = ulDwg;

        ayBreak = ppdev->ayBreak;
        cyBreak = ppdev->cyBreak;

        for (;;)
        {
            // Split the rectangle at each ayBreak[i].  lSplitRcl works in
            // place on the array and says, for each piece, which of the
            // two commands to use.

            arclPiece[0] = *prcl;
            cPieces = lSplitRcl(arclPiece, ayBreak, cyBreak, dy, flDirCode,
                                aiCmd);
            iPiece = 0;

            for (;;)
            {
                prclPiece = &arclPiece[iPiece];

                ASSERTDD((aiCmd[iPiece] & ~1) == 0, "Only bit 0 of aiCmd[i] should be set.");
                CP_WRITE(pjBase, DWG_DWGCTL, aulCmd[aiCmd[iPiece]]);

                xRight = prclPiece->right + xOffset - 1;

                ////////////////////////////////////////////////////////////////
                // The following code is a bugfix for the fast WRAM copies
                // Extend the right edge to a specific value and then
                // clip to the actual desired edge.

                CP_WRITE(pjBase, DWG_CXRIGHT, xRight);

                switch (cjPelSize)
                {
                case 1:
                    xRight |= 0x40;
                    break;

                case 2:
                    xRight |= 0x20;
                    break;

                case 4:
                    xRight |= 0x10;
                    break;

                case 3:
                    xRight = (((xRight * 3) + 2) | 0x40) / 3;
                    break;
                }
                ////////////////////////////////////////////////////////////////

                CP_WRITE(pjBase, DWG_FXBNDRY,
                                ((xRight << bfxright_SHIFT) |
                                 ((prclPiece->left + xOffset) & bfxleft_MASK)));

                cyRect = prclPiece->bottom - prclPiece->top;
                yDst   = yOffset + prclPiece->top;
                ySrc   = yOffset + prclPiece->top + dy;

                // ylength_MASK is not needed since coordinates are within
                // range

                CP_WRITE(pjBase, DWG_YDSTLEN,
                                ((yDst << yval_SHIFT) | cyRect));

                xSrc         = xOffset + prclPiece->left + dx;
                lSignedWidth = prclPiece->right - prclPiece->left - 1;

                // Linear address of the first source pel; AR0 gets the
                // address of the last pel on the first source scan.
                lSrcStart = ppdev->ulYDstOrg + (ySrc * ppdev->cxMemory) + xSrc;
                CP_WRITE(pjBase, DWG_AR3, lSrcStart);
                CP_START(pjBase, DWG_AR0, lSrcStart + lSignedWidth);

                if (--cPieces == 0)
                    break;

                iPiece++;

                // DWGCTL, CXRIGHT, FXBNDRY, YDSTLEN, AR3 and AR0 for the
                // next piece.
                CHECK_FIFO_SPACE(pjBase, 6);
            }

            if (--c == 0)
                break;

            prcl++;
            CHECK_FIFO_SPACE(pjBase, 6);
        }

        // Restore the clipping:

        CHECK_FIFO_SPACE(pjBase, 1);
        CP_WRITE(pjBase, DWG_CXRIGHT, (ppdev->cxMemory - 1));
    }
    else
    {
        // Regular blt: the same command is used for every rectangle.
        CP_WRITE(pjBase, DWG_DWGCTL, ulDwg);

        for (;;)
        {
            cyRect       = prcl->bottom - prcl->top;
            yDst         = yOffset + prcl->top;
            ySrc         = yOffset + prcl->top + dy;
            xSrc         = xOffset + prcl->left + dx;
            lSignedWidth = prcl->right - prcl->left - 1;

            if (flDirCode & sdy_BOTTOM_TO_TOP)
            {
                // Start on the last scan of both rectangles.
                yDst += cyRect - 1;
                ySrc += cyRect - 1;
            }

            if (flDirCode & scanleft_RIGHT_TO_LEFT)
            {
                // Start on the rightmost pel and run backwards.
                xSrc += lSignedWidth;
                lSignedWidth = -lSignedWidth;
            }

            CP_WRITE(pjBase, DWG_FXBNDRY,
                            (((prcl->right + xOffset - 1) << bfxright_SHIFT) |
                             ((prcl->left  + xOffset) & bfxleft_MASK)));

            // ylength_MASK is not needed since coordinates are within range

            CP_WRITE(pjBase, DWG_YDSTLEN,
                            ((yDst << yval_SHIFT) | cyRect));

            lSrcStart = ppdev->ulYDstOrg + (ySrc * ppdev->cxMemory) + xSrc;
            CP_WRITE(pjBase, DWG_AR3, lSrcStart);
            CP_START(pjBase, DWG_AR0, lSrcStart + lSignedWidth);

            if (--c == 0)
                break;

            prcl++;

            // FXBNDRY, YDSTLEN, AR3 and AR0 for the next rectangle.
            CHECK_FIFO_SPACE(pjBase, 4);
        }
    }
}