/******************************Module*Header*******************************\
* Module Name: bltm32.c
*
* Contains the low-level memory-mapped I/O blt functions for the Mach32.
*
* Hopefully, if you're basing your display driver on this code, to
* support all of DrvBitBlt and DrvCopyBits, you'll only have to implement
* the following routines.  You shouldn't have to modify much in
* 'bitblt.c'.  I've tried to make these routines as few, modular, simple,
* and efficient as I could, while still accelerating as many calls as
* possible that would be cost-effective in terms of performance wins
* versus size and effort.
*
* Note: In the following, 'relative' coordinates refers to coordinates
*       that haven't yet had the offscreen bitmap (DFB) offset applied.
*       'Absolute' coordinates have had the offset applied.  For example,
*       we may be told to blt to (1, 1) of the bitmap, but the bitmap may
*       be sitting in offscreen memory starting at coordinate (0, 768) --
*       (1, 1) would be the 'relative' start coordinate, and (1, 769)
*       would be the 'absolute' start coordinate'.
*
* Copyright (c) 1992-1995 Microsoft Corporation
*
\**************************************************************************/

#include "precomp.h"

/******************************Public*Routine******************************\
* VOID vM32FillSolid
*
* Fills a list of rectangles with a solid colour.
*
\**************************************************************************/

VOID vM32FillSolid(             // Type FNFILL
PDEV*           ppdev,
LONG            c,              // Can't be zero
RECTL*          prcl,           // List of rectangles to be filled, in relative
                                //   coordinates
ULONG           rop4,           // rop4
RBRUSH_COLOR    rbc,            // Drawing colour is rbc.iSolidColor
POINTL*         pptlBrush)      // Not used
{
    BYTE*   pjMmBase;
    LONG    xOffset;
    LONG    yOffset;
    LONG    x;

    ASSERTDD(c > 0, "Can't handle zero rectangles");

    pjMmBase = ppdev->pjMmBase;
    xOffset  = ppdev->xOffset;
    yOffset  = ppdev->yOffset;

    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 8);

    M32_OW(pjMmBase, FRGD_COLOR, rbc.iSolidColor);
    M32_OW(pjMmBase, ALU_FG_FN,  gaul32HwMixFromRop2[(rop4 >> 2) & 0xf]);
    M32_OW(pjMmBase, DP_CONFIG,  FG_COLOR_SRC_FG | WRITE | DRAW);

    while (TRUE)
    {
        x = xOffset + prcl->left;
        M32_OW(pjMmBase, CUR_X,        x);
        M32_OW(pjMmBase, DEST_X_START, x);
        M32_OW(pjMmBase, DEST_X_END,   xOffset + prcl->right);
        M32_OW(pjMmBase, CUR_Y,        yOffset + prcl->top);

        vM32QuietDown(ppdev, pjMmBase);

        M32_OW(pjMmBase, DEST_Y_END,   yOffset + prcl->bottom);

        if (--c == 0)
            return;

        prcl++;
        M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 5);
    }
}

/******************************Public*Routine******************************\
* VOID vM32FillPatMonochrome
*
* This routine uses the pattern hardware to draw a monochrome patterned
* list of rectangles.
*
* See Blt_DS_P8x8_ENG_IO_66_D0 and Blt_DS_P8x8_ENG_IO_66_D1.
*
\**************************************************************************/

VOID vM32FillPatMonochrome(     // Type FNFILL
PDEV*           ppdev,
LONG            c,              // Can't be zero
RECTL*          prcl,           // List of rectangles to be filled, in relative
                                //   coordinates
ULONG           rop4,           // rop4
RBRUSH_COLOR    rbc,            // rbc.prb points to brush realization structure
POINTL*         pptlBrush)      // Pattern alignment
{
    BYTE*   pjMmBase;
    LONG    xOffset;
    LONG    yOffset;
    ULONG   ulHwForeMix;
    BYTE*   pjSrc;
    BYTE*   pjDst;
    LONG    xPattern;
    LONG    yPattern;
    LONG    xOld;
    LONG    yOld;
    LONG    iLeftShift;
    LONG    iRightShift;
    LONG    i;
    BYTE    j;
    LONG    xLeft;
    ULONG   aulTmp[2];
    WORD*   pwPattern;

    ASSERTDD(ppdev->iAsic == ASIC_68800_6 || ppdev->iAsic == ASIC_68800AX,
             "Wrong ASIC type for monochrome 8x8 patterns");

    pjMmBase = ppdev->pjMmBase;
    xOffset  = ppdev->xOffset;
    yOffset  = ppdev->yOffset;

    xPattern = (pptlBrush->x + xOffset) & 7;
    yPattern = (pptlBrush->y + yOffset) & 7;

    // If the alignment isn't correct, we'll have to change it:

    if ((xPattern != rbc.prb->ptlBrush.x) || (yPattern != rbc.prb->ptlBrush.y))
    {
        // Remember that we've changed the alignment on our cached brush:

        xOld = rbc.prb->ptlBrush.x;
        yOld = rbc.prb->ptlBrush.y;

        rbc.prb->ptlBrush.x = xPattern;
        rbc.prb->ptlBrush.y = yPattern;

        // Now do the alignment:

        yPattern    = (yOld - yPattern);
        iRightShift = (xPattern - xOld) & 7;
        iLeftShift  = 8 - iRightShift;

        pjSrc = (BYTE*) &rbc.prb->aulPattern[0];
        pjDst = (BYTE*) &aulTmp[0];

        for (i = 0; i < 8; i++)
        {
            j = *(pjSrc + (yPattern++ & 7));
            *pjDst++ = (j << iLeftShift) | (j >> iRightShift);
        }

        rbc.prb->aulPattern[0] = aulTmp[0];
        rbc.prb->aulPattern[1] = aulTmp[1];
    }

    ulHwForeMix = gaul32HwMixFromRop2[(rop4 >> 2) & 0xf];

    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 16);
    M32_OW(pjMmBase, DP_CONFIG,   FG_COLOR_SRC_FG | EXT_MONO_SRC_PATT | DRAW |
                                  WRITE);
    M32_OW(pjMmBase, ALU_FG_FN,   ulHwForeMix);
    M32_OW(pjMmBase, ALU_BG_FN,   ((rop4 & 0xff00) == 0xaa00) ? LEAVE_ALONE
                                                              : ulHwForeMix);

    M32_OW(pjMmBase, FRGD_COLOR,      rbc.prb->ulForeColor);
    M32_OW(pjMmBase, BKGD_COLOR,      rbc.prb->ulBackColor);
    M32_OW(pjMmBase, PATT_LENGTH,     128);
    M32_OW(pjMmBase, PATT_DATA_INDEX, 16);

    pwPattern = (WORD*) &rbc.prb->aulPattern[0];
    M32_OW(pjMmBase, PATT_DATA, *(pwPattern));
    M32_OW(pjMmBase, PATT_DATA, *(pwPattern + 1));
    M32_OW(pjMmBase, PATT_DATA, *(pwPattern + 2));
    M32_OW(pjMmBase, PATT_DATA, *(pwPattern + 3));

    while(TRUE)
    {
        xLeft = xOffset + prcl->left;
        M32_OW(pjMmBase, CUR_X,        xLeft);
        M32_OW(pjMmBase, DEST_X_START, xLeft);
        M32_OW(pjMmBase, DEST_X_END,   xOffset + prcl->right);
        M32_OW(pjMmBase, CUR_Y,        yOffset + prcl->top);
        M32_OW(pjMmBase, DEST_Y_END,   yOffset + prcl->bottom);

        if (--c == 0)
            break;

        prcl++;
        M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 5);
    }
}

/******************************Public*Routine******************************\
* VOID vM32FillPatColor
*
* This routine uses the pattern hardware to draw a colour patterned list of
* rectangles.
*
* See Blt_DS_PCOL_ENG_IO_F0_D0 and Blt_DS_PCOL_ENG_IO_F0_D1.
*
\**************************************************************************/

VOID vM32FillPatColor(          // Type FNFILL
PDEV*           ppdev,
LONG            c,              // Can't be zero
RECTL*          prcl,           // List of rectangles to be filled, in relative
                                //   coordinates
ULONG           rop4,           // rop4
RBRUSH_COLOR    rbc,            // rbc.prb points to brush realization structure
POINTL*         pptlBrush)      // Pattern alignment
{
    BYTE*   pjMmBase;
    LONG    xOffset;
    LONG    yOffset;
    ULONG   ulHwMix;
    LONG    xLeft;
    LONG    xRight;
    LONG    yTop;
    LONG    cy;
    LONG    cyVenetian;
    LONG    cyRoll;
    WORD*   pwPattern;
    LONG    xPattern;
    LONG    yPattern;

    ASSERTDD(ppdev->iBitmapFormat == BMF_8BPP,
             "Colour patterns work only at 8bpp");

    pjMmBase = ppdev->pjMmBase;
    xOffset  = ppdev->xOffset;
    yOffset  = ppdev->yOffset;

    ulHwMix = gaul32HwMixFromRop2[(rop4 >> 2) & 0xf];

    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 9);
    M32_OW(pjMmBase, ALU_FG_FN,    ulHwMix);
    M32_OW(pjMmBase, SRC_Y_DIR,    1);
    M32_OW(pjMmBase, PATT_LENGTH,  7);          // 8 pixel wide pattern

    while (TRUE)
    {
        xLeft  = xOffset + prcl->left;
        xRight = xOffset + prcl->right;
        yTop   = yOffset + prcl->top;
        cy     = prcl->bottom - prcl->top;

        xPattern = (xLeft - pptlBrush->x - xOffset) & 7;
        yPattern = (yTop  - pptlBrush->y - yOffset) & 7;

        if (ulHwMix == OVERPAINT)
        {
            cyVenetian = min(cy, 8);
            cyRoll     = cy - cyVenetian;
        }
        else
        {
            cyVenetian = cy;
            cyRoll     = 0;
        }

        M32_OW(pjMmBase, DP_CONFIG,    FG_COLOR_SRC_PATT | DATA_WIDTH | DRAW | WRITE);
        M32_OW(pjMmBase, PATT_INDEX,   xPattern);
        M32_OW(pjMmBase, DEST_X_START, xLeft);
        M32_OW(pjMmBase, CUR_X,        xLeft);
        M32_OW(pjMmBase, DEST_X_END,   xRight);
        M32_OW(pjMmBase, CUR_Y,        yTop);

        do {
            // Each scan of the pattern is eight bytes:

            pwPattern = (WORD*) ((BYTE*) &rbc.prb->aulPattern[0]
                      + (yPattern << 3));
            yPattern  = (yPattern + 1) & 7;

            M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 6);
            M32_OW(pjMmBase, PATT_DATA_INDEX, 0);   // Reset index for download
            M32_OW(pjMmBase, PATT_DATA,  *(pwPattern));
            M32_OW(pjMmBase, PATT_DATA,  *(pwPattern + 1));
            M32_OW(pjMmBase, PATT_DATA,  *(pwPattern + 2));
            M32_OW(pjMmBase, PATT_DATA,  *(pwPattern + 3));
            yTop++;

            vM32QuietDown(ppdev, pjMmBase);

            M32_OW(pjMmBase, DEST_Y_END, yTop);

        } while (--cyVenetian != 0);

        if (cyRoll != 0)
        {
            // When the ROP is PATCOPY, we can take advantage of the fact
            // that we've just laid down an entire row of the pattern, and
            // can do a 'rolling' screen-to-screen blt to draw the rest:

            M32_CHECK_FIFO_SPACE(ppdev, pjMmBase,    7);
            M32_OW(pjMmBase, DP_CONFIG,       FG_COLOR_SRC_BLIT | DATA_WIDTH |
                                              DRAW | WRITE);
            M32_OW(pjMmBase, M32_SRC_X,       xLeft);
            M32_OW(pjMmBase, M32_SRC_X_START, xLeft);
            M32_OW(pjMmBase, M32_SRC_X_END,   xRight);
            M32_OW(pjMmBase, M32_SRC_Y,       yTop - 8);
            M32_OW(pjMmBase, CUR_Y,           yTop);

            vM32QuietDown(ppdev, pjMmBase);

            M32_OW(pjMmBase, DEST_Y_END,      yTop + cyRoll);
        }

        if (--c == 0)
            break;

        prcl++;
        M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 6);
    }
}

/******************************Public*Routine******************************\
* VOID vM32Xfer1bpp
*
* This routine colour expands a monochrome bitmap, possibly with different
* Rop2's for the foreground and background.  It will be called in the
* following cases:
*
* 1) To colour-expand the monochrome text buffer for the vFastText routine.
* 2) To blt a 1bpp source with a simple Rop2 between the source and
*    destination.
* 3) To blt a true Rop3 when the source is a 1bpp bitmap that expands to
*    white and black, and the pattern is a solid colour.
* 4) To handle a true Rop4 that works out to be Rop2's between the pattern
*    and destination.
*
* Needless to say, making this routine fast can leverage a lot of
* performance.
*
\**************************************************************************/

VOID vM32Xfer1bpp(      // Type FNXFER
PDEV*       ppdev,
LONG        c,          // Count of rectangles, can't be zero
RECTL*      prcl,       // List of destination rectangles, in relative
                        //   coordinates
ROP4        rop4,       // rop4
SURFOBJ*    psoSrc,     // Source surface
POINTL*     pptlSrc,    // Original unclipped source point
RECTL*      prclDst,    // Original unclipped destination rectangle
XLATEOBJ*   pxlo)       // Translate that provides colour-expansion information
{
    BYTE*   pjMmBase;
    LONG    xOffset;
    LONG    yOffset;
    ULONG*  pulXlate;
    ULONG   ulHwForeMix;
    LONG    dx;
    LONG    dy;
    LONG    lSrcDelta;
    BYTE*   pjSrcScan0;
    LONG    xLeft;
    LONG    xRight;
    LONG    yTop;
    LONG    cy;
    LONG    cx;
    LONG    xBias;
    LONG    culScan;
    LONG    lSrcSkip;
    ULONG*  pulSrc;
    LONG    i;
    ULONG   ulFifo;

    ASSERTDD(c > 0, "Can't handle zero rectangles");
    ASSERTDD(((rop4 & 0xff00) >> 8) == (rop4 & 0xff),
             "Expect only a rop2");

    pjMmBase = ppdev->pjMmBase;
    xOffset  = ppdev->xOffset;
    yOffset  = ppdev->yOffset;
    ulFifo   = 0;

    ulHwForeMix = gaul32HwMixFromRop2[rop4 & 0xf];
    pulXlate    = pxlo->pulXlate;
    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 12);
    M32_OW(pjMmBase, DP_CONFIG, (WORD)(FG_COLOR_SRC_FG | BG_COLOR_SRC_BG | BIT16 |
                            EXT_MONO_SRC_HOST | DRAW | WRITE | LSB_FIRST) );
    M32_OW(pjMmBase, ALU_FG_FN, (WORD) ulHwForeMix );
    M32_OW(pjMmBase, ALU_BG_FN, (WORD) ulHwForeMix );
    M32_OW(pjMmBase, BKGD_COLOR, (WORD) pulXlate[0]);
    M32_OW(pjMmBase, FRGD_COLOR, (WORD) pulXlate[1]);

    dx = pptlSrc->x - prclDst->left;
    dy = pptlSrc->y - prclDst->top;

    lSrcDelta  = psoSrc->lDelta;
    pjSrcScan0 = psoSrc->pvScan0;


    while (TRUE)
    {
        xLeft  = prcl->left;
        xRight = prcl->right;

        // The Mach32 'bit packs' monochrome transfers, but GDI gives
        // us monochrome bitmaps whose scans are always dword aligned.
        // Consequently, we use the Mach32's clip registers to make
        // our transfers a multiple of 32 to match the dword alignment:

        M32_OW(pjMmBase, EXT_SCISSOR_L, (SHORT) (xLeft + xOffset) );
        M32_OW(pjMmBase, EXT_SCISSOR_R, (SHORT) (xRight + xOffset - 1) );

        yTop = prcl->top;
        cy   = prcl->bottom - yTop;

        xBias  = (xLeft + dx) & 31;             // Floor
        xLeft -= xBias;
        cx     = (xRight - xLeft + 31) & ~31;   // Ceiling

        M32_OW(pjMmBase, CUR_X,        (WORD) xLeft + xOffset );
        M32_OW(pjMmBase, DEST_X_START, (WORD) xLeft + xOffset );
        M32_OW(pjMmBase, DEST_X_END,   (WORD) (xLeft + xOffset + cx)  );
        M32_OW(pjMmBase, CUR_Y,        (WORD) yTop  + yOffset  );

        M32_OW(pjMmBase, DEST_Y_END, (WORD) (yTop + yOffset + cy) );

        pulSrc   = (ULONG*) (pjSrcScan0 + (yTop + dy) * lSrcDelta
                                        + ((xLeft + dx) >> 3));
        culScan  = cx >> 5;
        lSrcSkip = lSrcDelta - (culScan << 2);

        ASSERTDD(((ULONG_PTR)pulSrc & 3) == 0, "Source should be dword aligned");

        do {
            i = culScan;
            do {
                M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 2);
                M32_OW(pjMmBase, PIX_TRANS, *((USHORT*) pulSrc) );
                M32_OW(pjMmBase, PIX_TRANS, *((USHORT*) pulSrc + 1) );
                pulSrc++;

            } while (--i != 0);

            pulSrc = (ULONG*) ((BYTE*) pulSrc + lSrcSkip);

        } while (--cy != 0);

        if (--c == 0)
            break;

        prcl++;
        M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 7);
    }

    // Don't forget to reset the clip register:

    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 2);
    M32_OW(pjMmBase, EXT_SCISSOR_L, (SHORT) 0 );
    M32_OW(pjMmBase, EXT_SCISSOR_R, (SHORT) M32_MAX_SCISSOR );
}

/******************************Public*Routine******************************\
* VOID vM32XferNative
*
* Transfers a bitmap that is the same colour depth as the display to
* the screen via the data transfer register, with no translation.
*
\**************************************************************************/

VOID vM32XferNative(    // Type FNXFER
PDEV*       ppdev,
LONG        c,          // Count of rectangles, can't be zero
RECTL*      prcl,       // Array of relative coordinates destination rectangles
ULONG       rop4,       // rop4
SURFOBJ*    psoSrc,     // Source surface
POINTL*     pptlSrc,    // Original unclipped source point
RECTL*      prclDst,    // Original unclipped destination rectangle
XLATEOBJ*   pxlo)       // Not used
{
    BYTE*   pjMmBase;
    LONG    xOffset;
    LONG    yOffset;
    ULONG   ulHwForeMix;
    LONG    dx;
    LONG    dy;
    LONG    lSrcDelta;
    BYTE*   pjSrcScan0;
    LONG    xLeft;
    LONG    xRight;
    LONG    yTop;
    LONG    cy;
    LONG    cx;
    LONG    xBias;
    ULONG*  pulSrc;
    ULONG   culScan;
    LONG    lSrcSkip;
    LONG    i;
    ULONG   ulFifo;

    ASSERTDD(c > 0, "Can't handle zero rectangles");
    ASSERTDD(((rop4 & 0xff00) >> 8) == (rop4 & 0xff),
             "Expect only a rop2");

    pjMmBase = ppdev->pjMmBase;
    xOffset  = ppdev->xOffset;
    yOffset  = ppdev->yOffset;
    ulFifo   = 0;

    ulHwForeMix = gaul32HwMixFromRop2[rop4 & 0xf];
    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 10);
    M32_OW(pjMmBase, DP_CONFIG, (WORD)(FG_COLOR_SRC_HOST | BIT16 |
                            DRAW | WRITE | LSB_FIRST) );
    M32_OW(pjMmBase, ALU_FG_FN, (WORD) ulHwForeMix );
    M32_OW(pjMmBase, ALU_BG_FN, (WORD) ulHwForeMix );

    dx = pptlSrc->x - prclDst->left;
    dy = pptlSrc->y - prclDst->top;

    lSrcDelta  = psoSrc->lDelta;
    pjSrcScan0 = psoSrc->pvScan0;


    while (TRUE)
    {
        xLeft  = prcl->left;
        xRight = prcl->right;

        M32_OW(pjMmBase, EXT_SCISSOR_L, (SHORT) (xLeft + xOffset) );
        M32_OW(pjMmBase, EXT_SCISSOR_R, (SHORT) (xRight + xOffset - 1) );

        yTop = prcl->top;
        cy   = prcl->bottom - yTop;

        // We compute 'xBias' in order to dword-align the source pointer.
        // This way, we don't have to do unaligned reads of the source,
        // and we're guaranteed not to read even a byte past the end of
        // the bitmap.
        //
        // Note that this bias works at 24bpp, too:

        xBias  = (xLeft + dx) & 3;              // Floor
        xLeft -= xBias;
        cx     = (xRight - xLeft + 3) & ~3;     // Ceiling

        M32_OW(pjMmBase, CUR_X,        (WORD) xLeft + xOffset );
        M32_OW(pjMmBase, DEST_X_START, (WORD) xLeft + xOffset );
        M32_OW(pjMmBase, DEST_X_END,   (WORD) (xLeft + xOffset + cx)  );
        M32_OW(pjMmBase, CUR_Y,        (WORD) yTop  + yOffset  );

        M32_OW(pjMmBase, DEST_Y_END, (WORD) (yTop + yOffset + cy) );

        pulSrc   = (ULONG*) (pjSrcScan0 + (yTop + dy) * lSrcDelta
                                        + ((xLeft + dx) * ppdev->cjPelSize));
        culScan  = (cx * ppdev->cjPelSize) >> 2;
        lSrcSkip = lSrcDelta - (culScan << 2);

        ASSERTDD(((ULONG_PTR)pulSrc & 3) == 0, "Source should be dword aligned");

        do {
            i = culScan;
            do {
                M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 2);
                M32_OW(pjMmBase, PIX_TRANS, *((USHORT*) pulSrc) );
                M32_OW(pjMmBase, PIX_TRANS, *((USHORT*) pulSrc + 1) );
                pulSrc++;

            } while (--i != 0);

            pulSrc = (ULONG*) ((BYTE*) pulSrc + lSrcSkip);

        } while (--cy != 0);

        if (--c == 0)
            break;

        prcl++;
        M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 7);
    }

    // Don't forget to reset the clip register:

    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 2);
    M32_OW(pjMmBase, EXT_SCISSOR_L, (SHORT) 0 );
    M32_OW(pjMmBase, EXT_SCISSOR_R, (SHORT) M32_MAX_SCISSOR );
}

/******************************Public*Routine******************************\
* VOID vM32Xfer4bpp
*
* Does a 4bpp transfer from a bitmap to the screen.
*
* The reason we implement this is that a lot of resources are kept as 4bpp,
* and used to initialize DFBs, some of which we of course keep off-screen.
*
\**************************************************************************/

VOID vM32Xfer4bpp(      // Type FNXFER
PDEV*       ppdev,
LONG        c,          // Count of rectangles, can't be zero
RECTL*      prcl,       // List of destination rectangles, in relative
                        //   coordinates
ULONG       rop4,       // Rop4
SURFOBJ*    psoSrc,     // Source surface
POINTL*     pptlSrc,    // Original unclipped source point
RECTL*      prclDst,    // Original unclipped destination rectangle
XLATEOBJ*   pxlo)       // Translate that provides colour-expansion information
{
    BYTE*   pjMmBase;
    LONG    xOffset;
    LONG    yOffset;
    LONG    cjPelSize;
    ULONG   ulHwForeMix;
    LONG    xLeft;
    LONG    xRight;
    LONG    yTop;
    LONG    xBias;
    LONG    dx;
    LONG    dy;
    LONG    cx;
    LONG    cy;
    LONG    lSrcDelta;
    BYTE*   pjSrcScan0;
    BYTE*   pjSrc;
    BYTE    jSrc;
    ULONG*  pulXlate;
    LONG    i;
    USHORT  uw;
    LONG    cjSrc;
    LONG    lSrcSkip;
    ULONG   ulFifo;

    ASSERTDD(psoSrc->iBitmapFormat == BMF_4BPP, "Source must be 4bpp");
    ASSERTDD(c > 0, "Can't handle zero rectangles");
    ASSERTDD(ppdev->iBitmapFormat != BMF_24BPP, "Can't handle 24bpp");

    pjMmBase  = ppdev->pjMmBase;
    xOffset   = ppdev->xOffset;
    yOffset   = ppdev->yOffset;
    cjPelSize = ppdev->cjPelSize;
    pulXlate  = pxlo->pulXlate;
    ulFifo    = 0;

    dx = pptlSrc->x - prclDst->left;
    dy = pptlSrc->y - prclDst->top;     // Add to destination to get source

    lSrcDelta  = psoSrc->lDelta;
    pjSrcScan0 = psoSrc->pvScan0;

    ulHwForeMix = gaul32HwMixFromRop2[rop4 & 0xf];
    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 10);
    M32_OW(pjMmBase, DP_CONFIG, (WORD)(FG_COLOR_SRC_HOST | BIT16 |
                            DRAW | WRITE | LSB_FIRST) );
    M32_OW(pjMmBase, ALU_FG_FN, (WORD) ulHwForeMix );
    M32_OW(pjMmBase, ALU_BG_FN, (WORD) ulHwForeMix );


    while(TRUE)
    {
        xLeft  = prcl->left;
        xRight = prcl->right;

        M32_OW(pjMmBase, EXT_SCISSOR_L, (SHORT) (xLeft + xOffset) );
        M32_OW(pjMmBase, EXT_SCISSOR_R, (SHORT) (xRight + xOffset - 1) );

        yTop = prcl->top;
        cy   = prcl->bottom - yTop;

        // We compute 'xBias' in order to dword-align the source pointer.
        // This way, we don't have to do unaligned reads of the source,
        // and we're guaranteed not to read even a byte past the end of
        // the bitmap.
        //
        // Note that this bias works at 24bpp, too:

        xBias  = (xLeft + dx) & 3;              // Floor
        xLeft -= xBias;
        cx     = (xRight - xLeft + 3) & ~3;     // Ceiling

        M32_OW(pjMmBase, CUR_X,        (WORD) xLeft + xOffset );
        M32_OW(pjMmBase, DEST_X_START, (WORD) xLeft + xOffset );
        M32_OW(pjMmBase, DEST_X_END,   (WORD) (xLeft + xOffset + cx)  );
        M32_OW(pjMmBase, CUR_Y,        (WORD) yTop  + yOffset  );

        M32_OW(pjMmBase, DEST_Y_END, (WORD) (yTop + yOffset + cy) );

        pjSrc    = pjSrcScan0 + (yTop + dy) * lSrcDelta
                              + ((xLeft + dx) >> 1);
        cjSrc    = cx >> 1;         // Number of source bytes touched
        lSrcSkip = lSrcDelta - cjSrc;

        if (cjPelSize == 1)
        {
            // This part handles 8bpp output:

            do {
                i = cjSrc;
                do {
                    jSrc = *pjSrc++;
                    uw   = (USHORT) (pulXlate[jSrc >> 4]);
                    uw  |= (USHORT) (pulXlate[jSrc & 0xf] << 8);
                    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 1);
                    M32_OW(pjMmBase, PIX_TRANS, uw );
                } while (--i != 0);

                pjSrc += lSrcSkip;
            } while (--cy != 0);
        }
        else if (cjPelSize == 2)
        {
            // This part handles 16bpp output:

            do {
                i = cjSrc;
                do {
                    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 2);
                    jSrc = *pjSrc++;
                    uw   = (USHORT) (pulXlate[jSrc >> 4]);
                    M32_OW(pjMmBase, PIX_TRANS, uw );
                    uw   = (USHORT) (pulXlate[jSrc & 0xf]);
                    M32_OW(pjMmBase, PIX_TRANS, uw );
                } while (--i != 0);

                pjSrc += lSrcSkip;
            } while (--cy != 0);
        }

        if (--c == 0)
            break;

        prcl++;
        M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 7);
    }

    // Don't forget to reset the clip register:

    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 2);
    M32_OW(pjMmBase, EXT_SCISSOR_L, (SHORT) 0 );
    M32_OW(pjMmBase, EXT_SCISSOR_R, (SHORT) M32_MAX_SCISSOR );
}

/******************************Public*Routine******************************\
* VOID vM32Xfer8bpp
*
* Does a 8bpp transfer from a bitmap to the screen.
*
* The reason we implement this is that a lot of resources are kept as 8bpp,
* and used to initialize DFBs, some of which we of course keep off-screen.
*
\**************************************************************************/

VOID vM32Xfer8bpp(      // Type FNXFER
PDEV*       ppdev,
LONG        c,          // Count of rectangles, can't be zero
RECTL*      prcl,       // List of destination rectangles, in relative
                        //   coordinates
ULONG       rop4,       // Rop4
SURFOBJ*    psoSrc,     // Source surface
POINTL*     pptlSrc,    // Original unclipped source point
RECTL*      prclDst,    // Original unclipped destination rectangle
XLATEOBJ*   pxlo)       // Translate that provides colour-expansion information
{
    BYTE*   pjMmBase;
    LONG    xOffset;
    LONG    yOffset;
    LONG    cjPelSize;
    ULONG   ulHwForeMix;
    LONG    xLeft;
    LONG    xRight;
    LONG    yTop;
    LONG    xBias;
    LONG    dx;
    LONG    dy;
    LONG    cx;
    LONG    cy;
    LONG    lSrcDelta;
    BYTE*   pjSrcScan0;
    BYTE*   pjSrc;
    ULONG*  pulXlate;
    LONG    i;
    USHORT  uw;
    LONG    cwSrc;
    LONG    cxRem;
    LONG    lSrcSkip;
    ULONG   ulFifo;

    ASSERTDD(psoSrc->iBitmapFormat == BMF_8BPP, "Source must be 8bpp");
    ASSERTDD(c > 0, "Can't handle zero rectangles");
    ASSERTDD(ppdev->iBitmapFormat != BMF_24BPP, "Can't handle 24bpp");

    pjMmBase  = ppdev->pjMmBase;
    xOffset   = ppdev->xOffset;
    yOffset   = ppdev->yOffset;
    cjPelSize = ppdev->cjPelSize;
    pulXlate  = pxlo->pulXlate;
    ulFifo    = 0;

    dx = pptlSrc->x - prclDst->left;
    dy = pptlSrc->y - prclDst->top;     // Add to destination to get source

    lSrcDelta  = psoSrc->lDelta;
    pjSrcScan0 = psoSrc->pvScan0;

    ulHwForeMix = gaul32HwMixFromRop2[rop4 & 0xf];
    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 10);
    M32_OW(pjMmBase, DP_CONFIG, (WORD)(FG_COLOR_SRC_HOST | BIT16 |
                            DRAW | WRITE | LSB_FIRST) );
    M32_OW(pjMmBase, ALU_FG_FN, (WORD) ulHwForeMix );
    M32_OW(pjMmBase, ALU_BG_FN, (WORD) ulHwForeMix );


    while(TRUE)
    {
        xLeft  = prcl->left;
        xRight = prcl->right;

        M32_OW(pjMmBase, EXT_SCISSOR_L, (SHORT) (xLeft + xOffset) );
        M32_OW(pjMmBase, EXT_SCISSOR_R, (SHORT) (xRight + xOffset - 1) );

        yTop = prcl->top;
        cy   = prcl->bottom - yTop;

        // We compute 'xBias' in order to dword-align the source pointer.
        // This way, we don't have to do unaligned reads of the source,
        // and we're guaranteed not to read even a byte past the end of
        // the bitmap.
        //
        // Note that this bias works at 24bpp, too:

        xBias  = (xLeft + dx) & 3;              // Floor
        xLeft -= xBias;
        cx     = (xRight - xLeft + 3) & ~3;     // Ceiling

        M32_OW(pjMmBase, CUR_X,        (WORD) xLeft + xOffset );
        M32_OW(pjMmBase, DEST_X_START, (WORD) xLeft + xOffset );
        M32_OW(pjMmBase, DEST_X_END,   (WORD) (xLeft + xOffset + cx)  );
        M32_OW(pjMmBase, CUR_Y,        (WORD) yTop  + yOffset  );

        M32_OW(pjMmBase, DEST_Y_END, (WORD) (yTop + yOffset + cy) );

        pjSrc    = pjSrcScan0 + (yTop + dy) * lSrcDelta
                              + (xLeft + dx);
        lSrcSkip = lSrcDelta - cx;

        if (cjPelSize == 1)
        {
            // This part handles 8bpp output:

            cwSrc = (cx >> 1);
            cxRem = (cx & 1);

            do {
                for (i = cwSrc; i != 0; i--)
                {
                    uw  = (USHORT) (pulXlate[*pjSrc++]);
                    uw |= (USHORT) (pulXlate[*pjSrc++] << 8);
                    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 1);
                    M32_OW(pjMmBase, PIX_TRANS, uw );
                }

                if (cxRem > 0)
                {
                    uw  = (USHORT) (pulXlate[*pjSrc++]);
                    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 1);
                    M32_OW(pjMmBase, PIX_TRANS, uw );
                }

                pjSrc += lSrcSkip;
            } while (--cy != 0);
        }
        else if (cjPelSize == 2)
        {
            // This part handles 16bpp output:

            do {
                for (i = cx; i != 0; i--)
                {
                    uw  = (USHORT) (pulXlate[*pjSrc++]);
                    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 1);
                    M32_OW(pjMmBase, PIX_TRANS, uw );
                }

                pjSrc += lSrcSkip;
            } while (--cy != 0);
        }

        if (--c == 0)
            break;

        prcl++;
        M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 7);
    }

    // Don't forget to reset the clip register:

    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 2);
    M32_OW(pjMmBase, EXT_SCISSOR_L, (SHORT) 0 );
    M32_OW(pjMmBase, EXT_SCISSOR_R, (SHORT) M32_MAX_SCISSOR );
}

/******************************Public*Routine******************************\
* VOID vM32CopyBlt
*
* Does a screen-to-screen blt of a list of rectangles.
*
* See Blt_DS_SS_ENG_IO_D0 and Blt_DS_SS_TLBR_ENG_IO_D1.
*
\**************************************************************************/

VOID vM32CopyBlt(   // Type FNCOPY
PDEV*   ppdev,
LONG    c,          // Can't be zero
RECTL*  prcl,       // Array of relative coordinates destination rectangles
ULONG   rop4,       // rop4
POINTL* pptlSrc,    // Original unclipped source point
RECTL*  prclDst)    // Original unclipped destination rectangle
{
    BYTE*   pjMmBase;
    LONG    xOffset;
    LONG    yOffset;
    LONG    dx;
    LONG    dy;
    LONG    xLeft;
    LONG    yTop;
    LONG    cx;
    LONG    cy;

    ASSERTDD(c > 0, "Can't handle zero rectangles");
    ASSERTDD(((rop4 & 0xff00) >> 8) == (rop4 & 0xff),
             "Expect only a rop2");

    pjMmBase = ppdev->pjMmBase;
    xOffset  = ppdev->xOffset;
    yOffset  = ppdev->yOffset;

    M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 12);

    M32_OW(pjMmBase, DP_CONFIG, FG_COLOR_SRC_BLIT | DRAW | WRITE);
    M32_OW(pjMmBase, ALU_FG_FN, gaul32HwMixFromRop2[rop4 & 0xf]);

    dx = pptlSrc->x - prclDst->left;
    dy = pptlSrc->y - prclDst->top;

    // The accelerator may not be as fast at doing right-to-left copies, so
    // only do them when the rectangles truly overlap:

    if (!OVERLAP(prclDst, pptlSrc))
    {
        M32_OW(pjMmBase, SRC_Y_DIR, 1);
        goto Top_Down_Left_To_Right;
    }

    M32_OW(pjMmBase, SRC_Y_DIR, (prclDst->top <= pptlSrc->y));

    if (prclDst->top <= pptlSrc->y)
    {
        if (prclDst->left <= pptlSrc->x)
        {

Top_Down_Left_To_Right:

            while (TRUE)
            {
                xLeft = xOffset + prcl->left + dx;  // Destination coordinates
                yTop  = yOffset + prcl->top  + dy;
                cx    = prcl->right - prcl->left;
                cy    = prcl->bottom - prcl->top;

                M32_OW(pjMmBase, M32_SRC_X,        xLeft);
                M32_OW(pjMmBase, M32_SRC_X_START,  xLeft);
                M32_OW(pjMmBase, M32_SRC_X_END,    xLeft + cx);
                M32_OW(pjMmBase, M32_SRC_Y,        yTop);

                xLeft -= dx;                        // Source coordinates
                yTop  -= dy;

                M32_OW(pjMmBase, CUR_X,            xLeft);
                M32_OW(pjMmBase, DEST_X_START,     xLeft);
                M32_OW(pjMmBase, DEST_X_END,       xLeft + cx);
                M32_OW(pjMmBase, CUR_Y,            yTop);

                vM32QuietDown(ppdev, pjMmBase);

                M32_OW(pjMmBase, DEST_Y_END,       yTop + cy);

                if (--c == 0)
                    break;

                prcl++;
                M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 9);
            }
        }
        else
        {
            while (TRUE)
            {
                xLeft = xOffset + prcl->left + dx;  // Destination coordinates
                yTop  = yOffset + prcl->top  + dy;
                cx    = prcl->right - prcl->left;
                cy    = prcl->bottom - prcl->top;

                M32_OW(pjMmBase, M32_SRC_X,        xLeft + cx);
                M32_OW(pjMmBase, M32_SRC_X_START,  xLeft + cx);
                M32_OW(pjMmBase, M32_SRC_X_END,    xLeft);
                M32_OW(pjMmBase, M32_SRC_Y,        yTop);

                xLeft -= dx;                        // Source coordinates
                yTop  -= dy;

                M32_OW(pjMmBase, CUR_X,            xLeft + cx);
                M32_OW(pjMmBase, DEST_X_START,     xLeft + cx);
                M32_OW(pjMmBase, DEST_X_END,       xLeft);
                M32_OW(pjMmBase, CUR_Y,            yTop);

                vM32QuietDown(ppdev, pjMmBase);

                M32_OW(pjMmBase, DEST_Y_END,       yTop + cy);

                if (--c == 0)
                    break;

                prcl++;
                M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 9);
            }
        }
    }
    else
    {
        if (prclDst->left <= pptlSrc->x)
        {
            while (TRUE)
            {
                xLeft = xOffset + prcl->left + dx;  // Destination coordinates
                yTop  = yOffset + prcl->top  + dy - 1;
                cx    = prcl->right - prcl->left;
                cy    = prcl->bottom - prcl->top;

                M32_OW(pjMmBase, M32_SRC_X,        xLeft);
                M32_OW(pjMmBase, M32_SRC_X_START,  xLeft);
                M32_OW(pjMmBase, M32_SRC_X_END,    xLeft + cx);
                M32_OW(pjMmBase, M32_SRC_Y,        yTop + cy);

                xLeft -= dx;                        // Source coordinates
                yTop  -= dy;

                M32_OW(pjMmBase, CUR_X,            xLeft);
                M32_OW(pjMmBase, DEST_X_START,     xLeft);
                M32_OW(pjMmBase, DEST_X_END,       xLeft + cx);
                M32_OW(pjMmBase, CUR_Y,            yTop + cy);

                vM32QuietDown(ppdev, pjMmBase);

                M32_OW(pjMmBase, DEST_Y_END,       yTop);

                if (--c == 0)
                    break;

                prcl++;
                M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 9);
            }
        }
        else
        {
            while (TRUE)
            {
                xLeft = xOffset + prcl->left + dx;  // Destination coordinates
                yTop  = yOffset + prcl->top  + dy - 1;
                cx    = prcl->right - prcl->left;
                cy    = prcl->bottom - prcl->top;

                M32_OW(pjMmBase, M32_SRC_X,        xLeft + cx);
                M32_OW(pjMmBase, M32_SRC_X_START,  xLeft + cx);
                M32_OW(pjMmBase, M32_SRC_X_END,    xLeft);
                M32_OW(pjMmBase, M32_SRC_Y,        yTop + cy);

                xLeft -= dx;                        // Source coordinates
                yTop  -= dy;

                M32_OW(pjMmBase, CUR_X,            xLeft + cx);
                M32_OW(pjMmBase, DEST_X_START,     xLeft + cx);
                M32_OW(pjMmBase, DEST_X_END,       xLeft);
                M32_OW(pjMmBase, CUR_Y,            yTop + cy);

                vM32QuietDown(ppdev, pjMmBase);

                M32_OW(pjMmBase, DEST_Y_END,       yTop);

                if (--c == 0)
                    break;

                prcl++;
                M32_CHECK_FIFO_SPACE(ppdev, pjMmBase, 9);
            }
        }
    }
}