windows-server-2003/printscan/print/drivers/usermode/unidrv2/render/transpos.c

/*++

Copyright (c) 1996 - 1999  Microsoft Corporation

Module Name:

    transpos.c

Abstract:

    The module contains the functions associated with transposing bitmaps.
    This includes rotation of 1, 4, 8, and 24 bit formats as well as special
    transformations of color formats for planar or vertical head devices.

Environment:

    Windows NT Unidrv driver

Revision History:

    12/15/96 -alvins-
        Created

--*/
#include        "raster.h"
#include        "rmrender.h"


/*
 *   The transpose table:  maps one byte into two longs,  such that the
 * 8 bits of the byte turn into 64 bits: each bit of the original is
 * turned into one byte of output.
 *   THUS:
 * Input byte:   hgfedcba
 *   transposes into output bytes:
 *      0000000a  0000000b  0000000c  0000000d
 *      0000000e  0000000f  0000000g  0000000h
 *
 *   Note on byte order: bInitTrans() stores input bit 0 (a) in byte 7
 *  of the 8 byte entry and input bit 7 (h) in byte 0,  i.e. the most
 *  significant input bit comes first in memory.
 *
 *   For colour devices (RES_DM_COLOR) bInitTrans() fills this table
 *  with different data that keeps each 4 bit pel together;  see the
 *  comments in that function.
 *
 *   The table is allocated at DrvEnableSurface time,  thus ensuring that
 *  we do not allocate memory that we are not going to use.
 *  The allocation itself is done by bInitTrans(),  which skips it
 *  altogether for 8 and 24 bits per pel devices.
 */

#define TABLE_SIZE      (256 * 2 * sizeof( DWORD ))

/*
 *   We also need a similar table for colour separation.  This one
 *  consists of 256 DWORDs,  and is used to split the RGB(K) format
 *  input byte into an output DWORD with the two R bits in one byte,
 *  the two G bits in the next byte etc.  Used for single pin colour
 *  printers,  like the HP PaintJet.
 *   The table is generated according to the following rule:
 *
 *  INPUT BYTE:  KRGBkrgb
 *
 *  OUTPUT DWORD:  000000Kk 000000Rr 000000Gg 000000Bb
 *
 *   The output bytes above are listed in memory order:  bInitTrans()
 *  writes the Kk pair to byte 0 of the DWORD and the Bb pair to byte 3.
 *  Depending on the colour format flags,  bInitTrans() may also fold an
 *  RGB to CMY(K) conversion into the table contents.
 *
 *   This size is only used when RES_DM_GDI is set;  otherwise the colour
 *  separation table is allocated with TABLE_SIZE (two DWORDs per entry).
 */

#define SEP_TABLE_SIZE  (256 * sizeof( DWORD ))


//*******************************************************
static int
iTransMapPelPair (
    int  iPels,
    int  bOemBlack,
    int  bPrimaryRGB,
    int  bExtractBlack
    )
/*++

Routine Description:

    Helper for bInitTrans.  Takes a table index holding two 4 bit pels
    and returns the value that is actually expanded into the colour
    separation table,  so that the optional RGB to CMY(K) conversion
    is folded into the table contents.

Arguments:

    iPels           Table index (0 - 255),  two 4 bit pels
    bOemBlack       Non-zero when DC_OEM_BLACK is set: index used as is
    bPrimaryRGB     Non-zero when DC_PRIMARY_RGB is set: only bit 3 of
                    each nibble is masked off
    bExtractBlack   Non-zero when DC_EXTRACT_BLK is set: a nibble whose
                    three low bits are all set after inversion is
                    replaced by bit 3 alone

Return Value:

    The adjusted pel pair

--*/
{
    if( bOemBlack )
        return  iPels;                  /* No conversion at all */

    if( bPrimaryRGB )
        return  iPels & 0x77;           /* Keep RGB,  drop bit 3 of each pel */

    /*  Invert the three colour bits of both pels  */
    iPels = (~iPels) & 0x77;

    if( bExtractBlack )
    {
        /*  All three colour bits set:  use bit 3 of that pel instead  */
        if( (iPels & 0x07) == 0x07 )
            iPels = (iPels & ~0x07) | 0x08;

        if( (iPels & 0x70) == 0x70 )
            iPels = (iPels & ~0x70) | 0x80;
    }

    return  iPels;
}


BOOL
bInitTrans (
    PDEV *pPDev
    )
/*++

Routine Description:

    Builds the lookup tables used by the transpose functions in this
    file.  The tables are generated at run time,  through a BYTE/DWORD
    union,  so that the DWORD values stored have the byte ordering of
    the processor that will later use them.

Arguments:

    pPDev           Pointer to PDEV structure;  the tables are hung off
                    its pRasterPDEV (pdwTrans and pdwColrSep)

Return Value:

    TRUE for success and FALSE for failure (MemAlloc failure).  On
    failure no table is left allocated by this function.

--*/
{
    /*
     *   Each table entry is assembled byte by byte in u.b[] and then
     * stored as DWORDs from u.dw[].  DWORDs are used so that the
     * transpose functions can work a DWORD at a time.
     *
     *   Three cases:
     *     8 or 24 bits per pel - no table at all (pdwTrans set to NULL).
     *     monochrome           - pdwTrans only,  one output byte per
     *                            input bit.
     *     colour (RES_DM_COLOR)- pdwTrans holds a nibble preserving
     *                            table and pdwColrSep a second,  colour
     *                            separating table.
     */

    PRASTERPDEV  pRPDev = pPDev->pRasterPDEV;
    DWORD       *pdwEntry;              /* Walks the table being filled */
    int          iIndex;                /* Table index,  0 - 255 */
    int          iBits;                 /* Working copy being shifted out */
    int          iByte;                 /* Index into u.b[] */
    int          bOemBlack;
    int          bPrimaryRGB;
    int          bExtractBlack;

    union
    {
        BYTE   b[ 8 ];          /* Exactly 64 bits */
        DWORD  dw[ 2 ];         /* Also exactly 64 bits */
    } u;


    if( pRPDev->sDevBPP == 8 || pRPDev->sDevBPP == 24 )
    {
        /*  Whole bytes are shuffled in these formats - no table  */
        pRPDev->pdwTrans = NULL;

        return  TRUE;
    }

    pRPDev->pdwTrans = (DWORD *)MemAlloc( TABLE_SIZE );
    if( pRPDev->pdwTrans == NULL )
        return  FALSE;

    pdwEntry = pRPDev->pdwTrans;


    if( !(pRPDev->fDump & RES_DM_COLOR) )
    {
        /*
         *   Monochrome:  bit 0 of the index lands in byte 7 of the
         *  entry,  bit 7 in byte 0,  each as a 0 or 1.
         */
        for( iIndex = 0; iIndex < 256; iIndex++ )
        {
            iBits = iIndex;
            u.dw[ 0 ] = 0;
            u.dw[ 1 ] = 0;

            for( iByte = 7; iByte >= 0; iByte-- )
            {
                u.b[ iByte ] = (BYTE)(iBits & 0x1);
                iBits >>= 1;
            }

            pdwEntry[ 0 ] = u.dw[ 0 ];
            pdwEntry[ 1 ] = u.dw[ 1 ];
            pdwEntry += 2;
        }

        return  TRUE;
    }


    /*
     *   Colour.  First the table used to transpose 4 bit pels:  the high
     *  nibble of the index goes to byte 1 and the low nibble to byte 3
     *  of the first DWORD;  the second DWORD is the same value moved up
     *  by 4 bits.  Bytes 0 and 2 stay zero from the initial clear.
     */
    u.dw[ 0 ] = 0;
    for( iIndex = 0; iIndex < 256; iIndex++ )
    {
        u.b[ 1 ] = (BYTE)((iIndex >> 4) & 0x0f);
        u.b[ 3 ] = (BYTE)(iIndex & 0x0f);

        pdwEntry[ 0 ] = u.dw[ 0 ];
        pdwEntry[ 1 ] = u.dw[ 0 ] << 4;
        pdwEntry += 2;
    }

    /*
     *   Now the colour separation table.  Its size depends on which of
     *  the two formats below is generated.
     */
    pRPDev->pdwColrSep = (DWORD *)MemAlloc( (pRPDev->fDump & RES_DM_GDI) ?
                                                    SEP_TABLE_SIZE : TABLE_SIZE );
    if( pRPDev->pdwColrSep == NULL )
    {
        /*  Do not leave a half initialised pair of tables behind  */
        MemFree((LPSTR)pRPDev->pdwTrans );
        pRPDev->pdwTrans = NULL;

        return  FALSE;
    }

    pdwEntry = pRPDev->pdwColrSep;

    /*  The colour format flags are the same for every table entry  */
    bOemBlack     = (pRPDev->fColorFormat & DC_OEM_BLACK) != 0;
    bPrimaryRGB   = (pRPDev->fColorFormat & DC_PRIMARY_RGB) != 0;
    bExtractBlack = (pRPDev->fColorFormat & DC_EXTRACT_BLK) != 0;

    if( pRPDev->fDump & RES_DM_GDI )
    {
        /*
         *   One DWORD per entry.  The same colour bit of the two pels is
         *  paired up in one byte:  bit 4 of the value becomes bit 1 and
         *  bit 0 stays bit 0.  Byte 3 gets the pair from bits 4/0,
         *  byte 2 from bits 5/1,  byte 1 from bits 6/2 and byte 0 from
         *  bits 7/3.
         */
        for( iIndex = 0; iIndex <= 0xff; iIndex++ )
        {
            u.dw[ 0 ] = 0;

            iBits = iTransMapPelPair( iIndex, bOemBlack,
                                      bPrimaryRGB, bExtractBlack );

            for( iByte = 3; iByte >= 0; iByte-- )
            {
                u.b[ iByte ] = (BYTE)(((iBits >> 3) & 0x02) | (iBits & 0x1));
                iBits >>= 1;
            }

            *pdwEntry++ = u.dw[ 0 ];
        }
    }
    else
    {
        /*
         *   Two DWORDs per entry,  laid out exactly like the monochrome
         *  table,  but generated from the colour adjusted value so that
         *  the ordinary transpose functions also separate the colours.
         */
        for( iIndex = 0; iIndex <= 0xff; iIndex++ )
        {
            u.dw[ 0 ] = 0;
            u.dw[ 1 ] = 0;

            iBits = iTransMapPelPair( iIndex, bOemBlack,
                                      bPrimaryRGB, bExtractBlack );

            for( iByte = 7; iByte >= 0; iByte-- )
            {
                u.b[ iByte ] = (BYTE)(iBits & 0x1);
                iBits >>= 1;
            }

            pdwEntry[ 0 ] = u.dw[ 0 ];
            pdwEntry[ 1 ] = u.dw[ 1 ];
            pdwEntry += 2;
        }
    }

    return  TRUE;
}

//*******************************************************
void
vTrans8x8 (
    BYTE  *pbIn,
    RENDER  *pRData
    )
/*++

Routine Description:

    Transposes exactly 8 scan lines of bitmap data,  writing two DWORDs
    to the output buffer for every input byte column.  When the render
    data does not describe exactly 8 lines the work is handed to
    vTrans8N instead.

Arguments:

    pbIn        Pointer to input data buffer to transform
    pRData      Pointer to render structure containing all the
                necessary information about transforming

Return Value:

    none

--*/
{
    DWORD  *pdwTable;                   /* Transpose lookup table */
    DWORD  *pdwDst;                     /* Next output location */
    int     cbStride;                   /* Bytes from one scan line to the next */
    int     cBitsLeft;                  /* Bits across still to process */


    if( pRData->iTransHigh != 8 )
    {
        /*  Not a full 8 line group (e.g. end of page):  general case  */
        vTrans8N( pbIn,  pRData );

        return;
    }

    cbStride = pRData->cbTLine;
    pdwDst = pRData->pvTransBuf;
    pdwTable = pRData->Trans.pdwTransTab;

    /*
     *   One pass of this loop handles one input byte column.  When the
     *  width is not a multiple of 8 the final pass still produces a
     *  complete pair of DWORDs,  so the output buffer has to have room
     *  for the width rounded up.
     */
    for( cBitsLeft = pRData->iTransWide; cBitsLeft > 0; cBitsLeft -= BBITS )
    {
        BYTE   *pbScan = pbIn++;        /* Walks down the 8 scan lines */
        DWORD   dwLo = 0;               /* Running 64 bit result,  first half */
        DWORD   dwHi = 0;               /* ... and second half */
        int     iRow;

        /*
         *   For each scan line:  move the accumulated result up one bit
         *  and OR in the table entry selected by this line's byte.
         */
        for( iRow = 0; iRow < BBITS; iRow++ )
        {
            DWORD  *pdwEntry = pdwTable + (*pbScan << 1);

            dwLo = (dwLo << 1) | pdwEntry[ 0 ];
            dwHi = (dwHi << 1) | pdwEntry[ 1 ];

            pbScan += cbStride;
        }

        pdwDst[ 0 ] = dwLo;
        pdwDst[ 1 ] = dwHi;
        pdwDst += 2;
    }

    return;
}

//*******************************************************
void
vTrans8N (
    BYTE  *pbIn,
    RENDER  *pRData
    )
/*++

Routine Description:

    Transposes N scan lines of bitmap data.  The lines are processed in
    groups of 8;  each 8 x 8 bit cell produces 8 output bytes which are
    written iTransSkip bytes apart.  A final group of fewer than 8 lines
    is padded using the table entry for pRData->ubFillWhite.

Arguments:

    pbIn        Pointer to input data buffer to transform
    pRData      Pointer to render structure containing all the
                necessary information about transforming

Return Value:

    none

--*/
{
    /*
     *    The 8 x 8 transposition is table driven:  each input byte
     *  selects a 64 bit table entry (two DWORDs) which is ORed into a
     *  running 64 bit value after that value has been shifted left one
     *  bit.  After 8 scan lines the two DWORDs hold the 8 transposed
     *  bytes,  which are then stored one at a time,  lowest byte first.
     *
     *    The outer loop moves DOWN the bitmap 8 lines at a time and the
     *  inner loop moves ACROSS it,  so the input is read from a narrow
     *  range of addresses while the (smaller) output is the one that
     *  gets scattered.
     */

    const DWORD  dwAllOnes = (DWORD)-1; /* 32 white input bits */

    DWORD   dwLo,  dwHi;                /* Running 64 bit result */
    DWORD  *pdwTable;                   /* Transpose lookup table */
    DWORD  *pdwEntry;                   /* Table entry in use */

    BYTE   *pbGroup;                    /* Current column in the line group */
    BYTE   *pbScan;                     /* Walks down the scan lines */
    BYTE   *pbBandOut;                  /* Output start for this line group */
    BYTE   *pbDst;                      /* Output cursor */

    int     cbStride;                   /* Bytes per line in scan data */
    int     cbStep;                     /* Output interleave factor */
    int     cLinesLeft;                 /* Scan lines still to process */
    int     cBitsLeft;                  /* Bits across still to process */
    int     iRow;
    int     iByte;

    BOOL    bSkipWhite;                 /* White space optimisation on? */


    cbStride = pRData->cbTLine;
    cbStep = pRData->iTransSkip;
    pbBandOut = pRData->pvTransBuf;
    pdwTable = pRData->Trans.pdwTransTab;

    /*
     *   If table entry 0 is 0 (the table does not invert),  the lines
     *  are a multiple of 4 bytes long and there is a single pass,  the
     *  output is preset to all ones and white input cells can then be
     *  skipped instead of transposed.
     */
    bSkipWhite = (pdwTable[ 0 ] == 0 &&
                  (cbStride & 3) == 0 &&
                  pRData->iPassHigh == 1);

    if( bSkipWhite )
        FillMemory (pbBandOut, pRData->iTransWide * cbStep, 0xff);


    for( cLinesLeft = pRData->iTransHigh; cLinesLeft >= BBITS; cLinesLeft -= BBITS )
    {
        /*   A full group of 8 scan lines:  work across it  */
        pbGroup = pbIn;
        pbIn += BBITS * cbStride;       /* Start of the following group */

        pbDst = pbBandOut++;            /* Each group starts one byte on */

        for( cBitsLeft = pRData->iTransWide; cBitsLeft > 0; cBitsLeft -= BBITS )
        {
            if( bSkipWhite )
            {
                if( ((ULONG_PTR)pbGroup & 3) == 0 && cBitsLeft >= DWBITS )
                {
                    /*  DWORD aligned:  test a 32 x 8 area in one go  */
                    pbScan = pbGroup;
                    for( iRow = 0; iRow < 8; iRow++ )
                    {
                        if( *(DWORD *)pbScan != dwAllOnes )
                            break;

                        pbScan += cbStride;
                    }

                    if( iRow == 8 )
                    {
                        /*
                         *   All white.  Step over 4 input bytes;  the
                         *  loop itself accounts for the fourth BBITS.
                         */
                        pbGroup += 4;
                        cBitsLeft -= BBITS * 3;
                        pbDst += cbStep * DWBITS;
                        continue;
                    }
                }
                else
                {
                    /*  Otherwise test a single 8 x 8 cell  */
                    pbScan = pbGroup;
                    for( iRow = 0; iRow < 8; iRow++ )
                    {
                        if( *pbScan != (BYTE)-1 )
                            break;

                        pbScan += cbStride;
                    }

                    if( iRow == 8 )
                    {
                        pbGroup++;
                        pbDst += cbStep * BBITS;
                        continue;
                    }
                }
            }

            /*   Transpose one 8 x 8 cell  */
            pbScan = pbGroup++;
            dwLo = 0;
            dwHi = 0;

            for( iRow = 0; iRow < BBITS; iRow++ )
            {
                pdwEntry = pdwTable + (*pbScan << 1);

                dwLo = (dwLo << 1) | pdwEntry[ 0 ];
                dwHi = (dwHi << 1) | pdwEntry[ 1 ];

                pbScan += cbStride;
            }

            /*
             *   Store the 8 result bytes,  cbStep apart:  the four bytes
             *  of the first DWORD (low byte first) then the four bytes
             *  of the second.
             */
            for( iByte = 0; iByte < 4; iByte++ )
            {
                *pbDst = (BYTE)dwLo;
                dwLo >>= BBITS;
                pbDst += cbStep;
            }

            for( iByte = 0; iByte < 4; iByte++ )
            {
                *pbDst = (BYTE)dwHi;
                dwHi >>= BBITS;
                pbDst += cbStep;
            }
        }
    }

    /*
     *    Fewer than 8 scan lines may be left over;  cLinesLeft says how
     *  many.  They are transposed the same way,  and the bit positions
     *  of the missing lines are filled from the table entry for
     *  pRData->ubFillWhite.  No white skipping is done here.
     */

    if( cLinesLeft > 0 )
    {
        pbGroup = pbIn;
        pbDst = pbBandOut;

        for( cBitsLeft = pRData->iTransWide; cBitsLeft > 0; cBitsLeft -= BBITS )
        {
            BYTE  *pbStore;             /* Store cursor within this cell */

            dwLo = 0;
            dwHi = 0;
            pbScan = pbGroup++;

            /*   Only the lines that exist are read from the bitmap  */
            for( iRow = 0; iRow < cLinesLeft; iRow++ )
            {
                pdwEntry = pdwTable + (*pbScan << 1);

                dwLo = (dwLo << 1) | pdwEntry[ 0 ];
                dwHi = (dwHi << 1) | pdwEntry[ 1 ];

                pbScan += cbStride;
            }

            /*   The rest comes from the fill value's table entry  */
            pdwEntry = pdwTable + (pRData->ubFillWhite << 1);
            for( iRow = cLinesLeft; iRow < BBITS; iRow++ )
            {
                dwLo = (dwLo << 1) | pdwEntry[ 0 ];
                dwHi = (dwHi << 1) | pdwEntry[ 1 ];
            }

            /*   Store the 8 result bytes as above  */
            pbStore = pbDst;

            for( iByte = 0; iByte < 4; iByte++ )
            {
                *pbStore = (BYTE)dwLo;
                dwLo >>= BBITS;
                pbStore += cbStep;
            }

            for( iByte = 0; iByte < 4; iByte++ )
            {
                *pbStore = (BYTE)dwHi;
                dwHi >>= BBITS;
                pbStore += cbStep;
            }

            pbDst += BBITS * cbStep;    /* Next chunk of output data */
        }
    }

    return;
}


/*
 *   Define the number of pels transposed per loop iteration.  In the case
 * of a colour bitmap, this is 2,  since there are 4 bits per pel, thus
 * 2 per byte.
 *   vTrans8N4BPP() multiplies this by its output skip to advance the
 * output pointer after each input byte column.
 */

#define PELS_PER_LOOP   (BBITS / 4)


//*******************************************************
void
vTrans8N4BPP (
    BYTE  *pbIn,
    RENDER  *pRData
    )
/*++

Routine Description:

    Transposes N scan lines of a 4 bits per pel bitmap (colour for us).
    The lines are processed in groups of 8;  every input byte column of
    a group (two pels wide) produces two output DWORDs,  written
    iTransSkip / DWBYTES DWORDs apart.  A final group of fewer than 8
    lines only reads the lines that exist.

Arguments:

    pbIn        Pointer to input data buffer to transform
    pRData      Pointer to render structure containing all the
                necessary information about transforming

Return Value:

    none

--*/
{
    /*
     *    The transposition is table driven.  Each table entry is a pair
     *  of DWORDs.  For the scan lines numbered 0, 2, 4, 6 within a group
     *  the SECOND DWORD of the entry is used,  for lines 1, 3, 5, 7 the
     *  FIRST.  Lines 0 - 3 are combined into one DWORD and lines 4 - 7
     *  into another;  within each,  the first pair of lines is shifted
     *  right 8 bits before the second pair is ORed in.
     *
     *    The two DWORDs are then regrouped on output:  the low words of
     *  both go to the first output DWORD and the high words of both to
     *  the DWORD one skip further on.
     *
     *    As in vTrans8N,  the outer loop moves DOWN the bitmap and the
     *  inner loop ACROSS it,  keeping the input references close
     *  together.
     */

    DWORD   dwLo,  dwHi;                /* Lines 0 - 3 and lines 4 - 7 */
    DWORD  *pdwTable;                   /* Transpose lookup table */
    DWORD  *pdwBandOut;                 /* Output start for this line group */
    DWORD  *pdwDst;                     /* Output cursor */
    WORD   *pwDst;                      /* Same location,  WORD access */

    BYTE   *pbGroup;                    /* Current column in the line group */
    BYTE   *pbScan;                     /* Walks down the scan lines */

    int     cbStride;                   /* Bytes per line in scan data */
    int     cdwStep;                    /* Output interleave,  in DWORDs */
    int     cLinesLeft;                 /* Scan lines still to process */
    int     cBitsLeft;                  /* Width still to process */


    cbStride = pRData->cbTLine;
    cdwStep = pRData->iTransSkip / DWBYTES;
    pdwBandOut = pRData->pvTransBuf;
    pdwTable = pRData->Trans.pdwTransTab;


    for( cLinesLeft = pRData->iTransHigh; cLinesLeft >= BBITS; cLinesLeft -= BBITS )
    {
        /*   A full group of 8 scan lines:  work across it  */
        pbGroup = pbIn;
        pbIn += BBITS * cbStride;       /* Start of the following group */

        pdwDst = pdwBandOut++;          /* Each group starts one DWORD on */

        for( cBitsLeft = pRData->iTransWide; cBitsLeft > 0; cBitsLeft -= BBITS )
        {
            pbScan = pbGroup++;

            /*   Lines 0 - 3  */
            dwLo  = pdwTable[ (*pbScan << 1) + 1 ];
            pbScan += cbStride;
            dwLo |= pdwTable[ *pbScan << 1 ];
            pbScan += cbStride;

            dwLo >>= 8;

            dwLo |= pdwTable[ (*pbScan << 1) + 1 ];
            pbScan += cbStride;
            dwLo |= pdwTable[ *pbScan << 1 ];
            pbScan += cbStride;

            /*   Lines 4 - 7  */
            dwHi  = pdwTable[ (*pbScan << 1) + 1 ];
            pbScan += cbStride;
            dwHi |= pdwTable[ *pbScan << 1 ];
            pbScan += cbStride;

            dwHi >>= 8;

            dwHi |= pdwTable[ (*pbScan << 1) + 1 ];
            pbScan += cbStride;
            dwHi |= pdwTable[ *pbScan << 1 ];

            /*   Low words together,  then high words together  */
            pwDst = (WORD *)pdwDst;
            pwDst[ 0 ] = (WORD)dwLo;
            pwDst[ 1 ] = (WORD)dwHi;
            pdwDst[ cdwStep ] = (dwHi & 0xffff0000) | (dwLo >> 16);

            pdwDst += PELS_PER_LOOP * cdwStep;  /* Next chunk of output */
        }
    }

    /*
     *    Fewer than 8 scan lines may be left over;  cLinesLeft says how
     *  many.  The same combination is built,  but only from the lines
     *  that exist,  so nothing beyond the bitmap is read.  Missing lines
     *  contribute zero bits.
     */

    if( cLinesLeft > 0 )
    {
        pbGroup = pbIn;
        pdwDst = pdwBandOut;

        for( cBitsLeft = pRData->iTransWide; cBitsLeft > 0; cBitsLeft -= BBITS )
        {
            pbScan = pbGroup++;

            /*   Lines 0 - 3;  line 0 is always present  */
            dwLo = pdwTable[ (*pbScan << 1) + 1 ];

            if( cLinesLeft > 1 )
            {
                pbScan += cbStride;
                dwLo |= pdwTable[ *pbScan << 1 ];
            }

            dwLo >>= 8;

            if( cLinesLeft > 2 )
            {
                pbScan += cbStride;
                dwLo |= pdwTable[ (*pbScan << 1) + 1 ];
            }

            if( cLinesLeft > 3 )
            {
                pbScan += cbStride;
                dwLo |= pdwTable[ *pbScan << 1 ];
            }

            /*   Lines 4 - 6,  if there are any  */
            dwHi = 0;

            if( cLinesLeft > 4 )
            {
                pbScan += cbStride;
                dwHi = pdwTable[ (*pbScan << 1) + 1 ];

                if( cLinesLeft > 5 )
                {
                    pbScan += cbStride;
                    dwHi |= pdwTable[ *pbScan << 1 ];
                }

                dwHi >>= 8;

                if( cLinesLeft > 6 )
                {
                    pbScan += cbStride;
                    dwHi |= pdwTable[ (*pbScan << 1) + 1 ];
                }
            }

            /*   Low words together,  then high words together  */
            pwDst = (WORD *)pdwDst;
            pwDst[ 0 ] = (WORD)dwLo;
            pwDst[ 1 ] = (WORD)dwHi;
            pdwDst[ cdwStep ] = (dwHi & 0xffff0000) | (dwLo >> 16);

            pdwDst += 2 * cdwStep;      /* Next chunk of output data */
        }
    }

    return;
}

//*******************************************************
void
vTransColSep (
    register BYTE  *pbIn,
    RENDER  *pRData
    )
/*++

Routine Description:

    Colour separation of a 4 bits per pel bitmap.  Every group of four
    input bytes is looked up, byte by byte, in the colour separation
    table (pRData->pdwColrSep) and the four table entries are merged
    into a single output DWORD.  The table layout is described next to
    the SEP_TABLE_SIZE definition at the top of this file.

Arguments:

    pbIn        Pointer to input data buffer to transform
    pRData      Pointer to render structure; supplies the separation
                table (pdwColrSep), the output buffer (pvTransBuf) and
                the amount of data (cDWLine * iNumScans groups)

Return Value:

    none

--*/
{
    /*
     *    NOTE:  the output buffer and pbIn MAY BE THE SAME ADDRESS.  That
     *  is safe here because all four input bytes of a group are read
     *  before the matching output DWORD is stored, and both pointers then
     *  advance by the same amount (assuming DWBYTES == sizeof( DWORD )).
     */

    DWORD  *pdwTable = pRData->pdwColrSep;      /* Colour separation table */
    DWORD  *pdwDest  = pRData->pvTransBuf;      /* Where the data goes */
    int     cGroups  = pRData->cDWLine * pRData->iNumScans;
    int     iGroup;
    DWORD   dwWhiteOut;         /* Precomputed result for four 0x77 bytes */
    DWORD   dwPacked;           /* Output DWORD under construction */


    /*
     *    An input group of 0x77777777 is special cased (the original
     *  code calls this the "white conversion value"):  build its output
     *  once from the table entry for 0x77 replicated into all four bit
     *  pair positions, rather than doing four lookups each time.
     */
    dwWhiteOut = pdwTable[ 0x77 ];
    dwWhiteOut |= (dwWhiteOut << 2) | (dwWhiteOut << 4) | (dwWhiteOut << 6);

    /*
     *    Convert the data to planar form, one 4 byte group at a time.
     *  Each table entry is shifted left by 2 to make room before the
     *  entry for the following byte is merged in, so the first byte of
     *  the group ends up in the highest bit positions.
     */
    for( iGroup = 0; iGroup < cGroups; ++iGroup, pbIn += DWBYTES )
    {
        if( *(DWORD *)pbIn == 0x77777777L )
        {
            dwPacked = dwWhiteOut;
        }
        else
        {
            dwPacked = pdwTable[ pbIn[ 0 ] ];
            dwPacked = (dwPacked << 2) | pdwTable[ pbIn[ 1 ] ];
            dwPacked = (dwPacked << 2) | pdwTable[ pbIn[ 2 ] ];
            dwPacked = (dwPacked << 2) | pdwTable[ pbIn[ 3 ] ];
        }

        *pdwDest++ = dwPacked;
    }

    return;

}


//*******************************************************
void
vTrans8BPP (
    BYTE  *pbIn,
    RENDER  *pRData
    )
/*++

Routine Description:

    Transpose an 8 bits per pel bitmap.  With one byte per pel this is
    a pure byte shuffle:  each input scan line is written out as one
    byte-wide column of the output buffer.

Arguments:

    pbIn        Pointer to input data buffer to transform
    pRData      Pointer to render structure; supplies the input line
                pitch (cbTLine), the output stride (iTransSkip), the
                output buffer (pvTransBuf) and the area to process
                (iTransWide, iTransHigh)

Return Value:

    none

--*/
{

    /*
     *    The input is read sequentially, one scan line after another,
     *  and each scan line is scattered down a single output column.
     *  Successive scan lines go to successive (adjacent) columns.
     */

    BYTE   *pbColumn;              /* Top of the current output column */
    int     cbScan;                /* Bytes per input scan line */
    int     cbStride;              /* Offset between output bytes */
    int     cRowsLeft;             /* Count down scan lines */


    /*
     *   Local copies of the values used inside the loops.
     */

    cbScan = pRData->cbTLine;
    cbStride = pRData->iTransSkip;
    pbColumn = pRData->pvTransBuf;

    for( cRowsLeft = pRData->iTransHigh; cRowsLeft > 0; --cRowsLeft )
    {
        /*
         *    One pass per input scan line.  iTransWide is consumed
         *  BBITS at a time, i.e. one input byte per step (presumably
         *  iTransWide is a bit count - confirm against the code that
         *  fills in RENDER).
         */

        register  BYTE  *pbSrc = pbIn;       /* Walks along the scan line */
        register  BYTE  *pbDst = pbColumn;   /* Walks down the column */
        int       cBitsLeft = pRData->iTransWide;

        while( cBitsLeft > 0 )
        {
            *pbDst = *pbSrc++;
            pbDst += cbStride;
            cBitsLeft -= BBITS;
        }

        pbIn += cbScan;            /* Next input scan line */
        ++pbColumn;                /* One column across output area */
    }

    return;
}

//*******************************************************
void
vTrans24BPP (
    BYTE  *pbIn,
    RENDER  *pRData
    )
/*++

Routine Description:

    The transpose function for 24 bits per pel bitmaps.  Each pel is
    moved as a unit of three consecutive bytes:  an input scan line is
    written out as one 3-byte-wide column of the output buffer, and
    successive scan lines go to successive columns.

Arguments:

    pbIn        Pointer to input data buffer to transform
    pRData      Pointer to render structure; supplies the input line
                pitch (cbTLine), the output stride (iTransSkip), the
                output buffer (pvTransBuf), the bits per pel (iBPP) and
                the area to process (iTransWide, iTransHigh)

Return Value:

    none

--*/
{
    /*
     *    Scan along the input bitmap,  writing the data to the output
     *  in column order.  This results in reduced MMU thrashing, as
     *  the output addresses are all limited to a much smaller range
     *  than the incoming addresses.
     */

    register  BYTE   *pbBase;             /* Scan along input bitmap */
    register  BYTE   *pbOut;              /* The output scan column pointer */

    int     iBand;                 /* Count down scan lines */
    int     iSkip;                 /* Offset between output pels, in bytes */
    int     iWide;                 /* Loop across the input scan line */
    int     iCol;                  /* Pels to copy from each scan line */
    int     cbLine;                /* Bytes per input scan line */

    BYTE   *pbOutBase;             /* Start of column of output data */


    /*
     *   Set up the local copies (for faster access) of data passed in.
     *  iTransWide divided by the bits per pel gives the number of pels
     *  (3 byte groups) to move per scan line.  iBPP must be non-zero;
     *  it is presumably 24 whenever this function is selected.
     */

    iSkip = pRData->iTransSkip;
    cbLine = pRData->cbTLine;
    pbOutBase = pRData->pvTransBuf;       /* Base output buffer address */
    iCol =  pRData->iTransWide / pRData->iBPP;

    for( iBand = pRData->iTransHigh; iBand > 0; --iBand )
    {
        /*
         *    This loop processes scan lines in the input bitmap. As
         *  we progress across the scan line, the output data is written
         *  in column order.
         */

        pbBase = pbIn;
        pbIn += cbLine;            /* Next input scan line */

        pbOut = pbOutBase;
        pbOutBase += 3;            /* One pel (3 bytes) across output area */

        for( iWide = iCol; iWide > 0; --iWide )
        {
            /*
             *   This loop traverses the input scan line, taking one
             *  pel (three bytes) at a time and writing it to the output
             *  area in column order.
             */
            *pbOut = *pbBase++;
            *(pbOut+1) = *pbBase++;
            *(pbOut+2) = *pbBase++;
            pbOut += iSkip;
        }
    }

    return;
}