/**************************** Function Header ******************************
 * transpos.c
 *      Functions associated with transposing bitmaps.  Several flavours
 *      are available,  to be used as appropriate in some special cases.
 *      The special cases are generally faster than the general method.
 *
 * HISTORY:
 *  10:46 on Wed 27 Feb 1991    -by-    Lindsay Harris   [lindsayh]
 *      Added table creation at load time - big endian/little endian
 *
 *  14:24 on Wed 23 Jan 1991    -by-    Lindsay Harris   [lindsayh]
 *      Created it - initially for the 8 x 8 case.
 *
 * Copyright (C) 1991 - 1993 Microsoft Corporation
 *
 ***************************************************************************/

#include        <stddef.h>
#include        <windows.h>
#include        <libproto.h>
#include        <winddi.h>
#include        "pdev.h"
#include        "win30def.h"
#include        "udmindrv.h"
#include        "udpfm.h"
#include        "uddevice.h"
#include        "udrender.h"
#include        "rasdd.h"

/*
 *   The transpose table:  maps one byte into two longs,  such that the
 * 8 bits of the byte turn into 64 bits: each bit of the original is
 * turned into one byte of output.
 *   THUS:
 * Input byte:   hgfedcba
 *   transposes into output bytes:
 *      0000000a  0000000b  0000000c  0000000d
 *      0000000e  0000000f  0000000g  0000000h
 *
 *   The table is allocated at DrvEnableSurface time,  thus ensuring that
 *  we do not allocate memory that we are not going to use.
 */

#define TABLE_SIZE      (256 * 2 * sizeof( DWORD ))

/*
 *   We also need a similar table for colour separation.  This one
 *  consists of 256 DWORDs,  and is used to split the RGB(K) format
 *  input byte into an output DWORD with the two R bits in one byte,
 *  the two G bits in the next byte etc.  Used for single pin colour
 *  printers,  like the HP PaintJet.
 *   The table is generated according to the following rule:
 *
 *  INPUT BYTE:  KRGBkrgb
 *
 *  OUTPUT DWORD:  000000Kk 000000Rr 000000Gg 000000Bb
 */

#define SEP_TABLE_SIZE  (256 * sizeof( DWORD ))


/************************** Function Header *******************************
 * bInitTrans
 *      Initialise the transpose tables.  This is done to make the tables
 *      independent of whether the processor is big endian or little endian,
 *      since the data is generated by the processor that is going to
 *      use it!  There are still some minor questions of byte ordering,
 *      but nothing too major to resolve.
 *
 * RETURNS:
 *      TRUE/FALSE;  FALSE for lack of storage for table.
 *
 * HISTORY:
 *  Friday December 3 1993      -by-    Norman Hendley   [normanh]
 *      Changed graphics mode check to RES_DM_GDI from nPins
 *
 *  10:52 on Wed 27 Feb 1991    -by-    Lindsay Harris   [lindsayh]
 *      Borrowed from program which generated original table.
 *
 **************************************************************************/

BOOL
bInitTrans( pPDev )
PDEV  *pPDev;
{

    /*
     *   Function to generate the transposition table.  There is nothing
     * difficult about generating the table.  The only trick is the use
     * of the union.  This allows us to setup a DWORD table with the
     * byte ordering of the hardware on which we are running.  This is
     * achieved by writing the data into the BYTE entry,  then using
     * the same memory as a DWORD to be put away into memory.  The reason
     * for using DWORDS is to get maximum benefit from memory references
     * in the inner loop of the transpose functions.
     *   Storage space is allocated on the heap,  and the address is
     * stored in the PDEV.
     *   Note that the 8 bits per pel case is special, as we are shuffling
     * bytes around, and thus do not need any tables.  For this case,
     * return TRUE without allocating any storage.
     *
     *   Returns TRUE for success,  FALSE meaning storage unavailable.
     */

    register  DWORD   *pdw;
    register  int   iShift,  j;

    int    i;

#define pUDPDev ((UD_PDEV *)(pPDev->pUDPDev))

    union
    {
        BYTE   b[ 8 ];          /* Exactly 64 bits */
        DWORD  dw[ 2 ];         /* Also exactly 64 bits */
    } u;


    if( pUDPDev->sBitsPixel == 8 )
    {
        pPDev->pdwTrans = NULL;

        return   TRUE;              /* Byte operations - no table needed */
    }

    if( !(pPDev->pdwTrans = (DWORD *)HeapAlloc( pPDev->hheap, 0, TABLE_SIZE )) )
        return  FALSE;


    pdw = pPDev->pdwTrans;              /* Speedier access */


    /*
     *   Colour requires different tables,  as the pixel data consists of
     * 4 bits which need to move in a single group.
     */

    if( pUDPDev->Resolution.fDump & RES_DM_COLOR )
    {
        /*
         *   First generate the landscape to portrait transpose data.
         *  The only complication is maintaining 4 bit nibbles as a single
         *  entity.
         */

        for( iShift = 0; iShift < 256; iShift++ )
        {
            /*
             *   The low nibble goes into the highest byte address,  the
             *  next nibble goes 4 bytes lower in memory.
             */
            u.dw[ 0 ] = 0;
            u.dw[ 1 ] = 0;

            u.b[ 3 ] = (BYTE)((iShift >> 4) & 0x0f);
            u.b[ 7 ] = (BYTE)(iShift & 0x0f);

            /*   Store the result  */

            *pdw++ = u.dw[ 0 ];
            *pdw++ = u.dw[ 1 ];
        }

        /*
         *   There is an additional transpose operation required for single
         *  pin colour printers.  The HP Paintjet typifies this class.
         *  This operation is required to separate the RGB pixels (2 of each
         *  colour per byte) into bytes that may be sent to the printer,
         *  such that all R bytes are sent in one go,  followed by G etc.
         *  For multiple pin printers,  this falls out of the standard
         *  transpose operations.
         */


        if( pUDPDev->Resolution.fDump & RES_DM_GDI )
        {
            pPDev->pdwColrSep = (DWORD *)HeapAlloc( pPDev->hheap, 0, SEP_TABLE_SIZE );
            if( pPDev->pdwColrSep == NULL )
            {
                HeapFree( pPDev->hheap, 0, (LPSTR)pPDev->pdwTrans );
                pPDev->pdwTrans = 0;

                return   FALSE;
            }

            pdw = pPDev->pdwColrSep;    /* Speedier access */

            /*
             *   The explanation above for SEP_TABLE_SIZE explains what is
             *  taking place in the following loops.
             */

            for( i = 0; i <= 0xff; i++ )
            {
                u.dw[ 0 ] = 0;
                iShift = i & 0x77;              /* Only use 3 bits per pel */

                if( pUDPDev->fColorFormat & DC_EXTRACT_BLK )
                {
                    if( pUDPDev->fColorFormat & DC_PRIMARY_RGB )
                    {
                    /*
                     *    Whenever we have a 0 nibble,  replace it with 8.
                     *  This does the colour separation for us!  The
                     *  separation happens when the transpose happens.
                     */

                        if( (iShift & 0x07) == 0 )
                            iShift |= 0x08;

                        if( (iShift & 0x70) == 0 )
                            iShift |= 0x80;
                    }
                    else
                    {
                        /*  CMY - same idea,  different conditions!  */
                        if( (iShift & 0x07) == 0x07 )
                            iShift = (iShift & ~0x07) | 0x08;

                        if( (iShift & 0x70) == 0x70 )
                            iShift = (iShift & ~0x70) | 0x80;
                    }
                }

                /*   The two bits Bb  */
                u.b[ 3 ] = (BYTE)(((iShift >> 3) & 0x02) | (iShift & 0x1));
                iShift >>= 1;


                /*   The two bits Gg  */
                u.b[ 2 ] = (BYTE)(((iShift >> 3) & 0x02) | (iShift & 0x1));
                iShift >>= 1;


                /*   The two bits Rr  */
                u.b[ 1 ] = (BYTE)(((iShift >> 3) & 0x02) | (iShift & 0x1));
                iShift >>= 1;


                /*   The two bits Kk  */
                u.b[ 0 ] = (BYTE)(((iShift >> 3) & 0x02) | (iShift & 0x1));

                *pdw++ = u.dw[ 0 ];             /* Safe for posterity */
            }
        }
        else
        {
            /*
             *   The dot matrix case.  Here we will call the relevant
             * transpose function,  but use the modified table below.  This
             * table will do the colour separation,  and will result in the
             * transpose operation splitting up the data for each head pass.
             */

            pPDev->pdwColrSep = (DWORD *)HeapAlloc( pPDev->hheap, 0, TABLE_SIZE );
            if( pPDev->pdwColrSep == NULL )
            {
                HeapFree( pPDev->hheap, 0, (LPSTR)pPDev->pdwTrans );
                pPDev->pdwTrans = 0;

                return   FALSE;
            }

            pdw = pPDev->pdwColrSep;    /* Speedier access */

            for( i = 0; i <= 0xff; i++ )
            {
                /*  Each bit of i goes into one byte of the output  */
                u.dw[ 0 ] = 0;
                u.dw[ 1 ] = 0;

                iShift = i & 0x77;              /* Only 3 bits per pel */

                if( pUDPDev->fColorFormat & DC_EXTRACT_BLK )
                {
                    if( pUDPDev->fColorFormat & DC_PRIMARY_RGB )
                    {
                    /*
                     *    Whenever we have a 0 nibble,  replace it with 8.
                     *  This does the colour separation for us!  The
                     *  separation happens when the transpose happens.
                     */

                        if( (iShift & 0x07) == 0 )
                            iShift |= 0x08;

                        if( (iShift & 0x70) == 0 )
                            iShift |= 0x80;
                    }
                    else
                    {
                        /*  CMY - same idea,  different conditions!  */
                        if( (iShift & 0x07) == 0x07 )
                            iShift = (iShift & ~0x07) | 0x08;

                        if( (iShift & 0x70) == 0x70 )
                            iShift = (iShift & ~0x70) | 0x80;
                    }
                }

                for( j = 8; --j >= 0; )
                {
                    u.b[ j ] = (BYTE)(iShift & 0x1);
                    iShift >>= 1;
                }

                /*   Store the result  */

                *pdw++ = u.dw[ 0 ];
                *pdw++ = u.dw[ 1 ];
            }
        }
    }
    else
    {
        /*
         *   Monochrome case - simple transpositions.
         */

        for( i = 0; i <= 0xff; i++ )
        {
            /*  Each bit of i goes into one byte of the output  */
            iShift = i;
            u.dw[ 0 ] = 0;
            u.dw[ 1 ] = 0;

            for( j = 8; --j >= 0; )
            {
                u.b[ j ] = (BYTE)(iShift & 0x1);
                iShift >>= 1;
            }

            /*   Store the result  */

            *pdw++ = u.dw[ 0 ];
            *pdw++ = u.dw[ 1 ];
        }
    }

    return  TRUE;
}
#undef  pUDPDev


/************************** Function Header *******************************
 * vTrans8x8
 *      Function to transpose the input array into the output array,
 *      where the input data is to be considered 8 rows of bitmap data,
 *      and the output area is dword aligned.
 *
 * RETURNS:
 *      Nothing
 *
 * HISTORY:
 *  14:27 on Wed 23 Jan 1991    -by-    Lindsay Harris   [lindsayh]
 *      First incarnation.
 *
 *************************************************************************/

void
vTrans8x8( pbIn,  pRData )
BYTE    *pbIn;          /* Source */
RENDER  *pRData;        /* Rendering info */
{
    /*
     *    Transpose a strip of bitmap,  one column of bytes at a time.
     *  For each column,  the bytes of successive scan lines (cbTLine
     *  bytes apart) are each used to index the lookup table.  A table
     *  entry is a pair of DWORDs;  these are ORed into a pair of
     *  accumulators,  which are shifted left one bit before each OR.
     *  After BBITS scan lines,  the two accumulators are written to
     *  consecutive DWORDs of the output buffer.
     *    If the strip is not exactly 8 scan lines high,  the work is
     *  handed to vTrans8N instead.
     */

    DWORD   *pdwTab;            /* The lookup table */
    DWORD   *pdwEntry;          /* Table entry for the current byte */
    DWORD   *pdwDst;            /* Next output location */
    DWORD    dwLo,  dwHi;       /* The two accumulators */

    BYTE    *pbScan;            /* Walks down the scan lines */

    int      cbStride;          /* Bytes from one scan line to the next */
    int      cBitsLeft;         /* Counts down the width,  BBITS per column */
    int      iRow;


    if( pRData->iTransHigh != 8 )
    {
        /*  Not the special case - can happen at the end of a page. */
        vTrans8N( pbIn,  pRData );

        return;
    }

    pdwTab = pRData->Trans.pdwTransTab;
    pdwDst = pRData->pvTransBuf;
    cbStride = pRData->cbTLine;


    /*
     *   One pass of this loop per byte across the scan line.  If the
     *  width is not a multiple of BBITS,  the last column is still
     *  processed in full,  so the output includes the padding bits.
     */

    cBitsLeft = pRData->iTransWide;

    while( cBitsLeft > 0 )
    {
        pbScan = pbIn++;
        dwLo = 0;
        dwHi = 0;

        for( iRow = 0; iRow < BBITS; iRow++ )
        {
            pdwEntry = pdwTab + 2 * *pbScan;

            dwLo = (dwLo << 1) | pdwEntry[ 0 ];
            dwHi = (dwHi << 1) | pdwEntry[ 1 ];

            pbScan += cbStride;         /* Same column,  next scan line */
        }

        *pdwDst++ = dwLo;
        *pdwDst++ = dwHi;

        cBitsLeft -= BBITS;
    }

    return;
}


/************************** Function Header *******************************
 * vTrans8N
 *      Function to transpose the input array into the output array,
 *      where the input data is to be considered N rows of bitmap data,
 *      and the output area is byte aligned.
 *
 * RETURNS:
 *      Nothing
 *
 * HISTORY:
 *  16:34 on Mon 28 Jan 1991    -by-    Lindsay Harris   [lindsayh]
 *      First incarnation.
 *
 *************************************************************************/

void
vTrans8N( pbIn,  pRData )
BYTE    *pbIn;          /* Source */
RENDER  *pRData;        /* Overall rendering info */
{
    /*
     *    The technique is quite simple,  though not necessarily obvious.
     *  Take an 8 scan line by 8 bits block of data,  and transform it
     *  into 8 bytes with bits in the scan line order,  rather than
     *  along the scan line as supplied.
     *    To do this as quickly as possible, each byte to be converted
     *  is used as an index into a lookup table;  each table entry is
     *  64 bits long (a pair of longs above).  These 64 bits are ORed
     *  with the running total of 64 bits (the two variables, dw0, dw1);
     *  shift the running total one bit left.  Repeat this operation
     *  for the corresponding byte in the next scan line - this is
     *  the new table lookup index.  Repeat for all 8 bytes in the 8
     *  scan lines being processed.  Store the 64 bit temporary results
     *  in the output dword array.  Move to the next byte in the
     *  scan line,  and repeat the loop for this column.
     *    This function is based on the special 8 X 8 case (vTrans8x8).
     *  The significant differences are that the transposed data needs
     *  to be written byte at a time (instead of DWORD at a time),
     *  and that there are N scan lines to convert in each loop.
     */

    register  DWORD  dw0,  dw1;         /* Inner loop temporaries */
    register  BYTE  *pbTemp;
    register  DWORD *pdw;

    register  int    cbLine;            /* Bytes per line in scan data */
    register  int    i;                 /* Loop variable. */
    register  int    iBand;             /* For moving down the scan lines */

    int      iSkip;                     /* Output interleave factor */
    int      iWide;                     /* Pixels across the bitmap */

    BYTE    *pbOut;                     /* Destination, local copy */
    BYTE    *pbBase;                    /* Start addr of 8 scan line group */
    BYTE    *pbOutTmp;                  /* For output loop */

    DWORD   *pdwTrans;                  /* Speedier access */


    /*
     *   Set up the local variables from the RENDER structure passed in.
     */

    cbLine = pRData->cbTLine;
    iSkip = pRData->iTransSkip;
    pbOut = pRData->pvTransBuf;                 /* Reserved for us! */
    pdwTrans = pRData->Trans.pdwTransTab;

    /*
     *     To ease MMU thrashing,  we scan ACROSS the bitmap in 8 line
     *  groups.  This results in closer memory references,  and so less
     *  page faults and so faster execution.  Hence,  the outer most loop
     *  loops DOWN the scanlines.  The next inner loop scans across groups
     *  of 8 scan lines at a time,  while the inner most loop transposes
     *  one byte by 8 scan lines of bitmap image.
     *     Note that processing the data this way causes a slight increase
     *  in scattered memory addresses when writing the output data.
     *  There is no way to avoid one or the other memory references being
     *  scattered;  however,  the output area is smaller than the input
     *  input,  so scattering here will be less severe to the MMU.
     */

    for( iBand = pRData->iTransHigh; iBand >= BBITS; iBand -= BBITS )
    {

        /*
         *    Have selected the next group of 8 scan lines to process,
         *  so scan from left to right,  transposing data in 8 x 8 bit
         *  groups.  This is the size that can be done very quickly with
         *  a 32 bit environment.
         */

        pbBase = pbIn;
        pbIn += BBITS * cbLine;         /* Next address */

        pbOutTmp = pbOut;
        ++pbOut;                /* Onto the next byte sequence */

        for( iWide = pRData->iTransWide; iWide > 0; iWide -= BBITS )
        {
            /*
             *    Process the bitmap byte at a time moving across, and
             *  8 scan lines high.  This corresponds to transposing an
             *  8 x 8 bit array.  We can do that quickly.
             */
            dw0 = 0;
            dw1 = 0;
            pbTemp = pbBase++;

            for( i = BBITS; --i >= 0; pbTemp += cbLine )
            {
                /*   The INNER loop - the bit swapping operations */
                dw0 <<= 1;
                dw1 <<= 1;
                pdw = pdwTrans + (*pbTemp << 1);
                dw0 |= *pdw;
                dw1 |= *(pdw + 1);

            }

/*   !!!LindsayH:   Note that the following code is big endian/little endian
 *   sensitive,  and currently works on the 80386 (which ever way that is).
 *   There are two alternatives to cure this problem:  first is to have
 *   another function,  with the order of byte extraction reversed; second
 *   is to offset the value in pbTemp,  and change the sign of iSkip.
 *   There are disadvantages to both.
 *  FOR NOW,  this is not a problem,  and will be left as an exercise
 *  for the student.
 */
            /*   Store the two temporary values in the output buffer. */
            pbTemp = pbOutTmp;
            *pbTemp = (BYTE)dw0;

            pbTemp += iSkip;
            dw0 >>= BBITS;              /* One byte's worth */
            *pbTemp = (BYTE)dw0;

            pbTemp += iSkip;
            dw0 >>= BBITS;
            *pbTemp = (BYTE)dw0;

            pbTemp += iSkip;
            dw0 >>= BBITS;
            *pbTemp = (BYTE)dw0;

            pbTemp += iSkip;
            *pbTemp = (BYTE)dw1;

            pbTemp += iSkip;
            dw1 >>= BBITS;
            *pbTemp = (BYTE)dw1;

            pbTemp += iSkip;
            dw1 >>= BBITS;
            *pbTemp = (BYTE)dw1;

            pbTemp += iSkip;
            dw1 >>= BBITS;
            *pbTemp = (BYTE)dw1;

            pbOutTmp += BBITS * iSkip;  /* Next chunk of output data */
        }

    }

    /*
     *    There may be some scan lines remaining.  If so,  iBand will
     *  be > 0,  and that indicates the number of output scan lines
     *  remaining.
     */

    if( iBand > 0 )
    {

        /*
         *   This is basically the same as the stripped down version
         *  in the outer loop above.  Note that the output data is still
         *  byte aligned,  IT IS PRESUMED THAT THE 'MISSING' LINES ARE
         *  ZERO FILLED.  This may not be what is desired - it is for
         *  transposing bits to output to a dot matrix printer where
         *  the page length is not a multiple of the number of pins.
         *  I don't know if that can ever happen.
         */

        pbBase = pbIn;
        pbOutTmp = pbOut;

        for( iWide = pRData->iTransWide; iWide > 0; iWide -= BBITS )
        {
            /*
             *    Process the bitmap byte at a time moving across, and
             *  8 scan lines high.  This corresponds to transposing an
             *  8 x 8 bit array.  We can do that quickly.
             */
            dw0 = 0;
            dw1 = 0;
            pbTemp = pbBase++;

            /*
             *    The inner loop now only transposes as many scan lines
             *  as the bitmap actually contains - we must not run off
             *  the end of memory.
             */

            for( i = iBand; --i >= 0; pbTemp += cbLine )
            {
                /*   The INNER loop - the bit swapping operations */
                dw0 <<= 1;
                dw1 <<= 1;
                pdw = pdwTrans + (*pbTemp << 1);
                dw0 |= *pdw;
                dw1 |= *(pdw + 1);

            }

            /*   Zero fill the missing bits  */
            dw0 <<= BBITS - iBand;
            dw1 <<= BBITS - iBand;

            /*   Store the two temporary values in the output buffer. */
            pbTemp = pbOutTmp;
            *pbTemp = (BYTE)dw0;

            pbTemp += iSkip;
            dw0 >>= BBITS;              /* One byte's worth */
            *pbTemp = (BYTE)dw0;

            pbTemp += iSkip;
            dw0 >>= BBITS;
            *pbTemp = (BYTE)dw0;

            pbTemp += iSkip;
            dw0 >>= BBITS;
            *pbTemp = (BYTE)dw0;

            pbTemp += iSkip;
            *pbTemp = (BYTE)dw1;

            pbTemp += iSkip;
            dw1 >>= BBITS;
            *pbTemp = (BYTE)dw1;

            pbTemp += iSkip;
            dw1 >>= BBITS;
            *pbTemp = (BYTE)dw1;

            pbTemp += iSkip;
            dw1 >>= BBITS;
            *pbTemp = (BYTE)dw1;

            pbOutTmp += BBITS * iSkip;  /* Next chunk of output data */
        }

    }

    return;
}


/*
 *   Define the number of pels transposed per loop iteration.  In the case
 * of a colour bitmap, this is 2,  since there are 4 bits per pel, thus
 * 2 per byte.
 */

#define PELS_PER_LOOP   (BBITS / 4)


/************************** Function Header *******************************
 * vTrans8N4BPP
 *      Function to transpose the input array into the output array,
 *      where the input data is to be considered N rows of bitmap data,
 *      and the output area is byte aligned.
 *      This version works on 4 bits per pel bitmaps (colour for us).
 *
 * RETURNS:
 *      Nothing
 *
 * HISTORY:
 *  15:20 on Tue 30 Jul 1991    -by-    Lindsay Harris   [lindsayh]
 *      First incarnation, based on vTrans8N.
 *
 *************************************************************************/

void
vTrans8N4BPP( pbIn,  pRData )
BYTE    *pbIn;          /* Source */
RENDER  *pRData;        /* Overall rendering info */
{
    /*
     *    The 4 bits per pel flavour of vTrans8N.  Scan lines are taken
     *  in groups of BBITS,  and each input byte in a column is looked
     *  up in the table to give a pair of DWORDs.  Scan lines are
     *  combined in pairs:  the first of a pair has the accumulators
     *  shifted right 8 bits and its table entry ORed in shifted left
     *  4 bits;  the second has its table entry ORed in unshifted.
     *    Each column produces two DWORDs,  stored iTransSkip / DWBYTES
     *  DWORDs apart.  Each successive group of scan lines starts its
     *  output one DWORD further into the output buffer.
     *    The table is presumably the nibble table set up by bInitTrans
     *  for colour - that cannot be verified from this function.
     */

    DWORD   *pdwTab;            /* The lookup table */
    DWORD   *pdwEntry;          /* Table entry for the current byte */
    DWORD   *pdwOut;            /* Output start for the current group */
    DWORD   *pdwCol;            /* Output start for the current column */
    DWORD    dwLo,  dwHi;       /* The two accumulators */

    BYTE    *pbGroup;           /* Current column,  first line of group */
    BYTE    *pbScan;            /* Walks down the scan lines */

    int      cbStride;          /* Bytes from one scan line to the next */
    int      cdwSkip;           /* Output interleave,  in DWORDs */
    int      cRowsLeft;         /* Scan lines still to process */
    int      cBitsLeft;         /* Counts down the width,  BBITS per column */
    int      iRow;


    pdwTab = pRData->Trans.pdwTransTab;
    pdwOut = pRData->pvTransBuf;
    cbStride = pRData->cbTLine;
    cdwSkip = pRData->iTransSkip / DWBYTES;


    /*
     *   First all the complete groups of scan lines.
     */

    cRowsLeft = pRData->iTransHigh;

    while( cRowsLeft >= BBITS )
    {
        pbGroup = pbIn;
        pdwCol = pdwOut;

        for( cBitsLeft = pRData->iTransWide; cBitsLeft > 0; cBitsLeft -= BBITS )
        {
            pbScan = pbGroup++;
            dwLo = 0;
            dwHi = 0;

            /*   Two scan lines per pass  */
            for( iRow = 0; iRow < BBITS; iRow += 2 )
            {
                pdwEntry = pdwTab + 2 * *pbScan;
                dwLo = (dwLo >> 8) | (pdwEntry[ 0 ] << 4);
                dwHi = (dwHi >> 8) | (pdwEntry[ 1 ] << 4);
                pbScan += cbStride;

                pdwEntry = pdwTab + 2 * *pbScan;
                dwLo |= pdwEntry[ 0 ];
                dwHi |= pdwEntry[ 1 ];
                pbScan += cbStride;
            }

            pdwCol[ 0 ] = dwLo;
            pdwCol[ cdwSkip ] = dwHi;

            pdwCol += PELS_PER_LOOP * cdwSkip;  /* Next chunk of output */
        }

        pbIn += BBITS * cbStride;       /* Down to the next group */
        ++pdwOut;                       /* Onto the next DWORD sequence */

        cRowsLeft -= BBITS;
    }


    /*
     *   Then whatever is left over:  fewer than BBITS scan lines.  Only
     *  the scan lines that exist are read,  one per pass,  and the
     *  accumulators are shifted down afterwards so that the missing
     *  scan lines come out as zeroes.
     */

    if( cRowsLeft > 0 )
    {
        pbGroup = pbIn;
        pdwCol = pdwOut;

        for( cBitsLeft = pRData->iTransWide; cBitsLeft > 0; cBitsLeft -= BBITS )
        {
            pbScan = pbGroup++;
            dwLo = 0;
            dwHi = 0;

            for( iRow = 0; iRow < cRowsLeft; iRow++ )
            {
                pdwEntry = pdwTab + 2 * *pbScan;

                if( iRow & 0x1 )
                {
                    /*  Second scan line of a pair */
                    dwLo |= pdwEntry[ 0 ];
                    dwHi |= pdwEntry[ 1 ];
                }
                else
                {
                    /*  First scan line of a pair */
                    dwLo = (dwLo >> 8) | (pdwEntry[ 0 ] << 4);
                    dwHi = (dwHi >> 8) | (pdwEntry[ 1 ] << 4);
                }

                pbScan += cbStride;
            }

            /*   One byte's worth of shift for each missing pair  */
            dwLo >>= 8 * ((BBITS - cRowsLeft) / 2);
            dwHi >>= 8 * ((BBITS - cRowsLeft) / 2);

            pdwCol[ 0 ] = dwLo;
            pdwCol[ cdwSkip ] = dwHi;

            pdwCol += 2 * cdwSkip;      /* Next chunk of output data */
        }
    }

    return;
}


/***************************** Function Header ******************************
 * vTransColSep()
 *      Function to transpose the colour bits in a 4 Bits Per Pel colour
 *      bitmap into an array of bytes,  where the bytes are ordered in
 *      the same way as the original bits.  An example of this is provided
 *      in the explanation for the SEP_TABLE_SIZE value at the top of this file.
 *
 * RETURNS:
 *      Nothing
 *
 * HISTORY:
 *  13:48 on Mon 10 Jun 1991    -by-    Lindsay Harris   [lindsayh]
 *      Time ZERO
 *
 ***************************************************************************/

void
vTransColSep( pbIn, pRData )
register  BYTE    *pbIn;        /* Source */
RENDER  *pRData;                /* Overall rendering info */
{
    /*
     *    Walk the input four bytes at a time.  Each input byte is looked
     *  up in the colour separation table (pRData->pdwColrSep), and the
     *  four table entries are merged into a single DWORD: the running
     *  value is shifted left by 2 before each of the later entries is
     *  OR'ed in.  One DWORD is stored in pRData->pvTransBuf for every
     *  four bytes consumed.
     *
     *    The number of DWORDs produced is cDWLine * iNumScans.
     *
     *    NOTE:  the output buffer and pbIn may be the same address.  All
     *  four source bytes of a group are fetched before the matching
     *  DWORD is written,  so operating in place does not corrupt data
     *  that has yet to be read.
     */

    DWORD   *pdwTable;          /* Colour separation lookup table */
    DWORD   *pdwDest;           /* Next output DWORD */
    DWORD    dwPacked;          /* DWORD being assembled */
    int      cGroups;           /* 4 byte groups still to process */


    pdwTable = pRData->pdwColrSep;
    pdwDest = pRData->pvTransBuf;
    cGroups = pRData->cDWLine * pRData->iNumScans;

    while( cGroups-- > 0 )
    {
        /*   First byte of the group ends up shifted the furthest left */
        dwPacked = pdwTable[ pbIn[ 0 ] ];
        dwPacked = (dwPacked << 2) | pdwTable[ pbIn[ 1 ] ];
        dwPacked = (dwPacked << 2) | pdwTable[ pbIn[ 2 ] ];
        dwPacked = (dwPacked << 2) | pdwTable[ pbIn[ 3 ] ];

        pbIn += 4;

        /*   Store only after all four input bytes have been read */
        *pdwDest++ = dwPacked;
    }
}


/************************** Function Header ********************************
 * vTrans8BPP
 *      The transpose function for 8 bits per pel bitmaps.  This is rather
 *      easy, as all we do is shuffle bytes!
 *
 * RETURNS:
 *      Nothing
 *
 * HISTORY:
 *  17:32 on Tue 03 Nov 1992    -by-    Lindsay Harris   [lindsayh]
 *      Initial version.
 *
 ****************************************************************************/

void
vTrans8BPP( pbIn,  pRData )
BYTE    *pbIn;          /* Source */
RENDER  *pRData;        /* Overall rendering info */
{
    /*
     *    Copy the input bitmap to the output buffer one byte at a time.
     *  Input is read sequentially along each scan line;  successive
     *  bytes of one scan line are stored iTransSkip bytes apart in the
     *  output,  and each new scan line starts one byte further into
     *  the output buffer than the previous one.
     */

    BYTE   *pbSrc;                 /* Walks along the current input scan line */
    BYTE   *pbDst;                 /* Walks down the current output column */
    BYTE   *pbColumn;              /* Where the next scan line's output begins */

    int     cScans;                /* Scan lines still to process */
    int     cWidthLeft;            /* Remaining width, reduced by BBITS per byte */
    int     cbOutStep;             /* Distance between output bytes */
    int     cbInLine;              /* Bytes per input scan line */


    /*   Local copies of the rendering parameters.  */
    cbInLine = pRData->cbTLine;
    cbOutStep = pRData->iTransSkip;
    pbColumn = pRData->pvTransBuf;

    cScans = pRData->iTransHigh;

    while( cScans > 0 )
    {
        /*   One pass of this loop handles one input scan line.  */
        pbSrc = pbIn;
        pbDst = pbColumn;

        cWidthLeft = pRData->iTransWide;

        while( cWidthLeft > 0 )
        {
            /*   One input byte is consumed per BBITS of width.  */
            *pbDst = *pbSrc;

            ++pbSrc;
            pbDst += cbOutStep;
            cWidthLeft -= BBITS;
        }

        pbIn += cbInLine;          /* Next input scan line */
        ++pbColumn;                /* Next output column */
        --cScans;
    }
}

/************************** Function Header ********************************
 * vTrans24BPP
 *      The transpose function for 24 bits per pel bitmaps.  This is rather
 *      easy, as all we do is shuffle bytes - three of them per pel!
 *
 * RETURNS:
 *      Nothing
 *
 * HISTORY:
 *  17:32 on Tue 03 Nov 1992    -by-    Lindsay Harris   [lindsayh]
 *      Initial version.
 *
 ****************************************************************************/

void
vTrans24BPP( pbIn,  pRData )
BYTE    *pbIn;          /* Source */
RENDER  *pRData;        /* Overall rendering info */
{


    /*
     *    Scan along the input bitmap,  writing the data to the output
     *  in column order,  three bytes (one pel) at a time.  Input is
     *  read sequentially along each scan line;  successive pels of one
     *  scan line are stored iTransSkip bytes apart in the output,  and
     *  each new scan line starts three bytes further into the output
     *  buffer than the previous one.
     *
     *    The number of pels per scan line is iTransWide / iBPP,  and
     *  the number of scan lines is iTransHigh.
     */

    register  BYTE   *pbBase;             /* Scan along input bitmap */
    register  BYTE   *pbOut;              /* The output scan column pointer */

    int     iBand;                 /* Count down scan lines */
    int     iSkip;                 /* Offset between output pels */
    int     iWide;                 /* Loop across the input scan line */
    int     iCol;                  /* Pels per input scan line */
    int     iRow;                  /* Number of input scan lines */
    int     cbLine;                /* Bytes per input scan line */

    BYTE   *pbOutBase;             /* Start of column of output data */


    /*
     *   Set up the local copies (for faster access) of data passed in.
     */

    iSkip = pRData->iTransSkip;
    cbLine = pRData->cbTLine;
    pbOutBase = pRData->pvTransBuf;       /* Base output buffer address */
    iCol =  pRData->iTransWide/pRData->iBPP;
    iRow = pRData->iTransHigh;

    for( iBand = iRow; iBand > 0; --iBand )
    {
        /*
         *    This loop processes scan lines in the input bitmap. As
         *  we progress across the scan line, the output data is written
         *  in column order.
         */

        pbBase = pbIn;
        pbIn += cbLine;            /* Next scan line */

        pbOut = pbOutBase;
        pbOutBase+=3;               /* One (3 byte) column across output area */

        for( iWide = iCol; iWide > 0; --iWide )
        {
            /*
             *   Copy the three bytes of one pel.  pbOut is advanced by
             *  two while doing so (the third store does not increment),
             *  so adding (iSkip - 2) afterwards leaves pbOut exactly
             *  iSkip bytes beyond the start of the pel just written -
             *  the start of the next output pel.
             */

            *pbOut++ = *pbBase++;
            *pbOut++ = *pbBase++;
            *pbOut = *pbBase++;
            pbOut += (iSkip-2);
        }
    }

    return;
}