/******************************Module*Header*********************************\
*
*                           ***************
*                           * SAMPLE CODE *
*                           ***************
*
* Module Name: Permedia.c
*
* Content:     This module implements basic access to the Permedia chip and
*              DMA transport. It shows also how to implement synchronization
*              between a display driver and the miniport interrupt by using
*              a shared buffer.
*
*              
*
* Copyright (c) 1994-1998 3Dlabs Inc. Ltd. All rights reserved.
* Copyright (c) 1995-1999 Microsoft Corporation.  All rights reserved.
\*****************************************************************************/

#include "precomp.h"
#define ALLOC_TAG ALLOC_TAG_EP2P
//----------------------------------------------------------------------------
// 
//  here some notes to the transport of data to the Permedia Fifo
//  via standard CPU writes or DMA:
//
//  The Permedia 2 chip allows to download data via three methods:
//      1. program registers by writing once to a specific address
//      2. writing an address and a data tag to a special area on the chip
//      3. writing an address and a data tag to a DMA buffer, then
//          download via DMA
//
//  The third method is preferred, because the CPU writes fastest to memory
//  and the DMA does not stall the CPU. Also many commands can be queued 
//  in a buffer while the graphic processor continues to render 
//  independently. Methods one and two need to read the space in the Input
//  Fifo before data can be written to the Fifo. The disconnect mode of the 
//  chip should not be used, because it can stall the CPU in PCI Disconnect/Retry
//  cycles, where the CPU is not even able to acknoledge an interrupt.
//  On the other hand writing to a DMA buffer introduces a latency compared
//  to write directly to the chip registers. The more data is queued in the
//  DMA buffer, the higher will be the latency. 
//
//  Methods one and two force the CPU to access the chip, which costs more 
//  PCI/AGP bus bandwidth than a DMA burst. Also sequential writes using
//  method one are less efficient, because only accesses to consecutive
//  addresses can be combined to a burst. 
//  The special FIFO area on the chip which is used for method two is 2kb
//  wide and can be written by using a memory copy. These copies can be 
//  combined to bursts by the PCI-Bridge. On processors implementing writeback
//  caches also normal writes to this area are combined to bursts. 
//  (in this driver the "Fifo" memory area on the 
//  chip is not marked as write combined, because writes to the Fifo
//  need to preserve the order). Also the data 
//  format which is written to the chip is exactly the same as in the DMA case. 
//  For that reason a very simple fallback mechanism can be implemented in case
//  the DMA doesn't work on the target system. This could be due to low memory,
//  problems in sharing interrupts, incompatible PCI devices etc.  
//
//  here is a typical piece of code sending some data to the chip:
//  
//  RESERVEDMAPTR(2);     //  wait until two entries are left in Fifo
//  LD_INPUT_FIFO(__Permedia2TagFogMode,0);    // write data        
//  LD_INPUT_FIFO(__Permedia2TagScissorMode,0);
//  COMMITDMAPTR();       //  commit write pointer for next DMA flush
//  FLUSHDMA();           //  do the actual flush (optional)
//
//  Here is a brief description of the DMA memory model:
//
//  There is one huge DMA buffer. It is organized as a ring and is typically
//  between 32kb and 256kb big. There are three main pointers  and one helper
//  handling the DMA operation. They reside in the shared memory section
//  (nonpaged) of the interrupt handler and the display driver.      
//
//
//            pulDMAPrevStart;        // start address of previous DMA
//            pulDMANextStart;        // start address of next DMA
//            pulDMAWritePos;         // address of current write pointer
//
//            pulDMAWriteEnd;         // helper address for reserve function
//
//  In the idle case all three pointers have the same value. In the above sample
//  the write pointer is incremented by two and the execute command would start
//  a 2 command long DMA and setting NextStart to the current value of WritePos and
//  PrevStart to the previous NextStart. Since there can only be one DMA active
//  at a time, a check is necessary if subsequent DMAs have finished before 
//  starting a new one. As long as there are no unfinished DMAs pending, the
//  current implementation does not use interrupts to save CPU time.
//  In the case there is still a DMA pending, a mechanism for flushing the buffer
//  is necessary without stalling the CPU. Interrupts are enabled in this case to 
//  ensure the buffer flush. The interrupt handler in the miniport can also access
//  the current pointer positions in the shared memory area. Updates to these
//  pointers have to be done carefully and synchronization between the interrupt 
//  thread and the display driver thread is necessary for some operations.
//  On multiprocessor systems, special care has to be taken to handle cases where
//  both CPUs access the shared memory area at the same time.
//
//  The access to the shared memory area is secured by calls to
//  InterlockedExchange on a variable in this area. Pointer updates like
//  the "CommitDMAPtr", which are only done one at a time by one thread
//  need not to be secured (as long as they are atomic)
//  Since the call to InterlockedExchange in the kernel
//  is also very expensive, different versions of the FlushDMA function are 
//  provided for single processor and multiprocessor environments.
//
//-------------------------------------------------------------------------

//-------------------------------------------------------------------------
//
//  here are some hints of how to vary parameters of the CPermedia class:
//
//      the DMA buffer size can be changed between
//      8kb and 256kb by setting:
//
//      #define DMA_BUFFERSIZE   0x40000 // set size between 8kb and 256kb
//
//      The 256kb allocation limit is set by VideoPortGetCommonBuffer.
//      Also the Permedia2 can only transfer 256 kb in one piece.
//      On the Alpha processor we have a limit of 8kb, because some alpha 
//      machines cannot handle DMAs which pass a 8kb page limit.
//
//-------------------------------------------------------------------------

//-------------------------------------------------------------------------
//  on x86 machines we need to call InterlockedExchange in ntoskrnl, but 
//  the display driver is only allowed to import EngXXX functions. So the 
//  VideoPort maps the function for us and we call it directly. On other 
//  platforms InterlockedExchange is implemented as inline. (in fact we 
//  are calling VideoPortInterlockedExchange)
//
#if defined(_X86_)
#define InterlockedExchange(a,b) (*pP2dma->pInterlockedExchange)(a, b)
#endif


//----------------------------------------------------------------------------
//
//  vFree()
//
//  frees allocated DMA buffer, instance count to DMA buffer will be 
//  decremented by one. if usage counts gets down to zero, 
//  the DMA buffer(s) will be freed.
//
//----------------------------------------------------------------------------

VOID vFree(P2DMA *pP2dma)
{
    ULONG   MagicNum;

    pP2dma->uiInstances--;
    if (pP2dma->uiInstances==0)
    {
        ASSERTDD(pP2dma->bEnabled == FALSE,
                 "vFree: Trying to free enabled DMA");

        if (pP2dma->pSharedDMABuffer != NULL)
        {
            FreeDMABuffer(pP2dma->hDriver, pP2dma->pSharedDMABuffer);
        }

        if (pP2dma->pEmulatedDMABuffer != NULL)
        {
            FreeEmulatedDMABuffer(pP2dma->hDriver, pP2dma->pEmulatedDMABuffer);
        }

        // Back to zeroed state retaining magic number
        MagicNum = pP2dma->ICB.ulMagicNo;
        RtlZeroMemory(pP2dma, sizeof(P2DMA));
        pP2dma->ICB.ulMagicNo = MagicNum;
    }
}

//----------------------------------------------------------------------------
//
//  bInitializeP2DMA
//
//  Initialize chip registers for use with display driver and decide if we
//  will use DMA. DMA will only be used if:
//      - the acceleration level is zero (full acc.)
//      - the miniport can map at least 8kb of DMA memory for us
//      - we get the receipt from the IRQ handler after starting a DMA
//      - x86 only: if we get the pointer to the InterlockedExchange function
//          in the videoport
//
//  TODO: parameters
//
//----------------------------------------------------------------------------

BOOL bInitializeP2DMA(P2DMA *pP2dma,
                      HANDLE hDriver, 
                      ULONG *pChipBase, 
                      DWORD dwAccelLevel,
                      BOOL NewReference
                     )
{
    ASSERTDD(pP2dma->bEnabled == FALSE,
                "bInitializeP2DMA: DMA already enabled");

    if (NewReference)
    {
        // increment usage count
        // we rely here on the fact that the videport initializes the shared
        // memory section to zero at start of day

        pP2dma->uiInstances++;

        if (pP2dma->uiInstances == 1)
        {
            ASSERTDD(pP2dma->pSharedDMABuffer == NULL,
                     "Shared DMA Buffer already allocated");
            ASSERTDD(pP2dma->pEmulatedDMABuffer == NULL,
                     "Emulated DMA Buffer already allocated");
        }
    }
    else
    {
        ASSERTDD(pP2dma->uiInstances != 0, "bInitializeP2DMA: DMA hasn't been initialized");
    }

    // save pointers to Permedia 2 registers for later use
    //
    pP2dma->pCtrlBase   = pChipBase+CTRLBASE/sizeof(ULONG);
    pP2dma->pGPFifo     = pChipBase+GPFIFO/sizeof(ULONG);

    DISPDBG((5, "Initialize: pCtrlBase=0x%p\n", pP2dma->pCtrlBase));
    DISPDBG((5, "Initialize: pGPFifo=0x%p\n", pP2dma->pGPFifo));

    BOOL bUseDMA=FALSE;

    // read number of processors we are running on:
    // If we are on a multiprocessing environment we have to take special care
    // about the synchronization of the interrupt 
    // service routine and the display driver
    
    ULONG ulNumberOfProcessors = 1; // Init to 1 by default.
    if(!g_bOnNT40)
        EngQuerySystemAttribute(EngNumberOfProcessors,
            (ULONG *)&ulNumberOfProcessors);
    DISPDBG((1,"running on %ld processor machine", 
        ulNumberOfProcessors));

    //
    // Allow DMA initialization only at full acceleration level (0) on NT5.0
    // and when the magic number of the miniport is the same as ours
    // Otherwise the miniport could use a different version of data structures
    // where the synchronization would probably fail. The magic no. is the 
    // first entry in the shared memory data structure.
    //
    if ( dwAccelLevel==0 && 
        (pP2dma->ICB.ulMagicNo==P2_ICB_MAGICNUMBER) &&
        !g_bOnNT40)
    {
        bUseDMA=TRUE;
    }

    pP2dma->hDriver=hDriver;

    //
    // On x86 machines the InterlockedExchange routine is implemented different
    // in the single- and multiprocessor versions of the kernel. So we have to
    // make sure we call the same function as the interrupt service routine in 
    // the miniport.
    // The miniport returns us a pointer to his InterlockedExchange function, 
    // which is implemented as __fastcall. Otherwise the lock could also be 
    // implemented using an x86 assembler xchg instruction, which is 
    // multiprocessor safe.
    //
    // On the Alpha architecture the compiler generates inline code for 
    // InterlockedExchange and the pointer to this function is not needed.
    //
#if defined(_X86_)
    // get pointer to InterlockedExchange in kernel
    pP2dma->pInterlockedExchange=
        (PInterlockedExchange) GetPInterlockedExchange(hDriver);
    if (pP2dma->pInterlockedExchange==NULL)
    {
        bUseDMA=FALSE;
    }
#endif

    // set DMA control status to default
    //
    WRITE_CTRL_REG(PREG_DMACONTROL,0);

    //  disable all interrupts
    //
    WRITE_CTRL_REG(PREG_INTENABLE, 0);

    // We turn the register on by default, so no entries written to the Fifo can
    // be lost. But the code checks the number of available entries anyway,
    // because when the CPU ends up in a PCI Disconnect-Retry cycle because of an
    // Fifo overflow, it would not even allow an interrupt to come through.
    WRITE_CTRL_REG(PREG_FIFODISCON, DISCONNECT_INPUT_FIFO_ENABLE);

    pP2dma->bDMAEmulation=FALSE;

    pP2dma->lDMABufferSize=0;

    pP2dma->ICB.pDMAActualBufferEnd = 
    pP2dma->ICB.pDMAWriteEnd =
    pP2dma->ICB.pDMAPrevStart=
    pP2dma->ICB.pDMANextStart=
    pP2dma->ICB.pDMAWritePos = NULL;
    pP2dma->ICB.pDMABufferEnd = 
    pP2dma->ICB.pDMABufferStart=NULL;

    //
    //  the following code first tries to allocate a reasonably sized DMA 
    //  buffer, does some initialization and fires off a DMA transfer to see 
    //  if the systems responds as expected. If the system doesn't, it falls 
    //  back to DMA emulation.
    //
    if (bUseDMA) 
    {
        //
        // preset flush and Check function pointers first
        //

//@@BEGIN_DDKSPLIT
#if !MULTITHREADED
//@@END_DDKSPLIT
        if (ulNumberOfProcessors==1)
        {
            pP2dma->pgfnFlushDMA= vFlushDMA;
            pP2dma->pgfnCheckEOB= vCheckForEOB;
        } else
//@@BEGIN_DDKSPLIT
#endif !MULTITHREADED
//@@END_DDKSPLIT
        {   
            pP2dma->pgfnFlushDMA= vFlushDMAMP;
            pP2dma->pgfnCheckEOB= vCheckForEOBMP;
        }

        // Allocate the DMA buffer shared with videoport
        // if we haven't previously allocated one.
        if (pP2dma->pSharedDMABuffer == NULL)
        {
            // allocate a buffer between 8kb and 256kb
            pP2dma->lSharedDMABufferSize = DMACMDSIZE;

            //
            //  allocate the DMA buffer in the videoport
            //
            if (AllocateDMABuffer(  pP2dma->hDriver, 
                                    &pP2dma->lSharedDMABufferSize, 
                                    &pP2dma->pSharedDMABuffer,
                                    &pP2dma->ICB.liDMAPhysAddr))
            {
                // for now we limit DMA Buffer size on alpha to 8kb, because
                // of hardware problems on some Miata machines
#if defined(_ALPHA_)
                ASSERTDD(pP2dma->lSharedDMABufferSize<=0x2000,
                         "DMA Buffer too big for alpha, fix constants!");
#endif
                if (pP2dma->lSharedDMABufferSize < DMACMDMINSIZE)
                {
                    DISPDBG((0,"allocated %ld bytes for DMA, not enough! No DMA!", 
                             pP2dma->lSharedDMABufferSize));

                    FreeDMABuffer(  pP2dma->hDriver, 
                                    pP2dma->pSharedDMABuffer);

                    pP2dma->pSharedDMABuffer = NULL;
                }
            }
            else
            {
                DISPDBG((0,"couldn't allocate memory for DMA"));
                pP2dma->pSharedDMABuffer = NULL;
            }
        }

        // Make sure we have a shared DMA buffer
        if (pP2dma->pSharedDMABuffer == NULL)
        {
            bUseDMA=FALSE;
        }
        else
        {
            // we always do "ULONG" arithmetics in the DMA routines
            pP2dma->lDMABufferSize=pP2dma->lSharedDMABufferSize/sizeof(ULONG);

            pP2dma->ICB.ulControl=0;

            pP2dma->ICB.pDMABufferStart = pP2dma->pSharedDMABuffer;
            pP2dma->ICB.pDMAActualBufferEnd = 
            pP2dma->ICB.pDMABufferEnd = 
                pP2dma->ICB.pDMABufferStart+
                pP2dma->lDMABufferSize;

            pP2dma->ICB.pDMAWriteEnd =
                pP2dma->ICB.pDMABufferEnd;
            pP2dma->ICB.pDMAPrevStart=
            pP2dma->ICB.pDMANextStart=
            pP2dma->ICB.pDMAWritePos =
                pP2dma->ICB.pDMABufferStart;


            // check if we get an interrupt...
            // clear the flags before we check for a DMA
            WRITE_CTRL_REG( PREG_ERRORFLAGS, 0xffffffffl);

            //
            // clear DMA, VSync and Error interrupt flags
            //
            WRITE_CTRL_REG( PREG_INTFLAGS, PREG_INTFLAGS_DMA|
                                           PREG_INTFLAGS_VS|
                                           PREG_INTFLAGS_ERROR);
            //
            //  enable DMA interrupts
            //
            WRITE_CTRL_REG( PREG_INTENABLE, PREG_INTFLAGS_DMA);

            BOOL bIRQsOk=FALSE;
            DWORD dwTimeOut=5;

            // send a small sequence and see if we get a response 
            // by the interrupt handler
            //
            pP2dma->bEnabled = TRUE;

            PULONG pTmp=ReserveDMAPtr(pP2dma,10);
            LD_INPUT_FIFO(__Permedia2TagDeltaMode, 0);
            LD_INPUT_FIFO(__Permedia2TagColorDDAMode, 0);
            LD_INPUT_FIFO(__Permedia2TagScissorMode, 0);
            LD_INPUT_FIFO(__Permedia2TagTextureColorMode, 0);
            LD_INPUT_FIFO(__Permedia2TagFogMode, 0);
            CommitDMAPtr(pP2dma,pTmp);
            vFlushDMAMP(pP2dma);

            pP2dma->bEnabled = FALSE;

            //
            //  The videoport IRQ service routine marks ulControl
            //  on a DMA Interrupt
            //
            while (!(pP2dma->ICB.ulControl & DMA_INTERRUPT_AVAILABLE))
            {
                // wait for some Vsyncs here, then continue
                // 
                if (READ_CTRL_REG( PREG_INTFLAGS) & PREG_INTFLAGS_VS)
                {
                    WRITE_CTRL_REG( PREG_INTFLAGS, PREG_INTFLAGS_VS);

                    if (--dwTimeOut==0) 
                        break;
                }
            } 

            // interrupt service is ok if the IRQ handler marked the flag
            //
            bIRQsOk=pP2dma->ICB.ulControl & DMA_INTERRUPT_AVAILABLE;

            if (!bIRQsOk)
            {
                // disable IRQs and go back to emulation...
                // 
                WRITE_CTRL_REG( PREG_INTENABLE, 0);
                bUseDMA=FALSE;

                pP2dma->lDMABufferSize=0;

                pP2dma->ICB.pDMAActualBufferEnd = 
                pP2dma->ICB.pDMAWriteEnd =
                pP2dma->ICB.pDMAPrevStart=
                pP2dma->ICB.pDMANextStart=
                pP2dma->ICB.pDMAWritePos = NULL;
                pP2dma->ICB.pDMABufferEnd = 
                pP2dma->ICB.pDMABufferStart=NULL;

                DISPDBG((0,"no interrupts available...no DMA available"));
            }
            else
            {
                // VS IRQs can be turned off for now.
                // but enable DMA and Error interrupts
                pP2dma->ulIntFlags=PREG_INTFLAGS_DMA|PREG_INTFLAGS_ERROR;
                WRITE_CTRL_REG(PREG_INTENABLE, pP2dma->ulIntFlags);
                WRITE_CTRL_REG(PREG_INTFLAGS, PREG_INTFLAGS_ERROR);

                DISPDBG((2,"allocated %ld bytes for DMA, interrupts ok", 
                    pP2dma->lDMABufferSize*4));
            }

        }

    }

    if (!bUseDMA)
    {
        // DMA didn't work, then try to allocate memory for DMA emulation
        pP2dma->pgfnFlushDMA= vFlushDMAEmulation;
        pP2dma->pgfnCheckEOB= vCheckForEOBEmulation;

        if (pP2dma->pEmulatedDMABuffer == NULL)
        {
            pP2dma->lEmulatedDMABufferSize=DMACMDMINSIZE;

            pP2dma->pEmulatedDMABuffer=
                AllocateEmulatedDMABuffer( pP2dma->hDriver,
                                           pP2dma->lEmulatedDMABufferSize,
                                           ALLOC_TAG);

            if (pP2dma->pEmulatedDMABuffer == NULL)
            {
                DISPDBG((0,"failed to run in DMA emulation mode"));
                return FALSE;
            }
        }

        DISPDBG((0,"running in DMA emulation mode"));

        pP2dma->bDMAEmulation=TRUE;

        pP2dma->lDMABufferSize = pP2dma->lEmulatedDMABufferSize/sizeof(ULONG);

        pP2dma->ICB.pDMABufferStart = pP2dma->pEmulatedDMABuffer;
        pP2dma->ICB.pDMAActualBufferEnd = 
        pP2dma->ICB.pDMABufferEnd = 
            pP2dma->ICB.pDMABufferStart+
            pP2dma->lDMABufferSize;

        pP2dma->ICB.pDMAWriteEnd = 
            pP2dma->ICB.pDMABufferEnd;
        pP2dma->ICB.pDMAPrevStart=
        pP2dma->ICB.pDMANextStart=
        pP2dma->ICB.pDMAWritePos = 
            pP2dma->ICB.pDMABufferStart;

    }

    pP2dma->bEnabled = TRUE;
    
    return TRUE;
}



//----------------------------------------------------------------------------
//
//  vSyncWithPermedia
//
//  Send a sync tag through the Permedia and make sure all pending reads and 
//  writes are flushed from the graphics pipeline. 
//
//  MUST be called before accessing the Frame Buffer directly
//
//----------------------------------------------------------------------------

VOID vSyncWithPermedia(P2DMA *pP2dma)
{ 
    PULONG pTmp;        // pointer for pTmp in macros

    ASSERTDD(pP2dma->bEnabled, "vSyncWithPermedia: not enabled");

    pTmp=ReserveDMAPtr(pP2dma,6);

    // let the filter tag walk through the whole core
    // by setting the filter mode to passthrough
    //
    LD_INPUT_FIFO(__Permedia2TagFilterMode, 0x400);
    LD_INPUT_FIFO(__Permedia2TagSync, 0L); 
    LD_INPUT_FIFO(__Permedia2TagFilterMode, 0x0); 

    CommitDMAPtr(pP2dma,pTmp);

    (pP2dma->pgfnFlushDMA)(pP2dma);

    vWaitDMAComplete(pP2dma);

    ULONG   ulSync; 

    //
    // now wait until the sync tag has walked through the
    // graphic core and shows up at the output
    //
    do { 
        if (lWaitOutputFifoReady(pP2dma)==0) break; 
        ulSync=READ_CTRL_REG(PREG_FIFOINTERFACE);
    } while (ulSync != __Permedia2TagSync); 

}


//----------------------------------------------------------------------------
//
//  vWaitDMAComplete
//
//  Flush the DMA Buffer and wait until all data is at least sent to the chip.
//  Does not wait until the graphics pipeline is idle.
//
//----------------------------------------------------------------------------

VOID vWaitDMAComplete(P2DMA *pP2dma)
{
    while ( READ_CTRL_REG(PREG_INDMACOUNT)!=0 || 
            pP2dma->ICB.pDMAWritePos!=pP2dma->ICB.pDMANextStart || 
            pP2dma->ICB.pDMAPrevStart!=pP2dma->ICB.pDMANextStart)
    {

        if (READ_CTRL_REG(PREG_INDMACOUNT)!=0) 
        {
            // stall for 1 us
            // we shouldn't access the P2 chip here too often, because
            // reading from the DMA register too often would stall an
            // ongoing DMA transfer. So we better wait for a microsecond.
            // Also we eat up less PCI bus bandwidth by polling only every
            // 1 microsecond.
            // 
            StallExecution( pP2dma->hDriver, 1);
        }
        (pP2dma->pgfnFlushDMA)(pP2dma);
    }

}


//----------------------------------------------------------------------------
//
//  vBlockLoadInputFifo
//
//  pP2dma-----shared 
//  uiTag------register tag to write the data to
//  pImage-----pointer to data
//  lWords-----number of pixels to transfer
//
//  download a block of data with lWords pixels
//  to register uiTag from buffer at pImage. The size of the source pixels
//  are DWORDS.
//
//----------------------------------------------------------------------------


VOID vBlockLoadInputFifo( P2DMA *pP2dma, ULONG uiTag, ULONG *pImage, LONG lWords)
{
    ASSERTDD(pP2dma->bEnabled, "vBlockLoadInputFifo: not enabled");
    
    while (lWords>0)
    {
        PULONG pTmp=ReserveDMAPtr(pP2dma,MAXINPUTFIFOLENGTH);
        LONG lBufferEntries=GetFreeEntries(pP2dma)-1;

        if (lWords < lBufferEntries)
        {
            lBufferEntries = lWords;
        }

        *pTmp++ = uiTag | ((lBufferEntries-1) << 16);

        lWords -= lBufferEntries;

        while (lBufferEntries--)
        {
            *pTmp++=*pImage++;
        }

        CommitDMAPtr(pP2dma,pTmp);
        (pP2dma->pgfnFlushDMA)(pP2dma);
    }
}


//----------------------------------------------------------------------------
//
//  lWaitOutputFifoReady
//
//  return---number of words ready in output fifo
//
//  Wait until some data appears at the output Fifo of the P2. Flush DMA 
//  if necessary. 
//
//----------------------------------------------------------------------------

LONG lWaitOutputFifoReady(P2DMA *pP2dma)
{
    ULONG    x=1000000L;    // equals a timeout of 1s
    ULONG   uiResult;
    while ((uiResult=READ_CTRL_REG(PREG_OUTFIFOWORDS)) == 0)
    {
        if (x-- == 0) 
        {
            // we will end up here if nothing shows up at the output
            // Usually a download operation did not provide the right
            // amount of data if we end up here
            ASSERTDD( FALSE, "chip output fifo timed out");

            break;
        }

        // Make sure we do not read from the control register too often
        // when waiting. Permanent reading from the chip can stall DMA
        // downloads
        if (READ_CTRL_REG(PREG_INDMACOUNT)!=0)
            StallExecution( pP2dma->hDriver, 1);  // stall 1us if DMA still busy
        else
            (pP2dma->pgfnFlushDMA)(pP2dma);  // make sure buffer is flushed

    }
    return uiResult;
}


//----------------------------------------------------------------------------
//
//  vFlushDMA
//
//      single processor version of FlushDMA
//
//  vFlushDMAMP
//
//      multiprocessor version of FlushDMA
//
//  vFlushDMAEmulation
//
//      buffer flush using DMA emulation, where the normal DMA doesn't work
//
//  This routine really kicks off DMAs and handles synchronization with the
//  miniport interrupt service routine.
//
//  several scenarios can happen:
//      1.) DMA is inactive, then just kick off the data currently in the 
//          buffer
//          a) WritePos > NextStart, kick off DMA
//          a) otherwise we wrap around, just flush to buffer end
//
//      2.) DMA still active, make sure interrupts are started and let
//          the interrupt handler 
//
//  The synchronization between this routine and the miniport is essential
//  for our DMA model to work on Multiprocessor machines. The display driver
//  is single threaded, but the miniport interrupt handler can be called 
//  any time and be processed by another CPU. For that reason we loop with 
//  InterlockedExchange until we get the lock. The interrupt handler behaves 
//  a bit different. Since we don't want an interrupt being stalled, it just
//  falls through doing nothing when it cannot get the lock, since then the
//  DMA start will be handled by the display driver anyway. 
//
//  For the single processor case InterlockedExchange needs not to be called.
//  A simple assignment instead of the lock is enough.
//
//----------------------------------------------------------------------------

//----------------------------------------------------------------------------
//
//  VOID vFlushDMAMP()
//
//  multiprocessor safe version of FlushDMA. Its basically the same as the single
//  processor version, but we are calling here the expensive InterlockedExchange
//  functions to lock the shared memory section    
//
//----------------------------------------------------------------------------

VOID vFlushDMAMP(P2DMA *pP2dma)
{
    ASSERTDD(pP2dma->bEnabled, "vFlushDMAMP: not enabled");
    
    ASSERTDD(!pP2dma->bDMAEmulation, "FlushDMA called with DMA mode disabled");
    ASSERTDD(pP2dma->ICB.pDMAWritePos<=
        pP2dma->ICB.pDMABufferEnd,"Index exceeds buffer limit");
    ASSERTDD(pP2dma->ICB.pDMANextStart<=
        pP2dma->ICB.pDMABufferEnd,"NextStart exceeds buffer limit!");

    // lock the access to the shared memory section first
    while (InterlockedExchange((PLONG)&pP2dma->ICB.ulICBLock,TRUE))
        ;
 
    // check if DMA channel is still busy, count is zero if not
    if (READ_CTRL_REG(PREG_INDMACOUNT)==0)
    {
        // this code is called frequently. To help the processors branch
        // prediction the most common case should be reached 
        // without a cond. jump

        if (pP2dma->ICB.pDMAWritePos>pP2dma->ICB.pDMANextStart)
        {
            // This is the most common case for DMA start
            // set Permedia 2 DMA unit to fire the DMA
            WRITE_CTRL_REG( PREG_INDMAADDRESS, (ULONG)
                           (pP2dma->ICB.liDMAPhysAddr.LowPart+
                            (pP2dma->ICB.pDMANextStart-
                             pP2dma->ICB.pDMABufferStart)*sizeof(ULONG)));
            WRITE_CTRL_REG( PREG_INDMACOUNT, (ULONG)
                           (pP2dma->ICB.pDMAWritePos-
                            pP2dma->ICB.pDMANextStart));

            // in this case we always continue to fill to buffer end,
            // iterate the other pointers
            pP2dma->ICB.pDMAWriteEnd=pP2dma->ICB.pDMABufferEnd;
            pP2dma->ICB.pDMAPrevStart=pP2dma->ICB.pDMANextStart;
            pP2dma->ICB.pDMANextStart=pP2dma->ICB.pDMAWritePos;

            // free the shared memory lock
            InterlockedExchange((PLONG)&pP2dma->ICB.ulICBLock,FALSE);

            return;

        } else if (pP2dma->ICB.pDMAWritePos<pP2dma->ICB.pDMANextStart)       
        {
            // wraparound case: the write pointer already wrapped around 
            // to the beginning and we finish up to the end of the buffer.
            WRITE_CTRL_REG( PREG_INDMAADDRESS, (ULONG)
                           (pP2dma->ICB.liDMAPhysAddr.LowPart+
                           (pP2dma->ICB.pDMANextStart-
                            pP2dma->ICB.pDMABufferStart)*sizeof(ULONG)));
            WRITE_CTRL_REG( PREG_INDMACOUNT, (ULONG)
                           (pP2dma->ICB.pDMAActualBufferEnd-
                            pP2dma->ICB.pDMANextStart));

            // reset buffer size back to full length for next round
            pP2dma->ICB.pDMAActualBufferEnd=pP2dma->ICB.pDMABufferEnd;
            
            // in this case we don't want the write pointer 
            // to catch up to last start...
            pP2dma->ICB.pDMAWriteEnd=pP2dma->ICB.pDMANextStart-1;

            // iterate last and next start pointer:
            pP2dma->ICB.pDMAPrevStart=pP2dma->ICB.pDMANextStart;
            pP2dma->ICB.pDMANextStart=pP2dma->ICB.pDMABufferStart;

            // free the shared memory lock
            InterlockedExchange((PLONG)&pP2dma->ICB.ulICBLock,FALSE);

            return;

        } else     // nothing to do
        {
            pP2dma->ICB.pDMAWriteEnd=pP2dma->ICB.pDMABufferEnd;
            pP2dma->ICB.pDMAPrevStart=pP2dma->ICB.pDMANextStart;
        }

        // free the shared memory lock
        InterlockedExchange((PLONG)&pP2dma->ICB.ulICBLock,FALSE);

        return;

    } else
    {
        // the index pointer has been passed to IRQ service routine, nothing more to do..
        // 

        // unlock shared section, 
        InterlockedExchange((PLONG)&pP2dma->ICB.ulICBLock,FALSE);

        // now we are filling the DMA buffer faster than the hardware
        // can follow up and we want to make sure that the DMA channel
        // keeps being busy and start the interrupt handler
       
        WRITE_CTRL_REG( PREG_INTFLAGS, PREG_INTFLAGS_DMA);
        WRITE_CTRL_REG( PREG_INTENABLE, pP2dma->ulIntFlags );

        return;
    } 
}


//----------------------------------------------------------------------------
//
//  VOID vFlushDMA()
//
//  single processor version of FlushDMA.
//
//----------------------------------------------------------------------------

VOID vFlushDMA(P2DMA *pP2dma)
{
    ASSERTDD(pP2dma->bEnabled, "vFlushDMA: not enabled");
    ASSERTDD(!pP2dma->bDMAEmulation, "FlushDMA called with DMA mode disabled");
    ASSERTDD(pP2dma->ICB.pDMAWritePos<=
        pP2dma->ICB.pDMABufferEnd,"Index exceeds buffer limit");
    ASSERTDD(pP2dma->ICB.pDMANextStart<=
        pP2dma->ICB.pDMABufferEnd,"NextStart exceeds buffer limit!");

    // lock the access to the shared memory section first
    pP2dma->ICB.ulICBLock=TRUE;
 
    // check if DMA channel is still busy, count is zero if not
    if (READ_CTRL_REG(PREG_INDMACOUNT)==0)
    {
        // this code is called frequently. To help the processors branch
        // prediction the most common case should be reached 
        // without a cond. jump

        if (pP2dma->ICB.pDMAWritePos>pP2dma->ICB.pDMANextStart)
        {
            // This is the most common case for DMA start
            // set Permedia 2 DMA unit to fire the DMA
            WRITE_CTRL_REG( PREG_INDMAADDRESS, (ULONG)
                            (pP2dma->ICB.liDMAPhysAddr.LowPart+
                             (pP2dma->ICB.pDMANextStart-
                              pP2dma->ICB.pDMABufferStart)*sizeof(ULONG)));
            WRITE_CTRL_REG( PREG_INDMACOUNT, (ULONG)
                           (pP2dma->ICB.pDMAWritePos-
                            pP2dma->ICB.pDMANextStart));

            // in this case we always continue to fill to buffer end,
            // iterate the other pointers
            pP2dma->ICB.pDMAWriteEnd=pP2dma->ICB.pDMABufferEnd;
            pP2dma->ICB.pDMAPrevStart=pP2dma->ICB.pDMANextStart;
            pP2dma->ICB.pDMANextStart=pP2dma->ICB.pDMAWritePos;

            // free the shared memory lock
            pP2dma->ICB.ulICBLock=FALSE;

            return;

        } else if (pP2dma->ICB.pDMAWritePos<pP2dma->ICB.pDMANextStart)       
        {
            // wraparound case: the write pointer already wrapped around 
            // to the beginning and we finish up to the end of the buffer.
            WRITE_CTRL_REG( PREG_INDMAADDRESS, (ULONG)
                           (pP2dma->ICB.liDMAPhysAddr.LowPart+
                           (pP2dma->ICB.pDMANextStart-
                            pP2dma->ICB.pDMABufferStart)*sizeof(ULONG)));
            WRITE_CTRL_REG( PREG_INDMACOUNT, (ULONG)
                           (pP2dma->ICB.pDMAActualBufferEnd-
                            pP2dma->ICB.pDMANextStart));

            // reset buffer size back to full length for next round
            pP2dma->ICB.pDMAActualBufferEnd=pP2dma->ICB.pDMABufferEnd;
            
            // in this case we don't want the write pointer 
            // to catch up to last start...
            pP2dma->ICB.pDMAWriteEnd=pP2dma->ICB.pDMANextStart-1;

            // iterate last and next start pointer:
            pP2dma->ICB.pDMAPrevStart=pP2dma->ICB.pDMANextStart;
            pP2dma->ICB.pDMANextStart=pP2dma->ICB.pDMABufferStart;

            // free the shared memory lock
            pP2dma->ICB.ulICBLock=FALSE;

            return;

        } else     // nothing to do
        {
            pP2dma->ICB.pDMAWriteEnd=pP2dma->ICB.pDMABufferEnd;
            pP2dma->ICB.pDMAPrevStart=pP2dma->ICB.pDMANextStart;
        }

        // free the shared memory lock
        pP2dma->ICB.ulICBLock=FALSE;

        return;

    } else
    {
        // the index pointer has been passed to IRQ service routine, nothing more to do..
        // 

        // unlock shared section, 
        pP2dma->ICB.ulICBLock=FALSE;

        // now we are filling the DMA buffer faster than the hardware
        // can follow up and we want to make sure that the DMA channel
        // keeps being busy and start the interrupt handler
       
        WRITE_CTRL_REG( PREG_INTFLAGS, PREG_INTFLAGS_DMA);
        WRITE_CTRL_REG( PREG_INTENABLE, pP2dma->ulIntFlags );

        return;
    } 
}


//----------------------------------------------------------------------------
// 
//  vFlushDMAEmulation
//
//  this version of FlushDMA emulates the DMA copy and 
//  lets the CPU copy the data
//
//----------------------------------------------------------------------------

VOID vFlushDMAEmulation(P2DMA *pP2dma)
{
    ASSERTDD(pP2dma->bEnabled, "vFlushDMAEmulation: not enabled");
    DISPDBG((10,"Emu::FlushDMA: Write: %04lx Next: %04lx Prev: %04lx End: %04lx",
        pP2dma->ICB.pDMAWritePos, pP2dma->ICB.pDMANextStart,
        pP2dma->ICB.pDMAPrevStart, pP2dma->ICB.pDMABufferEnd));
    ASSERTDD(pP2dma->bDMAEmulation, "FlushDMA called with DMA mode disabled");

    ULONG *pData=pP2dma->ICB.pDMABufferStart;
    ULONG *pDst;
    LONG   lWords=(LONG)(pP2dma->ICB.pDMAWritePos-pP2dma->ICB.pDMABufferStart);

    while (lWords > 0)
    {
        LONG lFifoSpace=(LONG)READ_CTRL_REG(PREG_INFIFOSPACE);
        if (lWords<lFifoSpace) lFifoSpace=lWords;
        lWords -= lFifoSpace;
        pDst = pP2dma->pGPFifo;
        while (lFifoSpace--)
        {
            WRITE_REGISTER_ULONG(pDst++,*pData++); 
            MEMORY_BARRIER();
        }
    }    

    pP2dma->ICB.pDMAWritePos=pP2dma->ICB.pDMANextStart=
        pP2dma->ICB.pDMAPrevStart=pP2dma->ICB.pDMABufferStart;
    pP2dma->ICB.pDMAWriteEnd=pP2dma->ICB.pDMABufferEnd;
}

//----------------------------------------------------------------------------
//
//  bDrawEngineBusy
//
//  check if P2 is still busy drawing. 
//
//  return----  TRUE  P2 is still busy
//              FALSE P2 has finished drawing and is not busy anymore
//
//----------------------------------------------------------------------------

BOOL bDrawEngineBusy(P2DMA *pP2dma)
{
    if (READ_CTRL_REG(PREG_INDMACOUNT)!=0) return TRUE;   

    if (READ_CTRL_REG(PREG_FIFODISCON) & PREG_FIFODISCON_GPACTIVE)
    {
        return TRUE;
    }

    return FALSE;
}



//----------------------------------------------------------------------------
//
//  bInVerticalRetrace
//
//  Return----- TRUE if beam position is within current vertical sync.
//              FALSE otherwise
//
//----------------------------------------------------------------------------

BOOL bInVerticalRetrace(PPDev ppdev)
{
    return P2_READ_CTRL_REG(PREG_LINECOUNT) < P2_READ_CTRL_REG(PREG_VBEND);
}

//----------------------------------------------------------------------------
//
//  lCurrentLine
//
//  returns current line of beam on display
//
//----------------------------------------------------------------------------

LONG lCurrentLine(PPDev ppdev)
{
    LONG lScanline=P2_READ_CTRL_REG(PREG_LINECOUNT)-P2_READ_CTRL_REG(PREG_VBEND);
    if (lScanline<0) return 0;
    return lScanline;
}

//----------------------------------------------------------------------------
//
//  vCheckFOREOB (End of Buffer)
//
//  Check if buffer end would be overrun and adjust actual buffer size.
//  The buffer size will be restored when the DMA handler passes the wrap 
//  around.
//
//----------------------------------------------------------------------------

VOID vCheckForEOBEmulation( P2DMA *pP2dma, LONG lEntries)
{
    vFlushDMAEmulation(pP2dma);
}

//
//  multiprocessor safe version of vCheckForEOB
//

VOID vCheckForEOBMP( P2DMA *pP2dma, LONG lEntries)
{
    // check for overrun condition over the buffer end:
    // if we would exceed the current buffer size, 
    // LastStart has already wrapped around (LastStart<=writepos)
    // but is not at the wraparound position
    // and the buffer size was already reset to the full size

    if (pP2dma->ICB.pDMAWritePos+lEntries >= pP2dma->ICB.pDMABufferEnd && 
        pP2dma->ICB.pDMAPrevStart<=pP2dma->ICB.pDMAWritePos &&            
        pP2dma->ICB.pDMAPrevStart!=pP2dma->ICB.pDMABufferStart)   
    {
        DISPDBG((10,"wrap condition before: %04lx %04lx %04lx", 
            pP2dma->ICB.pDMAWritePos, 
            pP2dma->ICB.pDMANextStart, 
            pP2dma->ICB.pDMAPrevStart));

        while (InterlockedExchange((PLONG)&pP2dma->ICB.ulICBLock,TRUE))
            ;

        if (pP2dma->ICB.pDMAWritePos==pP2dma->ICB.pDMANextStart)
        {
            // special case one:
            // NextStart equals LastStart, so we just reset Index and Next
            // to the buffer start and see if we have enough space
            pP2dma->ICB.pDMANextStart=pP2dma->ICB.pDMABufferStart;
        } else
        {
            // index exceeds buffer end on the next block, but there is
            // a DMA pending to the current position of Index. Set Buffer
            // end temporarily to the current index.
            pP2dma->ICB.pDMAActualBufferEnd = pP2dma->ICB.pDMAWritePos; 
        }

        // wrap index around and see if there are enought free entries
        pP2dma->ICB.pDMAWriteEnd=pP2dma->ICB.pDMAPrevStart-1;
        pP2dma->ICB.pDMAWritePos=pP2dma->ICB.pDMABufferStart;

        InterlockedExchange((PLONG)&pP2dma->ICB.ulICBLock,FALSE);

        DISPDBG((10,"wrap condition after: %04lx %04lx %04lx", 
            pP2dma->ICB.pDMAWritePos, 
            pP2dma->ICB.pDMANextStart, 
            pP2dma->ICB.pDMAPrevStart));
    }
    vFlushDMAMP(pP2dma);
}

VOID vCheckForEOB( P2DMA *pP2dma, LONG lEntries)
{
    // check for overrun condition over the buffer end:
    // if we would exceed the current buffer size, 
    // LastStart has already wrapped around (LastStart<=writepos)
    // but is not at the wraparound position
    // and the buffer size was already reset to the full size

    if (pP2dma->ICB.pDMAWritePos+lEntries >= pP2dma->ICB.pDMABufferEnd && 
        pP2dma->ICB.pDMAPrevStart<=pP2dma->ICB.pDMAWritePos &&            
        pP2dma->ICB.pDMAPrevStart!=pP2dma->ICB.pDMABufferStart)   
    {
        DISPDBG((10,"wrap condition before: %04lx %04lx %04lx", 
            pP2dma->ICB.pDMAWritePos, 
            pP2dma->ICB.pDMANextStart, 
            pP2dma->ICB.pDMAPrevStart));

        pP2dma->ICB.ulICBLock=TRUE;

        if (pP2dma->ICB.pDMAWritePos==pP2dma->ICB.pDMANextStart)
        {
            // special case one:
            // NextStart equals LastStart, so we just reset Index and Next
            // to the buffer start and see if we have enough space
            pP2dma->ICB.pDMANextStart=pP2dma->ICB.pDMABufferStart;
        } else
        {
            // index exceeds buffer end on the next block, but there is
            // a DMA pending to the current position of Index. Set Buffer
            // end temporarily to the current index.
            pP2dma->ICB.pDMAActualBufferEnd = pP2dma->ICB.pDMAWritePos; 
        }

        // wrap index around and see if there are enought free entries
        pP2dma->ICB.pDMAWriteEnd=pP2dma->ICB.pDMAPrevStart-1;
        pP2dma->ICB.pDMAWritePos=pP2dma->ICB.pDMABufferStart;

        pP2dma->ICB.ulICBLock=FALSE;

        DISPDBG((10,"wrap condition after: %04lx %04lx %04lx", 
            pP2dma->ICB.pDMAWritePos, 
            pP2dma->ICB.pDMANextStart, 
            pP2dma->ICB.pDMAPrevStart));
    }
    vFlushDMA(pP2dma);
}

#if DBG

//----------------------------------------------------------------------------
//
//  ReserveDMAPtr
//
//  return a pointer to current position in DMA buffer. The function guarantees
//  that there are at least lEntries available in the buffer.
//  Otherwise the caller can ask GetFreeEntries and adjust the download to 
//  batch more entries. The caller MUST call CommitDMAPtr after a call to
//  to ReserveDMAPtr to readjust the Index pointer.
//
//----------------------------------------------------------------------------

ULONG *ReserveDMAPtr(P2DMA *pP2dma, const LONG lEntries)
{
    ASSERTDD(pP2dma->bEnabled, "ReserveDMAPtr: not enabled");
    ASSERTDD(pP2dma->lDBGState==0,
        "ReserveDMAPtr called, but previous called was not closed");
//@@BEGIN_DDKSPLIT
#if MULTITHREADED
    ASSERTDD(pP2dma->ppdev != NULL, "ReserveDMAPtr: pP2dma->ppdev = NULL");
#endif
    ASSERTLOCK(pP2dma->ppdev, ReserveDMAPtr);
//@@END_DDKSPLIT

    pP2dma->lDBGState=2;

    while (pP2dma->ICB.pDMAWritePos+lEntries>=
           pP2dma->ICB.pDMAWriteEnd)
    {
        (*pP2dma->pgfnCheckEOB)(pP2dma,lEntries);
    }   

    if (lEntries<MAXINPUTFIFOLENGTH)
        pP2dma->pDBGReservedEntries=
            (ULONG *)(lEntries+pP2dma->ICB.pDMAWritePos);
    else
        pP2dma->pDBGReservedEntries=NULL;

    return (ULONG *)pP2dma->ICB.pDMAWritePos;  
}


//----------------------------------------------------------------------------
//
//  CommitDMAPtr
//
//  pDMAPtr----DMA buffer address to which the caller has written to.
//
//  Readjust write pointer after being reserved by ReserveDMAPtr. 
//  By committing the pointer a DMA to the committed position could already
//  be started by interrupt handler!
//
//----------------------------------------------------------------------------

VOID CommitDMAPtr(P2DMA *pP2dma,ULONG *pDMAPtr)
{
    ASSERTDD(pP2dma->bEnabled, "CommitDMAPtr: not enabled");
    ASSERTDD(pP2dma->lDBGState==2,
        "CommitDMAPtr called, but previous without calling Reserve before");
    pP2dma->lDBGState=0;
    if (pDMAPtr==NULL) return;

    pP2dma->ICB.pDMAWritePos=pDMAPtr;

    ASSERTDD(pP2dma->ICB.pDMAWritePos<=
        pP2dma->ICB.pDMABufferEnd,"CommitDMAPtr: DMA buffer overrun");

    if (pP2dma->pDBGReservedEntries!=NULL)
    {
        ASSERTDD(pP2dma->ICB.pDMAWritePos<=pP2dma->pDBGReservedEntries, 
            "reserved not enough entries in ReserveDMAPtr");
    }
}


//----------------------------------------------------------------------------
//
//  GetFreeEntries
//
//  Get free entries available for consecutive writing to the DMA buffer.
//  The maximum number of returned entries is now MAXBLKSIZE.
// 
//  returns---number of available entries in ULONGS
//
//----------------------------------------------------------------------------

LONG  GetFreeEntries(P2DMA *pP2dma)
{   
    LONG EntriesAvailable;
    ASSERTDD(pP2dma->bEnabled, "GetFreeEntries: not enabled");
    EntriesAvailable = (LONG)(pP2dma->ICB.pDMAWriteEnd - pP2dma->ICB.pDMAWritePos);
    return min(MAXBLKSIZE,EntriesAvailable);
}
#endif