/*++

Copyright (c) 1995  Digital Equipment Corporation

Module Name:

    ioderr.c

Abstract:

    This module implements error handling functions for the Rawhide
    IOD (CAP and MDP ASICs).

Author:

    Eric Rehm 13-Apr-1995

Environment:

    Kernel mode

Revision History:


--*/

#include "halp.h"
//#include "iod.h"
#include "rawhide.h"
#include "stdio.h"

//
// Externals and globals.
//

// Pointer to the uncorrectable error frame.  May be NULL; it is tested
// before use in HalpIodReportFatalError().
extern PERROR_FRAME PUncorrectableError;
// NOTE(review): not referenced in the visible part of this module;
// presumably a global override for PCI parity checking - confirm.
extern ULONG HalDisablePCIParityChecking;
// NOTE(review): not referenced in the visible part of this module;
// presumably a running count of corrected IOD errors - confirm.
ULONG IodCorrectedErrors = 0;

//
// Define the type of a second-level dispatch routine: a function that
// takes the interrupt object and a service context and returns a BOOLEAN.
//

typedef BOOLEAN  (*PSECOND_LEVEL_DISPATCH)(
    PKINTERRUPT InterruptObject,
    PVOID ServiceContext
    );

//
// The Soft Error interrupt is always turned on for Rawhide.  When a
// Soft Error interrupt occurs, HalpIodSoftErrorInterrupt() must
// be called to reset the error condition on the offending IOD to
// ensure system integrity.
//
// A Correctable Error Driver might also connect to the Soft Error interrupt
// via the Internal Bus interface.  When a Soft Error interrupt occurs,
// this boolean determines whether it is also necessary to dispatch an ISR
// for the Correctable Error Driver.
//

BOOLEAN HalpLogCorrectableErrors = FALSE;

//
//  Keep the first value read from the WhoAmI register,
//  since it does not always read the same the second time.
//
//  A zero value indicates that WhoAmI has not been read yet and that
//  this global variable is not valid.
//
//  (On machine checks that we dismiss, we must remember
//  to reset this to zero.)
//

IOD_WHOAMI HalpIodWhoAmIOnError = { 0 };

//
// Function prototypes.
//

// Defined outside this module.  Each argument, when TRUE, disables the
// corresponding class of machine check / correctable error reporting.
VOID
HalpSetMachineCheckEnables(
    IN BOOLEAN DisableMachineChecks,
    IN BOOLEAN DisableProcessorCorrectables,
    IN BOOLEAN DisableSystemCorrectables
    );

// NOTE(review): not called in the visible part of this module.
VOID
HalpUpdateMces(
    IN BOOLEAN ClearMachineCheck,
    IN BOOLEAN ClearCorrectableError
    );

//
// Function prototypes for routines not visible outside this module
//

// Snapshot the CSRs of the IOD identified by McDeviceId into IodErrorFrame.
VOID
HalpBuildIodErrorFrame(
    MC_DEVICE_ID McDeviceId,
    PIOD_ERROR_FRAME IodErrorFrame
    );

// Locate the IOD that took an error; returns TRUE when one was found and
// fills in its device id and CAP_ERR contents.
BOOLEAN
bFindIodError( 
   PMC_DEVICE_ID pMcDeviceId,  
   PIOD_CAP_ERR pIodCapErr
);

// NOTE(review): not called in the visible part of this module.
BOOLEAN
bHandleFatalIodError(
    MC_DEVICE_ID McDeviceId,
    BOOLEAN bMachineCheck
    );

// Returns TRUE when the error was handled and the machine check can be
// dismissed.  Note that pMcDeviceId is passed by value despite its name.
BOOLEAN
bHandleIsaError( 
   MC_DEVICE_ID pMcDeviceId,  
   IOD_CAP_ERR IodCapErr
);

// Accumulate OutBuffer into the error string of the uncorrectable frame.
// Called with a NULL OutBuffer to initialize the accumulator.
VOID
HalpErrorFrameString(
    PUNCORRECTABLE_ERROR uncorr,
    PUCHAR OutBuffer
    );

// Returns the value stored in the CUD header ActiveCpus field.
ULONG 
BuildActiveCpus (
    VOID
    );

//
// Allocate a flag that indicates when a PCI Master Abort is expected.
// PCI Master Aborts are signaled on configuration reads to non-existent
// PCI slots.  A cardinal value (0-128) indicates that a Master Abort is
// expected; the machine check handler compares the Number field against the
// current processor number and uses the Addr field (the address of the
// PCI configuration read) to determine which IOD to examine.
// A value of 0xffffffff indicates that a Master Abort is *not* expected.
//

IOD_EXPECTED_ERROR  HalpMasterAbortExpected = {MASTER_ABORT_NOT_EXPECTED, 0x0};


VOID
HalpInitializeIodMachineChecks(
    IN BOOLEAN ReportCorrectableErrors,
    IN BOOLEAN PciParityChecking
    )
/*++

Routine Description:

    This routine initializes machine check handling for a IOD-based
    system by clearing all pending errors in the IOD registers and
    enabling correctable errors according to the callers specification.

    For every IOD found by the MC bus enumerator the routine:
      - writes the "clear" mask to CAP_ERR,
      - programs CAP_CTRL and the MDPA/MDPB diagnostic registers
        (ECC checking on, PCI parity checking as requested, MC bus NXM
        and bystander monitoring off),
      - programs IntMask0 (soft errors as requested, hard errors masked).

    Finally the machine check enables within the processor are set.

Arguments:

    ReportCorrectableErrors - Supplies a boolean value which specifies
                              if correctable error reporting should be
                              enabled.

    PciParityChecking - Supplies a boolean value which specifies if PCI
                        address and data parity checking should be enabled
                        in each IOD.

Return Value:

    None.

--*/
{
    IOD_CAP_CONTROL IodCapControl;
    IOD_CAP_ERR IodCapError;
    IOD_MDPA_DIAG IodMdpaDiag;
    IOD_MDPB_DIAG IodMdpbDiag;
    IOD_INT_MASK IodIntMask;

    MC_DEVICE_ID McDeviceId;
    MC_ENUM_CONTEXT mcCtx;
    ULONG numIods;
    BOOLEAN bfoundIod;

    //
    // Build the mask used to clear any pending error bits in the
    // IOD_CAP_ERR register.  Every error bit, for both MDPA and MDPB,
    // must be present in the mask.
    //

    IodCapError.all = 0;               // Clear all bits

    IodCapError.Perr = 1;              // PCI bus perr detected
    IodCapError.Serr = 1;              // PCI bus serr detected
    IodCapError.Mab = 1;               // PCI bus master abort detected
    IodCapError.PteInv = 1;            // Invalid Pte
    IodCapError.PioOvfl = 1;           // Pio Ovfl
    IodCapError.LostMcErr = 1;         // Lost error
    IodCapError.McAddrPerr = 1;        // MC bus comd/addr parity error 
    IodCapError.Nxm = 1;               // Non-existent memory error
    IodCapError.CrdA = 1;              // Correctable ECC error on MDPA
    IodCapError.CrdB = 1;              // Correctable ECC error on MDPB
    IodCapError.RdsA = 1;              // Uncorrectable ECC error on MDPA
    IodCapError.RdsB = 1;              // Uncorrectable ECC error on MDPB

    //
    // Initialize enumerator.
    //

    numIods = HalpMcBusEnumStart ( HalpIodMask, &mcCtx );

    //
    // Initialize each Iod
    //

    while ( bfoundIod = HalpMcBusEnum( &mcCtx ) ) {

       McDeviceId = mcCtx.McDeviceId;

       //
       // Initialize IOD_CAP_ERR
       //
    
       WRITE_IOD_REGISTER_NEW( McDeviceId,
                            &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr,
                            IodCapError.all );

       //
       //  Set the Iod error enable bits in the IOD_CAP_CTRL and 
       //  IOD_MDPA/B_DIAG registers.  The configuration bits in the IOD 
       //  will be left as set by the Extended SROM, with the few 
       //  exceptions documented below.
       //

       IodCapControl.all = READ_IOD_REGISTER_NEW( McDeviceId, 
                           &((PIOD_GENERAL_CSRS)(IOD_GENERAL_CSRS_QVA))->CapCtrl );

#if 0 // CAP/MDP Bug
       IodMdpaDiag.all = 
           READ_IOD_REGISTER_NEW( McDeviceId,
               &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpaDiag ); 

       IodMdpbDiag.all = 
           READ_IOD_REGISTER_NEW( McDeviceId,
               &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpbDiag ); 
#else

       //
       //  The MDP diagnostic registers are not read back (CAP/MDP bug);
       //  start from an all-clear value instead.
       //

       IodMdpaDiag.all = 0;
       IodMdpbDiag.all = 0;

       //
       //  Enable ECC checking on all MC Bus transactions
       //

       IodMdpaDiag.EccCkEn = 1;
       IodMdpbDiag.EccCkEn = 1;

#endif

       //
       // Disable/enable PCI parity checking as requested
       //

       if (PciParityChecking == FALSE) {

           IodCapControl.PciAddrPe= 0;   // Do *not* check PCI address parity
           IodMdpaDiag.ParCkEn = 0;      // Do *not* check PCI data parity
           IodMdpbDiag.ParCkEn = 0;      // Do *not* check PCI data parity

       } else {

           IodCapControl.PciAddrPe= PciParityChecking; 
           IodMdpaDiag.ParCkEn = PciParityChecking;
           IodMdpbDiag.ParCkEn = PciParityChecking;

       }


       //
       // Disable McBus NXM's 
       //
       // (If enabled, accesses to non-existent McBus device will cause an
       // EV5 fill error.  Non-existent CSRs will return all 0s most of the
       // time and not fill error.)
       //

       IodCapControl.McNxmEn = 0;

       //
       // Disable monitoring of McBus bystander errors.
       //
       // That means the IOD will not capture the failing address in the event of 
       // an MC bus NXM. It has no effect on what the IOD does in the event of a 
       // PCI NXM (which causes a Master Abort).
       //
       // Regardless of how McBusMonEn is set, PCI PERR, SERR, MAB, and PTE_INV 
       // will only show up in IOD CAP_ERR of the participant in the transaction.
       //
       // If McBusMonEn is set, there can be a difference between the bystander CAP_ERR 
       // state and the participant CAP_ERR state (as per Sam Duncan, 5/3/95)
       // that shows up in an unlikely situation:
       //   "Cache single bit or double bit error: read is dirty in a cache 
       //   and the fill has an ecc error, don't want to indict a memory for this 
       //   (very unlikely) error."
       // Thus, we choose not to be able to correctly detect this situation in
       // order to make machine check and error handling easier, i.e., we
       // always need to clear only one IOD's CAP_ERROR.
       //

       IodCapControl.McBusMonEn= 0;


       WRITE_IOD_REGISTER_NEW( McDeviceId,
                        &((PIOD_GENERAL_CSRS)(IOD_GENERAL_CSRS_QVA))->CapCtrl,
                        IodCapControl.all ); 

       WRITE_IOD_REGISTER_NEW( McDeviceId,
                        &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpaDiag,
                        IodMdpaDiag.all ); 

       WRITE_IOD_REGISTER_NEW( McDeviceId,
                        &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpbDiag,
                        IodMdpbDiag.all ); 


       //
       // Soft and Hard Error handling
       //
       // ecrfix - IntMask0 on Bus 0 only.

       IodIntMask.all = READ_IOD_REGISTER_NEW( McDeviceId,
                           &((PIOD_INT_CSRS)(IOD_INT_CSRS_QVA))->IntMask0 );

       IodIntMask.SoftErr = (ReportCorrectableErrors == TRUE);   
       IodIntMask.HardErr = 0;    // ecrfix - Mask Hard Errors for now

       WRITE_IOD_REGISTER_NEW( McDeviceId,
                         &((PIOD_INT_CSRS)(IOD_INT_CSRS_QVA))->IntMask0,
                         IodIntMask.all ); 

    } // while ( HalpMcBusEnum ( &mcCtx ) )

    //
    // Set the machine check enables within the EV5.  Machine checks are
    // always enabled; processor and system correctable errors are
    // disabled unless correctable error reporting was requested.
    //

    if( ReportCorrectableErrors == TRUE ){
        HalpSetMachineCheckEnables( FALSE, FALSE, FALSE );
    } else {
        HalpSetMachineCheckEnables( FALSE, TRUE, TRUE );
    }

    return;

}

// Size, in bytes, of the local buffers used to format error messages.
#define MAX_ERROR_STRING 128


BOOLEAN
HalpIodUncorrectableError(
    PMC_DEVICE_ID pMcDeviceId
    )
/*++

Routine Description:

    Read the WhoAmI register, remember its value in HalpIodWhoAmIOnError,
    and determine if it reports a duplicate tag parity error on the
    current processor.

Arguments:

    pMcDeviceId - Receives the MC Bus Device ID taken from WhoAmI when a
                  duplicate tag parity error is detected.  It is not
                  written otherwise.

Return Value:

    TRUE is returned if an uncorrectable error has been detected.  FALSE
    is returned otherwise.

--*/
{
    IOD_WHOAMI  IodWhoAmI;

    //
    // Check for a duplicate tag parity error on this (in the Smalltalk 
    // sense) processor.  The value read is saved globally because a
    // second read of WhoAmI does not always return the same contents.
    //

    IodWhoAmI.all = HalpReadWhoAmI();
    HalpIodWhoAmIOnError.all = IodWhoAmI.all;

    if ( IodWhoAmI.CpuInfo & CACHED_CPU_DTAG_PARITY_ERROR ) {

      pMcDeviceId->all = IodWhoAmI.Devid;

      return TRUE;

    }

    //
    // None of the uncorrectable error conditions were detected.
    //

    return FALSE;

}

VOID
HalpBuildIodErrorFrame(
    MC_DEVICE_ID McDeviceId,
    PIOD_ERROR_FRAME IodErrorFrame
    )
/*++

Routine Description:

   This function captures a snapshot of the general, interrupt and error
   CSRs of a single IOD into an IOD_ERROR_FRAME.  The frame is zeroed
   first, all valid bits are then set, and the valid bits for the MDP
   status/syndrome registers are cleared again because those registers
   are not read (CAP/MDP bug).

   This routine only reads IOD registers; it does not write any of them.

Arguments:

   McDeviceId        - Supplies the MC Bus Device ID of the IOD 
   IodErrorFrame     - Supplies a pointer to the IOD_ERROR_FRAME to fill in

Return Value:

   None.

--*/
{
    //
    // Clear it first, since caller may reuse the IodErrorFrame
    //

    RtlZeroMemory(IodErrorFrame, sizeof(IOD_ERROR_FRAME));

    //
    // Everything is valid (exceptions are cleared at the end)
    //

    IodErrorFrame->ValidBits.all = 0xffffffff;  // all valid


    //
    //  Read the General registers
    //

    IodErrorFrame->BaseAddress = IOD_IO_SPACE_START     |
                                 IOD_SPARSE_CSR_OFFSET  |
                                 MCDEVID_TO_PHYS_ADDR(McDeviceId.all);

    IodErrorFrame->WhoAmI = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_GENERAL_CSRS)(IOD_GENERAL_CSRS_QVA))->WhoAmI
                                        );
    
    IodErrorFrame->PciRevision = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_GENERAL_CSRS)(IOD_GENERAL_CSRS_QVA))->PciRevision
                                        );

    IodErrorFrame->CapCtrl = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_GENERAL_CSRS)(IOD_GENERAL_CSRS_QVA))->CapCtrl
                                        );

    IodErrorFrame->HaeMem = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_GENERAL_CSRS)(IOD_GENERAL_CSRS_QVA))->HaeMem
                                        );

    IodErrorFrame->HaeIo = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_GENERAL_CSRS)(IOD_GENERAL_CSRS_QVA))->HaeIo
                                        );

    //
    //  Read Interrupt Control and Status Registers
    //
    
    IodErrorFrame->IntCtrl = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_INT_CSRS)(IOD_INT_CSRS_QVA))->IntCtrl
                                        );

    IodErrorFrame->IntReq = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_INT_CSRS)(IOD_INT_CSRS_QVA))->IntReq
                                        );

    IodErrorFrame->IntMask0 = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_INT_CSRS)(IOD_INT_CSRS_QVA))->IntMask0
                                        );

    IodErrorFrame->IntMask1 = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_INT_CSRS)(IOD_INT_CSRS_QVA))->IntMask1
                                        );

    //
    //  Read the error registers.  They are only read here; this routine
    //  does not write CAP_ERR.
    //

    IodErrorFrame->CapErr = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr
                                        );

    IodErrorFrame->PciErr1  = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->PciErr1
                                        );

    IodErrorFrame->McErr0 = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->McErr0
                                        );

    IodErrorFrame->McErr1 = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->McErr1
                                        );
#if 0  // CAP/MDP Bug
    IodErrorFrame->MdpaStat = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpaStat
                                        );

    IodErrorFrame->MdpaSyn = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpaSyn
                                        );

    IodErrorFrame->MdpbStat = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpbStat
                                        );

    IodErrorFrame->MdpbSyn = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                              &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpbSyn
                                        );
#else

    //
    //  CAP/MDP Bug - these registers are not valid.  The corresponding
    //  frame fields stay zero from the RtlZeroMemory above.
    //

    IodErrorFrame->ValidBits.MdpaStatValid = 0;
    IodErrorFrame->ValidBits.MdpbStatValid = 0;
    IodErrorFrame->ValidBits.MdpaSynValid = 0;
    IodErrorFrame->ValidBits.MdpbSynValid = 0;
    
#endif // CAP/MDP Bug

}

VOID
HalpIodReportFatalError(
    MC_DEVICE_ID ErrorMcDeviceId
    )
/*++

Routine Description:

   This function reports and interprets a fatal hardware error
   detected by the IOD chipset. It is assumed that HalGetDisplayOwnership()
   has been called prior to this function.

   The routine:
     1. Fills in the processor and Rawhide portions of the uncorrectable
        error frame, when one is available (PUncorrectableError != NULL).
        When no frame is available the errors are still displayed, but
        nothing is recorded.
     2. Reports a cached CPU duplicate tag parity error and returns, or
        reports a cached CPU fill error and continues.
     3. Snapshots the CSRs of every IOD into the Rawhide error frame.
     4. For the IOD identified by ErrorMcDeviceId, writes back the MDP
        status and CAP_ERR registers to clear the error state and
        displays an interpretation of the captured CAP_ERR.

Arguments:

   ErrorMcDeviceId  - Supplies the MC Bus Device ID of the IOD
                      where the error was found

                    - In the case of a Duplicate Tag Parity Error, supplies
                      the CPU that took the error.  Note, in this case
                      the ErrorMcDeviceId will never match a IOD McDeviceId.
                      No MC Bus snapshot is present in this case.

Return Value:

   None.

--*/
{
    UCHAR   OutBuffer[ MAX_ERROR_STRING ];
    IOD_ERROR_FRAME IodErrorFrame, *pCurrentIodErrorFrame = NULL;
    MC_ENUM_CONTEXT mcCtx;
    MC_DEVICE_ID McDeviceId;
    ULONG numIods;
    BOOLEAN bfoundIod;

    PUNCORRECTABLE_ERROR  uncorr = NULL;
    PRAWHIDE_UNCORRECTABLE_FRAME rawerr = NULL;
    PEXTENDED_ERROR PExtErr = NULL;

    //
    //  Do we have an uncorrectable error frame?
    //  uncorr and PExtErr are either both valid or both NULL.
    // 
    
    if (PUncorrectableError) {
        uncorr = (PUNCORRECTABLE_ERROR) 
                    &PUncorrectableError->UncorrectableFrame;
        rawerr = (PRAWHIDE_UNCORRECTABLE_FRAME)
            PUncorrectableError->UncorrectableFrame.RawSystemInformation;
        PExtErr = &PUncorrectableError->UncorrectableFrame.ErrorInformation;
    }

    //
    // Validate the ProcessorInfo portion of the Error Frame.
    //

    if (uncorr) {
        uncorr->Flags.ProcessorInformationValid = 1;
        HalpGetProcessorInfo(&uncorr->ReportingProcessor);

        //
        // Initialize our "error string accumulator"
        //

        HalpErrorFrameString( uncorr, NULL );

    }

    //
    //  Validate the Rawhide Uncorrectable Frame
    //  (Common RCUD Header was already set up.)
    //
    
    if (rawerr) {
        rawerr->Revision = RAWHIDE_UNCORRECTABLE_FRAME_REVISION;
        rawerr->WhoAmI = HalpIodWhoAmIOnError.all;
        rawerr->ErrorSubpacketFlags.all = 0;
        rawerr->CudHeader.ActiveCpus = BuildActiveCpus();

    }

    //
    //  Handle cached CPU duplicate tag parity error.
    //  (Note that a DTAG parity error implies that we don't
    //  take an MC Bus Snapshot.)
    // 
    
    if ( HalpIodWhoAmIOnError.CpuInfo & CACHED_CPU_DTAG_PARITY_ERROR ) {

      sprintf( OutBuffer, "Duplicate Tag Parity Error on CPU %x\n",
                       MCDEVID_TO_PHYS_CPU(HalpIodWhoAmIOnError.McDevId.all) );

      HalDisplayString( OutBuffer );
#if HALDBG
      DbgPrint( "Duplicate Tag Parity Error on CPU (%d, %d)\n",
                 HalpIodWhoAmIOnError.McDevId.Gid, HalpIodWhoAmIOnError.McDevId.Mid);
#endif

      if (uncorr) {

          HalpErrorFrameString( uncorr, OutBuffer );

          //
          // OK.  This is tedious:
          // * Error is in memory space and is the system (external) cache.
          // * And we know this is the L3 cache.
          // * And we'll subvert the "CacheBoard" to squirrel away the
          //   Cached CPU Revision Info and Cache size.
          // 
      
          uncorr->Flags.AddressSpace = MEMORY_SPACE;
          uncorr->Flags.ExtendedErrorValid = 1;
          uncorr->Flags.MemoryErrorSource = SYSTEM_CACHE;
          PExtErr->CacheError.Flags.CacheLevelValid = 1;
          PExtErr->CacheError.Flags.CacheBoardValid = 1;
          PExtErr->CacheError.Flags.CacheSimmValid = 0;
          PExtErr->CacheError.CacheLevel = 3;
          PExtErr->CacheError.CacheBoardNumber = HalpIodWhoAmIOnError.CpuInfo;
      }

      return;
    }
    
    //
    //  Handle cached CPU fill error.
    //  Since this could be caused by an MC Bus or PCI error,
    //  we continue to create an MC Bus snapshot.
    // 
    
    if ( HalpIodWhoAmIOnError.CpuInfo & CACHED_CPU_FILL_ERROR ) {

      sprintf( OutBuffer, "Fill Error on CPU %x\n",
                       MCDEVID_TO_PHYS_CPU(HalpIodWhoAmIOnError.McDevId.all) );

      HalDisplayString( OutBuffer );
#if HALDBG
      DbgPrint( "Fill Error on CPU (%d, %d)\n",
                 HalpIodWhoAmIOnError.McDevId.Gid, HalpIodWhoAmIOnError.McDevId.Mid);
#endif

      if (uncorr) {

          HalpErrorFrameString( uncorr, OutBuffer );

          //
          // * WhoAmI tells us Addr<38:33> of reference causing error.
          // * However, PciErr1 and/or McErr0/McErr1 give us more bits,
          //   so the data entered here may get overwritten later.
          // 
      
          uncorr->Flags.PhysicalAddressValid = 1;
          uncorr->PhysicalAddress =
                ( ((ULONGLONG)(HalpIodWhoAmIOnError.CpuInfo & 0x3f)) << 33 );
      }

    }
    
    //
    // Validate the MCBusSnapshot header.  The per-IOD error frames are
    // stored immediately after the Rawhide uncorrectable frame.
    //

    if (rawerr) {
        rawerr->ErrorSubpacketFlags.McBusPresent = 1;
        rawerr->McBusSnapshot.ReportingCpuBaseAddr =
            IOD_IO_SPACE_START |
            MCDEVID_TO_PHYS_ADDR( HalpIodWhoAmIOnError.Devid );
        pCurrentIodErrorFrame = (PIOD_ERROR_FRAME) (rawerr + 1);
    }

    //
    // Initialize enumerator.
    //

    numIods = HalpMcBusEnumStart ( HalpIodMask, &mcCtx );
    ASSERT( numIods == HalpNumberOfIods);

    //
    // Gather data from each Iod
    //

    while ( bfoundIod = HalpMcBusEnum( &mcCtx ) ) {

       McDeviceId.all = mcCtx.McDeviceId.all;
    
       HalpBuildIodErrorFrame( McDeviceId, &IodErrorFrame );
      
       //
       // Fill in IOD_ERROR_FRAME portion of the RAWHIDE_UNCORRECTABLE_FRAME
       //

       if (rawerr) {

           RtlCopyMemory( pCurrentIodErrorFrame,
                          &IodErrorFrame,
                          sizeof(IOD_ERROR_FRAME));

           pCurrentIodErrorFrame++;
       } 

       //
       // If this is the IOD where we found the error  
       // a. clear the error
       // b. complete the uncorrectable error frame processing
       // c. Display an interpretation of the error to the screen
       //

       if (ErrorMcDeviceId.all == McDeviceId.all) { 

       // ecrfix  Put below into HalpInterpretIodError(McDeviceId, IodErrorFrame) ???
          IOD_WHOAMI IodWhoAmI;
          IOD_CAP_CONTROL IodCapCtrl;
          IOD_CAP_ERR IodCapErr;
          IOD_PCI_ERR1 IodPciErr1;
          IOD_MC_ERR0 IodMcErr0;
          IOD_MC_ERR1 IodMcErr1;
          IOD_MDPA_STAT IodMdpaStat;
          IOD_MDPB_STAT IodMdpbStat;
          ULONG HwBusNumber = ErrorMcDeviceId.Mid & 0x3;

          //
          // Copy error frame variables in locals for bitfield access
          //

          IodWhoAmI.all  = IodErrorFrame.WhoAmI;
          IodCapCtrl.all = IodErrorFrame.CapCtrl;
          IodCapErr.all   = IodErrorFrame.CapErr;
          IodPciErr1.PciAddress = IodErrorFrame.PciErr1;
          IodMcErr0.all   = IodErrorFrame.McErr0;
          IodMcErr1.all   = IodErrorFrame.McErr1;

   #if 0  // CAP/MDP Bug
          IodMdpaStat.all = IodErrorFrame.MdpaStat;
          IodMdpbStat.all = IodErrorFrame.MdpbStat;
          IodMdpaSyn.all  = IodErrorFrame.MdpaSyn;
          IodMdpbSyn.all  = IodErrorFrame.MdpbSyn;
   #else
          IodMdpaStat.all = 0xffffffff;
          IodMdpbStat.all = 0xffffffff;
   #endif // CAP/MDP Bug


          //
          // Clear state in MDPA and MDPB before clearing CAP_ERR.
          // The values written are the ones captured in the error frame.
          //


          WRITE_IOD_REGISTER_NEW( McDeviceId, 
                           &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpaStat,
                           IodErrorFrame.MdpaStat
                         );

          WRITE_IOD_REGISTER_NEW( McDeviceId, 
                           &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpbStat,
                           IodErrorFrame.MdpbStat
                         );

          WRITE_IOD_REGISTER_NEW( McDeviceId, 
                           &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr,
                           IodErrorFrame.CapErr
                         );

          sprintf( OutBuffer, 
              "IOD MC_DEVICE_ID : (%x, %x)  CAP_CTRL : %08x  CAP_ERR : %08x\n", 
               McDeviceId.Gid, McDeviceId.Mid,
               IodCapCtrl.all, 
               IodCapErr.all );

          HalDisplayString( OutBuffer );
#if HALDBG
          DbgPrint( OutBuffer );
#endif

          sprintf( OutBuffer,
              "PCI_ERR1  %08x  MC_ERR0 : %08x  MC_ERR1 : %08x\n", 
               IodPciErr1.PciAddress,
               IodMcErr0.all,
               IodMcErr1.all );

          HalDisplayString( OutBuffer );
#if HALDBG
          DbgPrint( OutBuffer );
#endif

   #if 0 // CAP/MDP Bug
          sprintf( OutBuffer, 
            "MDPA_STAT : %08x MDPA_SYN : %08x  MDPB_STAT : %08x MDPB_SYN : %08x\n",
               IodMdpaStat.all,
               IodMdpaSyn.all,
               IodMdpbStat.all,
               IodMdpbSyn.all );
          HalDisplayString( OutBuffer );
#if HALDBG
          DbgPrint( OutBuffer );
#endif
   #endif

          //
          // If no valid error then no interpretation.
          //
          // NOTE(review): this returns from inside the enumeration loop, so
          // IODs that have not been enumerated yet are not snapshotted.
          //

          if (( IodCapErr.PciErrValid == 0 ) && ( IodCapErr.McErrValid == 0 ) ){

              return;                         // No IOD error detected

          }

          //
          //  Interpret any detected errors:
          //

          if (IodCapErr.McErrValid == 1) {

             if ( IodMcErr1.Dirty != 1 ) {
                 sprintf( OutBuffer,
                              "MC Bus Error, Bus Master=(%x,%x)\n",

                              ( ( IodMcErr1.DevId & 0x38) >> 3 ),
                                ( IodMcErr1.DevId & 0x07)
                             );
             } else {

                 sprintf( OutBuffer,
                              "MC bus error on a read/dirty transaction\n"
                             );
             }


             //
             //  Output the detected error message:
             //

             HalDisplayString( OutBuffer );
#if HALDBG
             DbgPrint( OutBuffer );
#endif
             if (uncorr) {
                 HalpErrorFrameString( uncorr, OutBuffer);
             }


             sprintf( OutBuffer,
                      "IOD Addr=%x%x, Cmd=%x\n",
                       IodMcErr1.Addr39_32,        // bits 39:32
                       IodMcErr0.Addr,             // bits 31:4
                       IodMcErr1.McCmd
                     );

             //
             //  Output the detected error message:
             //

             HalDisplayString( OutBuffer );
#if HALDBG
             DbgPrint( OutBuffer );
#endif
             if (uncorr) {

                 HalpErrorFrameString( uncorr, OutBuffer);

                 //
                 // Record the failing MC bus address.
                 //

                 uncorr->Flags.PhysicalAddressValid = 1;
                 uncorr->PhysicalAddress = (
                      (((ULONGLONG)IodMcErr1.Addr39_32) << 32) |
                      ((ULONGLONG)IodMcErr0.Addr) );

                 //     
                 // McAddr<39> indicates whether this was a
                 // memory or I/O transaction.  The masked value is either
                 // 0 or 0x80, so it must be tested against zero.
                 //

                 if ( (IodMcErr1.Addr39_32 & 0x80) != 0 ) {
                    uncorr->Flags.AddressSpace = IO_SPACE;
                 } else {
                    uncorr->Flags.AddressSpace = MEMORY_SPACE;
                 }
             }

             //
             // Interpret specific MC bus error.  The buffer is emptied
             // first so that nothing is output when no cause bit is set.
             //

             OutBuffer[0] = '\0';

             if ( IodCapErr.PioOvfl == 1 ){

                 sprintf( OutBuffer,
                          "IOD PIO Overflow, PendNumb=%x\n",
                           IodCapCtrl.PendNum
                         );

             } else if ( IodCapErr.McAddrPerr == 1 ){

                 sprintf( OutBuffer,
                          "MC bus parity error\n"
                         );

             } else if ( IodCapErr.Nxm == 1 ){

                 sprintf( OutBuffer,
                          "MC bus NXM\n"
                         );

             } else if ( IodCapErr.CrdA == 1 ){

                 sprintf( OutBuffer,
                          "IOD Correctable ECC error in MDPA\n"
                         );

             } else if ( IodCapErr.CrdB == 1 ){

                 sprintf( OutBuffer,
                          "IOD Correctable ECC error in MDPB\n"
                         );

             } else if ( IodCapErr.RdsA == 1 ){

                 sprintf( OutBuffer,
                          "IOD Uncorrectable ECC error in MDPA\n"
                         );

             } else if ( IodCapErr.RdsB == 1 ){

                 sprintf( OutBuffer,
                          "IOD Uncorrectable ECC error in MDPB\n"
                         );

             }

             //
             //  Output the detected error message, if any:
             //

             if ( OutBuffer[0] != '\0' ) {

                 HalDisplayString( OutBuffer );
#if HALDBG
                 DbgPrint( OutBuffer );
#endif
                 if (uncorr) {
                     HalpErrorFrameString( uncorr, OutBuffer);
                 }
             }

          }

          if ( IodCapErr.PciErrValid == 1 ){

             //
             // Record the failing PCI address in the error frame.
             //

             if (uncorr) {

                 uncorr->Flags.AddressSpace = IO_SPACE;
                 uncorr->Flags.PhysicalAddressValid = 1;
                 uncorr->PhysicalAddress = IOD_IO_SPACE_START |
                      MCDEVID_TO_PHYS_ADDR(IodWhoAmI.McDevId.all) |
                      IodPciErr1.PciAddress << IO_BIT_SHIFT;

                 uncorr->Flags.ExtendedErrorValid = 1;
                 PExtErr->IoError.Interface = PCIBus;
                 PExtErr->IoError.BusNumber = HwBusNumber;
                 PExtErr->IoError.BusAddress.LowPart = IodPciErr1.PciAddress;
             }

             //
             // Interpret specific PCI bus error.  The buffer is emptied
             // first so that nothing is output when no cause bit is set.
             //

             OutBuffer[0] = '\0';

             if ( IodCapErr.Perr == 1 ){
                 sprintf( OutBuffer,
                          "PERR detected on PCI-%d, Addr=%x\n",
                          HwBusNumber,
                          IodPciErr1.PciAddress
                         );

             } else if ( IodCapErr.Serr == 1 ){

                 sprintf( OutBuffer,
                          "SERR detected on PCI-%d, Addr=%x\n",
                          HwBusNumber,
                          IodPciErr1.PciAddress
                         );

             } else if ( IodCapErr.Mab == 1 ){

                 sprintf( OutBuffer,
                          "Master Abort on PCI-%d, Addr=%x\n",
                          HwBusNumber,
                          IodPciErr1.PciAddress
                         );

             } else if ( IodCapErr.PteInv == 1 ){

                 sprintf( OutBuffer,
                          "Invalid Scatter/Gather PTE on PCI-%d, Addr=%x\n",
                          HwBusNumber,
                          IodPciErr1.PciAddress
                         );
             }

             //
             //  Output the detected error message, if any:
             //

             if ( OutBuffer[0] != '\0' ) {

                 HalDisplayString( OutBuffer );
#if HALDBG
                 DbgPrint( OutBuffer );
#endif
                 if (uncorr) {
                     HalpErrorFrameString( uncorr, OutBuffer);
                 }
             }

          }                  

          //
          //  Check for lost errors and output message if any occurred:
          //

          if ( IodCapErr.LostMcErr == 1 ){
              HalDisplayString("IOD Lost errors were detected\n");
#if HALDBG
              DbgPrint("IOD Lost errors were detected\n");
#endif
              if (uncorr) {
                  HalpErrorFrameString(uncorr, "IOD Lost errors were detected\n");
              }
          }

       } // if (ErrorMcDeviceID == McDeviceId)

    } // while (bfoundIod = HalpMcBusEnum)
  
    return;                                 // Fatal error detected
}


BOOLEAN
HalpIodMachineCheck(
    IN PEXCEPTION_RECORD ExceptionRecord,
    IN PKEXCEPTION_FRAME ExceptionFrame,
    IN PKTRAP_FRAME TrapFrame
    )
/*++

Routine Description:

    This routine is given control when an hard error is acknowledged
    by the IOD chipset.  The routine is given the chance to
    correct and dismiss the error.

Arguments:

    ExceptionRecord - Supplies a pointer to the exception record generated
                      at the point of the exception.

    ExceptionFrame - Supplies a pointer to the exception frame generated
                     at the point of the exception.

    TrapFrame - Supplies a pointer to the trap frame generated
                at the point of the exception.

Return Value:

    TRUE is returned if the machine check has been handled and dismissed -
    indicating that execution can continue.  FALSE is return otherwise.

--*/
{
    IOD_CAP_ERR IodCapErr;
    IOD_CAP_ERR IodCapErrMask;
    IOD_MC_ERR1 IodMcErr1;
    IOD_WHOAMI  IodWhoAmI;
    MC_DEVICE_ID McDeviceId;
    BOOLEAN ExpectedMchk;
    BOOLEAN ExpectedMcAddrPerr;
    BOOLEAN PciMemReadMchk;
    BOOLEAN bfoundIod;

    //
    // We don't expect a machine check yet...
    //

    ExpectedMchk = FALSE;
    ExpectedMcAddrPerr = FALSE;

    //
    // Make sure any error due to 2Mb/4Mb Cached CUD bug is latched.
    //
    // At this point, WhoAmI may indicate the symptoms of a fill_error
    // and CUD cache size is not available.  We'll read it again when
    // we need to know the Cache size.  However, we save it here so we
    // can figure out if this was a fill error or not.
    //
     
    HalpIodWhoAmIOnError.all = HalpReadWhoAmI();

    //
    //  Where do we look for the error symptoms?
    //
    //  1.  If we expected this machine check, then we know which 
    //      IOD to check.
    //  2.  If we didn't expect this machine check, find the IOD that
    //      generated the error.
    //

    //
    //  For an expected machine check, HalpMasterAbortExpected will
    //  contain the processor number and address of a PCI config
    //  space read.  CAP_ERR will indicate a MasterAbort.
    //

    if( HalpMasterAbortExpected.Number == (ULONG)KeGetCurrentProcessorNumber() ) {
    
       //
       // Determine expected IOD from the address of the PCI config read
       //
     
       McDeviceId.all = MCDEVID_FROM_PHYS_ADDR(HalpMasterAbortExpected.Addr);
     
       //
       // Now get the Bcache size information.
       //
     
       IodWhoAmI.all = READ_IOD_REGISTER_NEW( McDeviceId, 
		       &((PIOD_GENERAL_CSRS)(IOD_GENERAL_CSRS_QVA))->WhoAmI);
     
       //
       //
       // Make sure there is a Master abort on this IOD
       //
     
       IodCapErr.all = READ_IOD_REGISTER_NEW( McDeviceId,
		       &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr );
     

       if( IodCapErr.Mab == 1 ) {
 	  ExpectedMchk = TRUE;

          //
          // If 2Mb or 4 Mb cached CUD, and we may get an MCbus address parity 
          // error with MC command signature in MC_ERR1 equal to zero (cached 
          // CPU idle transaction).  Also dismiss this error that's the result
          // of the cached 2Mb/4Mb cached CPU VCTY bug.
          //

          IodMcErr1.all = READ_IOD_REGISTER_NEW( McDeviceId, 
                          &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->McErr1 );

          if ((IodWhoAmI.CpuInfo & 0x7) &&   // Cached CPU?
              IodCapErr.McAddrPerr      &&   // McAddrPerr?
              (IodMcErr1.McCmd == 0)     ) { // McCmd is zero?

              ExpectedMcAddrPerr = TRUE;     // All yes, then dismiss it!
          } 
       } 

#if HALDBG 
       DbgPrint( "Expected Mchk (Mab) on IOD (%x, %x), Processor number %x\n", 
		McDeviceId.Gid,
		McDeviceId.Mid,
		HalpMasterAbortExpected.Number);
#endif //HALDBG
    }

    //
    // If this isn't the machine check we expected, then
    // we must find the IOD that took the error.
    //

    if (!ExpectedMchk) {

      bfoundIod = bFindIodError( &McDeviceId, &IodCapErr );

      //
      // Check that we found an IOD that has a valid PCI or MC error.
      // If it is not this is a pretty weird (fatal???) condition.
      // For now, we'll just go return TRUE.
      //
      // ecrfix - should we check the error interrupts?  probably not...
      //

      if( !bfoundIod ) {

#if HALDBG
          DbgPrint( "HalpIodMachineCheck called but no PCI or MC error found\n");
#endif
          return (TRUE);
      }
      

#if 0 // HALDBG 
      DbgPrint( "Unexpected Mchk on IOD (%x, %x)\n", 
	       McDeviceId.Gid,
	       McDeviceId.Mid );
#endif //HALDBG

       //
       // Case: Uexpected Master Abort, e.g. a PCI memory or I/O space read to
       // legacy ISA space (0 - 1 Mb) on PCI-1,2,3.
       //

       if ( bHandleIsaError( McDeviceId, IodCapErr) ) {
         return TRUE;
       }

    }

    //
    // Case: PCI or MC Bus error other than master abort
    //
    // At this point we have either:
    //  (a) an expected PCI Master Abort (ExpectedMchk == TRUE), or
    //  (b) an unexpected PCI or MC Bus error.  
    //
    // However, it's possible that we have *both* (a) AND (b).
    // So, even if ExpectedMch == TRUE, check for other PCI or MC Bus
    // errors.  Any of these other errors indicate a
    // fatal condition.
    //

    if( (IodCapErr.Perr == 1) ||           // PCI bus perr detected
	(IodCapErr.Serr == 1) ||           // PCI bus serr detected
	(IodCapErr.PteInv == 1) ||         // Invalid Pte
	(IodCapErr.PioOvfl == 1) ||        // Pio Ovfl

        //
        // Cached CUD with 2 Mb and 4 Mb Cache may also assert an MCAddrPerr
        // or Nxm upon a config space read.   Lost Error bit will also be set.
        // 
        //

	( (IodCapErr.LostMcErr == 1)  && !ExpectedMcAddrPerr)  ||     
                                           // Lost error
	( (IodCapErr.McAddrPerr == 1) && !ExpectedMcAddrPerr ) ||   
                                           // MC bus comd/addr parity error 


	( (IodCapErr.Nxm == 1)        && !ExpectedMcAddrPerr ) ||   
                                           // Non-existent memory error
	(IodCapErr.CrdA == 1) ||           // Correctable ECC error on MDPA
	(IodCapErr.CrdB == 1) ||           // Correctable ECC error on MDPB
	(IodCapErr.RdsA == 1) ||           // Uncorrectable ECC error on MDPA
	(IodCapErr.RdsA == 1)              // Uncorrectable ECC error on MDPA

    ){
        return ( bHandleFatalIodError(McDeviceId, TRUE) );
    }

    //
    // At this point, we have either an expected or unexpected Master
    // abort.  There are three cases:
    // 1.  Expected MAB from a PCI config space read that must be handled
    // 2.  Unexpected MAB from a PCI memory or I/O space read in ISA legacy 
    //     space that can be handled.
    // 3.  Unexpected MAB.  Don't handle or fix up this error condition.
    //     (Really take the machine check.)
    //

    //
    // Case 1: Expected Master Abort, e.g. a PCI configuration read error. 
    //

    if ( (IodCapErr.Mab == 1) && ExpectedMchk ){
        
        //
        // Here's how a PCI config space read to an empty slot will transpire:
        //
        //    READ_CONFIG_Usize indicates the issuing CPU and address in 
        //    HalpMasterAbortExpected.Number and HalpMasterAbortExpected.Addr.
        //
        //    PCI config space read will case a MC Bus FILL_ERROR on the issuing CPU
	//    FILL_ERROR causes a machine check.
        //
        //    The targeted MC-PCI bus bridge will set CAP_ERR<MasterAbort> bit.
        //
        // So far, the error looks like a PCI configuration space read
        // that accessed a device that does not exist.  In order to fix
        // this up we expect that the original faulting instruction must 
        // be a load with v0 as the destination register.  Unfortunately,
        // machine checks are not precise exceptions so we may have exectued
        // many instructions since the faulting load.  For EV5 a pair of 
        // memory barrier instructions following the load will stall the pipe
        // waiting for load completion before the second memory barrier can
        // be issued.  Therefore, we expect the exception PC to point to either
        // the load instruction or one of the two memory barriers.  We will 
        // assume that if the exception pc is not an mb that instead it
        // points to the load that machine checked.  We must be careful to
        // not reexectute the load.
        //

        ALPHA_INSTRUCTION FaultingInstruction;


        FaultingInstruction.Long = *(PULONG)((ULONG)TrapFrame->Fir); 
        if( FaultingInstruction.Memory.Opcode != MEMSPC_OP ){

            //
            // Exception pc does not point to a memory barrier, return
            // to the instruction after the exception pc.
            //

            TrapFrame->Fir += 4;

        }

        //
        // The error has matched all of our conditions.  Fix it up by
        // writing the value 0xffffffff into the destination of the load.
        // 

        TrapFrame->IntV0 = (ULONGLONG)0xffffffffffffffff;

        //
        // Clear all error conditions in CAP_ERR.
        // (McAddrPerr, LostMcErr, Mab)
        //
#if 0
        WRITE_IOD_REGISTER_NEW( McDeviceId,
                          &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr,
                          IodCapErr.all );
#else
        IodCapErrMask.all = ALL_CAP_ERRORS;
        HalpClearAllIods( IodCapErrMask );
#endif

	//
	// Clear the hard error interrupt.
        // ecrfix - For now, the Hard error interrupt is masked, so
        // we don't have to clear it.
        // 

        return TRUE;

    } 
#if 0
    //
    // Case 2: Uexpected Master Abort, e.g. a PCI memory or I/O space read to
    // legacy ISA space (0 - 1 Mb) on PCI-1,2,3.
    //

    if ( bHandleIsaError( McDeviceId, IodCapErr) ) {
      return TRUE;
    }
#endif
    //
    // Case 3: Unexpected Master abort.
    // (Or anything I might have missed.... )
    //

#if (DBG) || (HALDBG)
    DbgPrint( "Unexpected PCI master abort\n" );
#endif

    return ( bHandleFatalIodError(McDeviceId, TRUE) );

}


#define ENTIRE_FRAME_SIZE (sizeof(ERROR_FRAME) + sizeof(RAWHIDE_CORRECTABLE_FRAME))
VOID
HalpIodSoftErrorInterrupt(
    VOID
    )
/*++

Routine Description:

    Handle a IOD soft (correctable) error interrupt.

Arguments:

    None.

Return Value:

    None.

--*/
{
    BOOLEAN bfoundIod;
    MC_DEVICE_ID McDeviceId;

    static UCHAR Frame[ENTIRE_FRAME_SIZE];
    static PERROR_FRAME pFrame;
    static RAWHIDE_CORRECTABLE_FRAME RawhideFrame;
    static BOOLEAN RawhideFrameInitialized = FALSE;
    
    UCHAR TempFrame[ENTIRE_FRAME_SIZE];
    PERROR_FRAME pTempFrame;
    PCORRECTABLE_ERROR pCorr;
    PRAWHIDE_CORRECTABLE_FRAME pRawCorr;
    
    PBOOLEAN ErrorlogBusy;
    PULONG DispatchCode;
    PKINTERRUPT InterruptObject;
    PKSPIN_LOCK ErrorlogSpinLock;
    PRAWHIDE_UNCORRECTABLE_FRAME rawerr;

    IOD_CAP_ERR IodCapErr;
    IOD_MDPA_STAT IodMdpaStat;
    IOD_MDPA_STAT IodMdpbStat;
    IOD_MC_ERR0 IodMcErr0;
    IOD_MC_ERR1 IodMcErr1;

    KIRQL Irql;

#if 0 // CAP/MDP Bug
    IOD_MDPA_SYN IodMdpaSyn;
    IOD_MDPB_SYN IodMdpbSyn;
#endif


//ecrfix - later we should log the error, throttle the logging and turn off
//        correctable error reporting if the frequency is too high

    //
    // The error is expected to be a corrected ECC error on a DMA or
    // Scatter/Gather TLB read/write.  Read the error registers relevant
    // to this error.
    //

    // 
    // Find the IOD that latched the error.
    //

    bfoundIod = bFindIodError( &McDeviceId, &IodCapErr );

#ifdef FORCE_CORRECTABLE_ERROR
    IodCapErr.all = 0x88000000;
    bfoundIod = 1;
#endif  // FORCE_CORRECTABLE_ERROR

    //
    // Check that we found an IOD that has a valid PCI or MC error.
    // If it is not this is a pretty weird (fatal???) condition.
    // For now, we'll just go return TRUE.
    //

    if( !bfoundIod ) {

#if 0 //HALDBG
        DbgPrint( "HalpIodSoftErrorInterrupt: no PCI or MC error found.\n");
#endif
        return;
    }

    //
    // Check if an error is latched into the IOD.  If not, goodbye.
    //

    if( IodCapErr.McErrValid == 0 ){ 

#if HALDBG 
        DbgPrint( "Iod soft error interrupt without valid MC error\n" );
#endif //HALDBG

        return;
    }

    //
    // Check for the correctable error bit. 
    //

    if( (IodCapErr.CrdA == 0) && (IodCapErr.CrdB == 0) ){

#if HALDBG 
        DbgPrint( "Iod soft error interrupt without correctable error indicated in CapErr\n" );
#endif //HALDBG

    }


    //
    // Increment the number of IOD correctable errors.
    //

    IodCorrectedErrors += 1;

    //
    //  Read the rest of the error registers
    //

    IodMcErr0.all = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                            &((PIOD_ERROR_CSRS)(IOD_ERROR0_CSRS_QVA))->McErr0
                                        );

    IodMcErr1.all = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                            &((PIOD_ERROR_CSRS)(IOD_ERROR0_CSRS_QVA))->McErr1
                                        );
#ifdef FORCE_CORRECTABLE_ERROR
    IodMcErr0.all = 0x00bebad0;
    IodMcErr1.all = 0x800f3f00;
#endif  // FORCE_CORRECTABLE_ERROR

#if 0 // CAP/MDP Bug
    IodMdpaStat.all = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                            &((PIOD_ERROR_CSRS)(IOD_ERROR0_CSRS_QVA))->MdpaStat
                                        );

    IodMdpaSyn.all = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                            &((PIOD_ERROR_CSRS)(IOD_ERROR0_CSRS_QVA))->MdpaSyn
                                        );

    IodMdpbStat.all = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                            &((PIOD_ERROR_CSRS)(IOD_ERROR0_CSRS_QVA))->MdpbStat
                                        );

    IodMdpbSyn.all = (ULONG)READ_IOD_REGISTER_NEW( McDeviceId,
                            &((PIOD_ERROR_CSRS)(IOD_ERROR0_CSRS_QVA))->MdpbSyn
                                        );
#endif


#if HALDBG 

    //
    // Print a correctable error message to the debugger.
    //

    DbgPrint( "IOD Correctable Error Number %d, state follows: \n",
                IodCorrectedErrors );
    DbgPrint( "\tIOD_CAP_ERR: 0x%x\n", IodCapErr.all );
    DbgPrint( "\tIOD_MC_ERR0: 0x%x\n", IodMcErr0.all );
    DbgPrint( "\tIOD_MC_ERR1: 0x%x\n", IodMcErr1.all );
//    DbgPrint( "\tIOD_MDPA_STAT: 0x%x\n", IodMdpaStat.all );
//    DbgPrint( "\tIOD_MDPA_SYN:  0x%x\n", IodMdpaSyn.all );
//    DbgPrint( "\tIOD_MDPB_STAT: 0x%x\n", IodMdpbStat.all );
//    DbgPrint( "\tIOD_MDPB_SYN:  0x%x\n", IodMdpbSyn.all );

#endif //HALDBG

    //
    // Fill in the Correctable Error frame only if we've connected 
    // to the Correctable Error interrupt.
    //

    if (HalpLogCorrectableErrors) {

       //
       // Real error, get the interrupt object.
       //

       DispatchCode = (PULONG)PCR->InterruptRoutine[RawhideSoftErrVector];
       InterruptObject = CONTAINING_RECORD(
                               DispatchCode,
                               KINTERRUPT,
                               DispatchCode
                               );

       //
       // Set various pointers so we can use them later.
       //

       pFrame     = (PERROR_FRAME) Frame;
       pTempFrame = (PERROR_FRAME) TempFrame;
       pCorr      = (PCORRECTABLE_ERROR) &pTempFrame->CorrectableFrame;
       pRawCorr   = (PRAWHIDE_CORRECTABLE_FRAME) (TempFrame + 
                                              sizeof(ERROR_FRAME) );

       ErrorlogBusy = (PBOOLEAN)((PUCHAR)InterruptObject->ServiceContext +
                     sizeof(PERROR_FRAME));
       ErrorlogSpinLock = (PKSPIN_LOCK)((PUCHAR)ErrorlogBusy + sizeof(PBOOLEAN));

       //
       // Clear the data structures that we will use.
       //

       RtlZeroMemory(&TempFrame, ENTIRE_FRAME_SIZE);

       //
       // Fill in the error frame information.
       //

       pTempFrame->Signature = ERROR_FRAME_SIGNATURE;
       pTempFrame->LengthOfEntireErrorFrame = ENTIRE_FRAME_SIZE;
       pTempFrame->FrameType = CorrectableFrame;
       pTempFrame->VersionNumber = ERROR_FRAME_VERSION;
       pTempFrame->SequenceNumber = IodCorrectedErrors;
       pTempFrame->PerformanceCounterValue =
         KeQueryPerformanceCounter(NULL).QuadPart;

       //
       // Check for lost error.
       //

       if( IodCapErr.LostMcErr ) {

         //
         // Since the error registers are locked from a previous error,
         // we do not know where the error came from.  Mark everything
         // as UNIDENTIFIED.
         //

         pCorr->Flags.LostCorrectable = 1;
         pCorr->Flags.LostAddressSpace = UNIDENTIFIED;
         pCorr->Flags.LostMemoryErrorSource = UNIDENTIFIED;
      }

       pCorr->Flags.ErrorBitMasksValid = 0;

       //
       // Determine error type.
       //

       if (IodMcErr1.Addr39_32 & 0x80) {

         //
         // I/O ECC error occurred.
         //

         pCorr->Flags.AddressSpace = IO_SPACE;
         pCorr->Flags.ExtendedErrorValid = 1;
         pCorr->ErrorInformation.IoError.Interface = PCIBus;
         pCorr->ErrorInformation.IoError.BusNumber = IodMcErr1.DevId & 0x3;

         // We never alloc PCI address higher than 1 Gb for any PCI
         // address space (sparse mem, dense mem, sparse I/O), so this
         // trick works.

         pCorr->ErrorInformation.IoError.BusAddress.LowPart = 
           ((IodMcErr0.Addr & 0x3FFFFFFF) >> IO_BIT_SHIFT);

         // The code below is not strictly correct.  Based on the MC Bus
         // spec, p.32, we can roughly say that McCmd<3> tells us whether
         // there was a write or read transaction on the bus.  If I looked
         // at the spec harder, I might be able to distinguish a PIO op
         // from a DMA operation.
         
         pCorr->ErrorInformation.IoError.TransferType 
           = ((IodMcErr1.McCmd & 0x8) ? BUS_IO_READ : BUS_IO_WRITE);

       } else {

         //
         // Memory ECC error occurred.
         //

         pCorr->Flags.AddressSpace = MEMORY_SPACE;

       }

       //
       // Get the physical address where the error occurred.
       //

       if (IodMcErr1.Valid) {
          pCorr->Flags.PhysicalAddressValid = 1;
          pCorr->PhysicalAddress =
               ((ULONGLONG) (IodMcErr1.Addr39_32)) << 32;
          pCorr->PhysicalAddress |= IodMcErr0.all;
       }

       //
       // Scrub the error if it's any type of memory error.
       //

       if ( pCorr->Flags.AddressSpace == MEMORY_SPACE &&
            pCorr->Flags.PhysicalAddressValid ) {
          pCorr->Flags.ScrubError = 1;
       }

       //
       // Acquire the spinlock.
       //

       KeAcquireSpinLock(ErrorlogSpinLock, &Irql );

       //
       // Check to see if an errorlog operation is in progress already.
       //

       if (!*ErrorlogBusy) {

         //
         // Set reporting processor information.  Disregard at the moment.
         //

         pCorr->Flags.ProcessorInformationValid = 0;

         // 
         // Copy the SYSTEM_INFORMATION from the uncorrectable frame
         //              

         pCorr->System = PUncorrectableError->UncorrectableFrame.System;

         //
         //
         // Set raw system information flag.  
         //

         pCorr->Flags.SystemInformationValid = 1;

         //
         // Do the Rawhide-specific stuff here
         //

         pRawCorr->Revision = RAWHIDE_CORRECTABLE_FRAME_REVISION;

         //
         // Copy the CUD header from the uncorrectable frame
         //

         rawerr = (PRAWHIDE_UNCORRECTABLE_FRAME)
             PUncorrectableError->UncorrectableFrame.RawSystemInformation;
         if (rawerr) {
             pRawCorr->CudHeader = rawerr->CudHeader;
         }

         //
         // Fill in the rest of the dynamic portion of the
         // correctable frame.
         //

         pRawCorr->CudHeader.ActiveCpus = BuildActiveCpus();
         pRawCorr->ErrorSubpacketFlags.all = 0;
         pRawCorr->ErrorSubpacketFlags.IodSubpacketPresent = 1;
         pRawCorr->WhoAmI = HalpReadWhoAmI();
         HalpBuildIodErrorFrame( McDeviceId, &(pRawCorr->IodErrorFrame) );

         //
         // Copy the information that we need to log.
         //

         RtlCopyMemory(&Frame,
               &TempFrame,
               ENTIRE_FRAME_SIZE);

         pFrame->CorrectableFrame.RawSystemInformation = 
              (PVOID)((PUCHAR)pFrame + sizeof(ERROR_FRAME) );

         pFrame->CorrectableFrame.RawSystemInformationLength = 
              sizeof(RAWHIDE_CORRECTABLE_FRAME);


         //
         // Put frame into ISR service context.
         //

         *(PERROR_FRAME *)InterruptObject->ServiceContext = pFrame;

       } else {

         //
         // An errorlog operation is in progress already.  We will
         // set various lost bits and then get out without doing
         // an actual errorloging call.
         //

         pFrame->CorrectableFrame.Flags.LostCorrectable = TRUE;
         pFrame->CorrectableFrame.Flags.LostAddressSpace =
           pTempFrame->CorrectableFrame.Flags.AddressSpace;
         pFrame->CorrectableFrame.Flags.LostMemoryErrorSource =
           pTempFrame->CorrectableFrame.Flags.MemoryErrorSource;
       }

       //
       // Release the spinlock.
       //

       KeReleaseSpinLock(ErrorlogSpinLock, Irql );

       //
       // Dispatch to the secondary correctable interrupt service routine.
       // The assumption here is that if this interrupt ever happens, then
       // some driver enabled it, and the driver should have the ISR connected.
       //

       ((PSECOND_LEVEL_DISPATCH)InterruptObject->DispatchAddress)(
                               InterruptObject,
                               InterruptObject->ServiceContext
                               );

    }


    //
    // Clear state in MDPA and MDPB before clearing CAP_ERR
    //

    IodCapErr.all = 0;
    IodCapErr.CrdA = 1;
    IodCapErr.CrdB = 1;
    IodMdpaStat.all = 0xffffffff;
    IodMdpbStat.all = 0xffffffff;

    WRITE_IOD_REGISTER_NEW( McDeviceId, 
                        &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpaStat,
                        IodMdpaStat.all
                      );

    WRITE_IOD_REGISTER_NEW( McDeviceId, 
                        &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->MdpaStat,
                        IodMdpbStat.all
                      );

    WRITE_IOD_REGISTER_NEW( McDeviceId, 
                        &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr,
                        IodCapErr.all
                      );

    return;

}

VOID
HalpIodHardErrorInterrupt(
    VOID
    )
/*++

Routine Description:

    Service an IOD hard (uncorrectable) error interrupt.

    Runs at HIGH_LEVEL while holding HalpSystemInterruptLock.  The
    interrupt is dismissed when no IOD reports a latched error or when
    bHandleIsaError accepts the error as an ISA legacy space access.
    Any other error is reported as fatal and the system bugchecks.

Arguments:

    None.

Return Value:

    None.  On the fatal path KeBugCheckEx is called.

--*/
{
    MC_DEVICE_ID FaultingIod;
    IOD_CAP_ERR LatchedCapErr;
    KIRQL SavedIrql;

    //
    // Block further hard error interrupts by going to the highest IRQL,
    // and serialize entry to this code with the system interrupt lock.
    //

    KeRaiseIrql(HIGH_LEVEL, &SavedIrql);
    KiAcquireSpinLock(&HalpSystemInterruptLock);

    //
    // Two conditions dismiss the interrupt, evaluated in this order:
    //
    //  1. No IOD has a valid PCI or MC error latched.  This is a pretty
    //     weird (fatal???) condition, but for now we just return.
    //
    //  2. The error was an ISA legacy space access on PCI-1,2,3
    //     (ecrfix).  This is only examined when an IOD was found.
    //

    if( !bFindIodError( &FaultingIod, &LatchedCapErr ) ||
        bHandleIsaError( FaultingIod, LatchedCapErr ) ) {

        //
        // Drop the lock and restore the caller's IRQL.
        //

        KiReleaseSpinLock(&HalpSystemInterruptLock);
        KeLowerIrql(SavedIrql);
        return;
    }

#if HALDBG
    DbgPrint( "Hard Error Found on IOD (%x, %x)\n", 
             FaultingIod.Gid,
             FaultingIod.Mid );
#endif //HALDBG

    //
    // Remember WhoAmI as read at the time of the error.
    //

    HalpIodWhoAmIOnError.all = HalpReadWhoAmI();

    //
    // Report the fatal error, then take the system down.
    //

    bHandleFatalIodError( FaultingIod, FALSE );

    KeBugCheckEx( DATA_BUS_ERROR,
                  0xbeadfeed,	     //ecrfix - quick error interrupt id
                  FaultingIod.all,
                  0,
                  (ULONG) PUncorrectableError );

}

BOOLEAN
bHandleFatalIodError(
    MC_DEVICE_ID McDeviceId,
    BOOLEAN bMachineCheck
    )
/*++

Routine Description:

    Common epilogue for a fatal IOD uncorrectable error, reached from
    either the machine check path or the IOD hard error interrupt.
    Clears the MCES error state, takes over the display, prints the
    fatal error banner and reports the error for the given IOD.

Arguments:

    McDeviceId - IOD on which the error was found.

    bMachineCheck - TRUE when called for a fatal machine check,
                    FALSE when called for a fatal hard error interrupt.
                    Only used to select the debug message.

Return Value:

    Always FALSE: the error has not been dismissed and execution
    should not continue.

--*/
{

#if HALDBG
    DbgPrint( bMachineCheck
                  ? "Handling fatal error - machine check\n"
                  : "Handling fatal error - hard error interrupt\n" );
#endif

    //
    // Clear the error condition in the MCES register.
    //
    // ecrfix - this also happens for hard error interrupts, where no
    // machine check was taken.  Hopefully that is benign.
    //

    HalpUpdateMces( TRUE, TRUE );

    //
    // Take the display and put up the dreaded banner.
    //

    HalAcquireDisplayOwnership(NULL);
    HalDisplayString( "\nFatal system hardware error.\n" );

#ifdef DUMPIODS
    DumpAllIods(AllRegisters);
#endif

    //
    // Report the error state of the offending IOD.
    //

    HalpIodReportFatalError( McDeviceId );

    return FALSE;

} 

BOOLEAN
bFindIodError( 
   PMC_DEVICE_ID pMcDeviceId,  
   PIOD_CAP_ERR pIodCapErr
)
/*++

Routine Description:

    Determines which IOD has an error latched in it by enumerating the
    IODs in HalpIodMask and reading CAP_ERR on each one.  The search
    stops at the first IOD whose CAP_ERR has PciErrValid or McErrValid
    set.

Arguments:

    pMcDeviceId - Receives the MC device id held by the enumeration
                  context when the search ended.

    pIodCapErr - Receives the last CAP_ERR value read, or zero if no
                 IOD was enumerated at all.

    Both outputs are written whether or not an error was found; they
    only identify an IOD with an error when TRUE is returned.

Return Value:

    TRUE if an IOD was found with an error latched in CAP_ERR.
    FALSE otherwise.

--*/
{
    MC_ENUM_CONTEXT mcCtx;
    ULONG numIods;
    BOOLEAN bfoundIod;
    IOD_CAP_ERR IodCapErr;

    //
    // Start from a defined value so that the copy to *pIodCapErr below
    // never reads an uninitialized local when the enumeration is empty.
    //

    IodCapErr.all = 0;

    //
    // Initialize enumerator.
    //

    numIods = HalpMcBusEnumStart ( HalpIodMask, &mcCtx );

#if 0 // HALDBG
    DbgPrint( "FindIodError:  Searching: %d Iods: ", numIods);
#endif // HALDBG

    //
    // Search each Iod and look for a PCI or McBus error.
    //

    while ( (bfoundIod = HalpMcBusEnum( &mcCtx )) != FALSE ) {

       //
       // Read the IOD error register to determine the source of the
       // error.
       //

#if 0 //HALDBG
       DbgPrint( "(%d, %d) ", mcCtx.McDeviceId.Gid, mcCtx.McDeviceId.Mid);
#endif // HALDBG

       IodCapErr.all = READ_IOD_REGISTER_NEW( mcCtx.McDeviceId,
                       &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr );

       if( (IodCapErr.PciErrValid != 0) || (IodCapErr.McErrValid != 0) ){
         break;
       }
    }

#if 0 // HALDBG
    if (bfoundIod) {
      DbgPrint( "Found!\n");
    } else {
      DbgPrint( "Error Not Found!\n");
    }
#endif // HALDBG

    //
    // Return the McDeviceId and CapErr register contents 
    // of the first IOD that has an error.  (When none was found the
    // caller must ignore these and rely on the return value.)
    //

    *pMcDeviceId = mcCtx.McDeviceId;
    pIodCapErr->all = IodCapErr.all;
 
    return (bfoundIod);
}

BOOLEAN
bHandleIsaError( 
   MC_DEVICE_ID McDeviceId,  
   IOD_CAP_ERR IodCapErrIn
)
/*++

Routine Description:

    Gives PCI-1,2,3 ISA legacy semantics for I/O and memory accesses.

    The routine searches every IOD in HalpIodMask for a CAP_ERR that
    shows a PCI master abort and no Perr, Serr or PteInv.  If such an
    IOD exists, is not PCI0, and the faulting PCI address recorded in
    PCI_ERR1 is below 1 Mb, the error is dismissed by clearing all
    CAP_ERR errors on all IODs.

Arguments:

    McDeviceId - Not referenced by this routine; it performs its own
                 search over all IODs.

    IodCapErrIn - Not referenced by this routine; CAP_ERR is re-read
                  from each IOD.

Return Value:

    TRUE if the error was handled.
    FALSE otherwise.

--*/
{

    MC_ENUM_CONTEXT EnumContext;
    MC_DEVICE_ID    MabIod;
    ULONG IodCount;
    BOOLEAN MabFound;
    IOD_CAP_ERR CapErr;
    IOD_CAP_ERR ClearMask;
    IOD_PCI_ERR1 PciErr1;

    //
    // Walk the IODs looking for one whose only PCI error is a master
    // abort.
    //

    IodCount = HalpMcBusEnumStart ( HalpIodMask, &EnumContext );

    while ( MabFound = HalpMcBusEnum( &EnumContext ) ) {

       //
       // Read this IOD's error register and test for a lone Mab.
       //

       CapErr.all = READ_IOD_REGISTER_NEW( EnumContext.McDeviceId,
                       &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->CapErr );

       if( (CapErr.PciErrValid == 1) && 
            (CapErr.Perr == 0)       &&
            (CapErr.Serr == 0)       &&
            (CapErr.Mab == 1)        &&
            (CapErr.PteInv == 0) ) {
         break;
       }
    }

    //
    // No IOD with Mab set means this is not our kind of error.
    //

    if (!MabFound) {
       return FALSE;
    }

    //
    // Remember which IOD took the master abort.
    //

    MabIod.all = EnumContext.McDeviceId.all;

    //
    // Only buses other than PCI0 are handled here.  (PCI0 reads to
    // non-existent ISA addresses will be fixed by the PCI-EISA bridge,
    // so a Mab on PCI0 really is an error.)
    //

    if ( MabIod.Mid == MidPci0 ) {
       return FALSE;
    }

    //
    // Get the PCI address of the transaction that caused the MAB.
    //

    PciErr1.PciAddress = 
           (ULONG) READ_IOD_REGISTER_NEW( MabIod,
                   &((PIOD_ERROR_CSRS)(IOD_ERROR_CSRS_QVA))->PciErr1 );

    //
    // To be handled as an ISA legacy memory or I/O space read, the 
    // faulting PCI address must be in the range 0-1 Mb.
    //

    if( PciErr1.PciAddress < __1MB ) {

        //
        // The error has matched all of our conditions.  Assume that
        // V0 has already been set to 0xffffffff.  (This is a contract
        // with the HAL access routines in iodio.s.)
        // 

        ClearMask.all = ALL_CAP_ERRORS;
        HalpClearAllIods( ClearMask );

        return TRUE;
    }

#if HALDBG
    DbgPrint( "Failed checking for legacy ISA read:\n");
    DbgPrint( "PciErr1 : %08x\n", PciErr1.PciAddress );
#endif //HALDBG

    //
    // Master abort outside the ISA legacy range.  Do not handle it.
    //

    return FALSE;

}

VOID
HalpErrorFrameString(
    PUNCORRECTABLE_ERROR uncorr,
    PUCHAR OutBuffer
    )
/*++

Routine Description:

    Append an error message to the ErrorString of an uncorrectable
    error frame.  The position of the next append is kept in a static
    cursor across calls.

Arguments:

    uncorr - Pointer to the UNCORRECTABLE_ERROR frame.  If NULL,
             nothing is appended.

    OutBuffer - Zero-terminated message to be appended.
             If NULL, no string is appended; the static cursor is reset
             and, when uncorr is not NULL, its ErrorStringValid flag
             is cleared.

Return Value:

    none.

--*/
{
    static PCHAR pAppendPosition = NULL;
    ULONG MessageLength;

    //
    // A NULL message is the request to start over.
    //

    if (OutBuffer == NULL) {
       pAppendPosition = NULL;
       if (uncorr) {
          uncorr->Flags.ErrorStringValid = 0;
       }
       return;
    }

    //
    // Without a frame there is nowhere to put the message.
    //

    if (!uncorr) {
       return;
    }

    // 
    // First message since the last reset: start at the beginning of
    // ErrorString and mark the string valid.
    //

    if (pAppendPosition == NULL) {
       pAppendPosition = uncorr->ErrorString;
       uncorr->Flags.ErrorStringValid = 1;      
    }

    //
    // Copy the message body (without its terminator) at the cursor.
    //
    // NOTE(review): nothing here limits the copy to the space left in
    // ErrorString - confirm callers cannot overflow it.
    //

    MessageLength = strlen(OutBuffer);
    strncpy(pAppendPosition, 
            OutBuffer, 
            MessageLength); 

    //
    // Advance the cursor and zero-terminate the error string there, so
    // the next append overwrites the terminator.
    //

    pAppendPosition += MessageLength;
    *pAppendPosition = 0;
}

ULONG 
BuildActiveCpus (
    VOID
    )
/*++

Routine Description:

    Build a mask of active physical CPUs from the mask of active
    logical processors (HalpActiveProcessors).  For each of the first
    HalpNumberOfCpus logical processors whose bit is set, the bit for
    the physical CPU given by HalpLogicalToPhysicalProcessor is set in
    the result.

Arguments:

    None.

Return Value:

    Bit mask of active physical CPUs.

--*/
{
    ULONG ActiveLogicalProcessors = HalpActiveProcessors;
    ULONG ActivePhysicalCpus = 0;
    ULONG i;

    //
    // Make a physical processor mask from the logical processor mask.
    // The logical mask is shifted right once per iteration so that
    // bit 0 always corresponds to logical processor i.
    //

    for (i = 0; i < HalpNumberOfCpus; i++, ActiveLogicalProcessors >>= 1) {
        if (ActiveLogicalProcessors & 0x1) {
            ActivePhysicalCpus |=  (1 << (ULONG) (MCDEVID_TO_PHYS_CPU( 
                 HalpLogicalToPhysicalProcessor[i].all)));
        }
    }

    return (ActivePhysicalCpus);

}