mirror of https://github.com/lianthony/NT4.0
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
677 lines
17 KiB
677 lines
17 KiB
/*++
|
|
|
|
Copyright (c) 1994 Digital Equipment Corporation
|
|
|
|
Module Name:
|
|
|
|
ev5mchk.c
|
|
|
|
Abstract:
|
|
|
|
This module implements generalized machine check handling for
|
|
platforms based on the DECchip 21164 (EV5) microprocessor.
|
|
|
|
Author:
|
|
|
|
Joe Notarangelo 30-Jun-1994
|
|
|
|
Environment:
|
|
|
|
Kernel mode only.
|
|
|
|
Revision History:
|
|
|
|
--*/
|
|
|
|
#include "halp.h"
|
|
#include "axp21164.h"
|
|
#include "stdio.h"
|
|
|
|
|
|
//
|
|
// Declare the extern variable UncorrectableError declared in
|
|
// inithal.c.
|
|
//
|
|
extern PERROR_FRAME PUncorrectableError;
|
|
|
|
|
|
VOID
|
|
HalpDisplayLogout21164(
|
|
IN PLOGOUT_FRAME_21164 LogoutFrame );
|
|
|
|
BOOLEAN
|
|
HalpPlatformMachineCheck(
|
|
IN PEXCEPTION_RECORD ExceptionRecord,
|
|
IN PKEXCEPTION_FRAME ExceptionFrame,
|
|
IN PKTRAP_FRAME TrapFrame
|
|
);
|
|
|
|
VOID
|
|
HalpUpdateMces(
|
|
IN BOOLEAN ClearMachineCheck,
|
|
IN BOOLEAN ClearCorrectableError
|
|
);
|
|
|
|
//
|
|
// System-wide controls for machine check reporting.
|
|
//
|
|
|
|
ProcessorCorrectableDisable = FALSE;
|
|
SystemCorrectableDisable = FALSE;
|
|
MachineCheckDisable = FALSE;
|
|
|
|
//
|
|
// Error counts.
|
|
//
|
|
|
|
ULONG CorrectableErrors = 0;
|
|
ULONG RetryableErrors = 0;
|
|
|
|
VOID
|
|
HalpSetMachineCheckEnables(
|
|
IN BOOLEAN DisableMachineChecks,
|
|
IN BOOLEAN DisableProcessorCorrectables,
|
|
IN BOOLEAN DisableSystemCorrectables
|
|
)
|
|
/*++
|
|
|
|
Routine Description:
|
|
|
|
This function sets the enables that define which machine check
|
|
errors will be signaled by the processor.
|
|
|
|
N.B. - The system has the capability to ignore all machine checks
|
|
by indicating DisableMachineChecks = TRUE. This is intended
|
|
for debugging purposes on broken hardware. If you disable
|
|
this you will get no machine check no matter what error the
|
|
system/processor detects. Consider the consequences.
|
|
|
|
Arguments:
|
|
|
|
DisableMachineChecks - Supplies a boolean which indicates if all
|
|
machine checks should be disabled and not
|
|
reported. (see note above).
|
|
|
|
DisableProcessorCorrectables - Supplies a boolean which indicates if
|
|
processor correctable error reporting
|
|
should be disabled.
|
|
DisableSystemCorrectables - Supplies a boolean which indicates if
|
|
system correctable error reporting
|
|
should be disabled.
|
|
|
|
Return Value:
|
|
|
|
None.
|
|
|
|
--*/
|
|
{
|
|
|
|
|
|
ProcessorCorrectableDisable = DisableProcessorCorrectables;
|
|
SystemCorrectableDisable = DisableSystemCorrectables;
|
|
MachineCheckDisable = DisableMachineChecks;
|
|
|
|
HalpUpdateMces( FALSE, FALSE );
|
|
|
|
return;
|
|
}
|
|
|
|
VOID
|
|
HalpUpdateMces(
|
|
IN BOOLEAN ClearMachineCheck,
|
|
IN BOOLEAN ClearCorrectableError
|
|
)
|
|
/*++
|
|
|
|
Routine Description:
|
|
|
|
This function updates the state of the MCES internal processor
|
|
register.
|
|
|
|
Arguments:
|
|
|
|
ClearMachineCheck - Supplies a boolean that indicates if the machine
|
|
check indicator in the MCES should be cleared.
|
|
|
|
ClearCorrectableError - Supplies a boolean that indicates if the
|
|
correctable error indicators in the MCES should
|
|
be cleared.
|
|
|
|
Return Value:
|
|
|
|
None.
|
|
|
|
--*/
|
|
{
|
|
MCES Mces;
|
|
|
|
Mces.MachineCheck = ClearMachineCheck;
|
|
Mces.SystemCorrectable = ClearCorrectableError;
|
|
Mces.ProcessorCorrectable = ClearCorrectableError;
|
|
Mces.DisableProcessorCorrectable = ProcessorCorrectableDisable;
|
|
Mces.DisableSystemCorrectable = SystemCorrectableDisable;
|
|
Mces.DisableMachineChecks = MachineCheckDisable;
|
|
|
|
HalpWriteMces( Mces );
|
|
|
|
}
|
|
|
|
|
|
BOOLEAN
|
|
HalMachineCheck (
|
|
IN PEXCEPTION_RECORD ExceptionRecord,
|
|
IN PKEXCEPTION_FRAME ExceptionFrame,
|
|
IN PKTRAP_FRAME TrapFrame
|
|
)
|
|
/*++
|
|
|
|
Routine Description:
|
|
|
|
This function fields machine check for 21164-based machines.
|
|
|
|
Arguments:
|
|
|
|
ExceptionRecord - Supplies a pointer to the exception record for the
|
|
machine check. Included in the exception information
|
|
is the pointer to the logout frame.
|
|
|
|
ExceptionFrame - Supplies a pointer to the kernel exception frame.
|
|
|
|
TrapFrame - Supplies a pointer to the kernel trap frame.
|
|
|
|
Return Value:
|
|
|
|
A value of TRUE is returned if the machine check has been
|
|
handled by the HAL. If it has been handled then execution may
|
|
resume at the faulting address. Otherwise, a value of FALSE
|
|
is returned.
|
|
|
|
N.B. - Under some circumstances this routine may not return at
|
|
all.
|
|
|
|
--*/
|
|
|
|
{
|
|
|
|
BOOLEAN Handled;
|
|
PLOGOUT_FRAME_21164 LogoutFrame;
|
|
PMCHK_STATUS MachineCheckStatus;
|
|
MCES Mces;
|
|
PICPERR_STAT_21164 icPerrStat;
|
|
PDC_PERR_STAT_21164 dcPerrStat;
|
|
PSC_STAT_21164 scStat;
|
|
PEI_STAT_21164 eiStat;
|
|
BOOLEAN UnhandledPlatformError = FALSE;
|
|
|
|
PUNCORRECTABLE_ERROR uncorrerr = NULL;
|
|
PPROCESSOR_EV5_UNCORRECTABLE ev5uncorr = NULL;
|
|
|
|
//
|
|
// Check for retryable errors. These are usually I-stream parity
|
|
// errors, which may be retried following a cache flush (the cache
|
|
// flush is handled by the PAL).
|
|
//
|
|
|
|
MachineCheckStatus =
|
|
(PMCHK_STATUS)&ExceptionRecord->ExceptionInformation[0];
|
|
|
|
//
|
|
// Handle any retryable errors.
|
|
//
|
|
|
|
if( MachineCheckStatus->Retryable == 1 ){
|
|
|
|
//
|
|
// Log the error.
|
|
//
|
|
|
|
RetryableErrors += 1;
|
|
|
|
#if (DBG) || (HALDBG)
|
|
|
|
if( (RetryableErrors % 32) == 0 ){
|
|
DbgPrint( "HAL Retryable Errors = %d\n", RetryableErrors );
|
|
}
|
|
|
|
#endif //DBG || HALDBG
|
|
|
|
//
|
|
// Acknowledge receipt of the retryable machine check.
|
|
//
|
|
|
|
HalpUpdateMces( TRUE, TRUE );
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
//
|
|
// Capture the logout frame pointer.
|
|
//
|
|
|
|
LogoutFrame =
|
|
(PLOGOUT_FRAME_21164)ExceptionRecord->ExceptionInformation[1];
|
|
|
|
//
|
|
// Check for any hard errors that cannot be dismissed.
|
|
// They are:
|
|
// Tag parity error
|
|
// Tag control parity error
|
|
// Multiple external errors
|
|
// Fill ECC error
|
|
// Fill parity error
|
|
// Multiple fill errors
|
|
//
|
|
|
|
icPerrStat = (PICPERR_STAT_21164)&LogoutFrame->IcPerrStat;
|
|
dcPerrStat = (PDC_PERR_STAT_21164)&LogoutFrame->DcPerrStat;
|
|
scStat = (PSC_STAT_21164)&LogoutFrame->ScStat;
|
|
eiStat = (PEI_STAT_21164)&LogoutFrame->EiStat;
|
|
|
|
if(PUncorrectableError) {
|
|
//
|
|
// Fill in the processor specific uncorrectable error frame
|
|
//
|
|
uncorrerr = (PUNCORRECTABLE_ERROR)
|
|
&PUncorrectableError->UncorrectableFrame;
|
|
|
|
//
|
|
// first fill in some generic processor Information.
|
|
// For the Current (Reporting) Processor.
|
|
//
|
|
HalpGetProcessorInfo(&uncorrerr->ReportingProcessor);
|
|
uncorrerr->Flags.ProcessorInformationValid = 1;
|
|
|
|
ev5uncorr = (PPROCESSOR_EV5_UNCORRECTABLE)
|
|
uncorrerr->RawProcessorInformation;
|
|
}
|
|
if(ev5uncorr){
|
|
ev5uncorr->IcPerrStat = LogoutFrame->IcPerrStat.all;
|
|
ev5uncorr->DcPerrStat = LogoutFrame->DcPerrStat.all;
|
|
ev5uncorr->ScStat = LogoutFrame->ScStat.all;
|
|
ev5uncorr->ScAddr = LogoutFrame->ScAddr.all;
|
|
ev5uncorr->EiStat = LogoutFrame->EiStat.all;
|
|
ev5uncorr->BcTagAddr = LogoutFrame->BcTagAddr.all;
|
|
ev5uncorr->EiAddr = LogoutFrame->EiAddr.all;
|
|
ev5uncorr->FillSyn = LogoutFrame->FillSyn.all;
|
|
ev5uncorr->BcConfig = LogoutFrame->BcConfig.all;
|
|
ev5uncorr->BcControl = LogoutFrame->BcControl.all;
|
|
}
|
|
|
|
//
|
|
// SjBfix. The External parity error checking is disabled due to bug
|
|
// Rattler chipset on Gamma which causes the parity error on
|
|
// machine checks due to reads to PCI config space. (fixed in pass 2)
|
|
//
|
|
|
|
if ( icPerrStat->Dpe == 1 || icPerrStat->Tpe == 1 ||
|
|
icPerrStat->Tmr == 1 || dcPerrStat->Lock == 1 ||
|
|
scStat->ScTperr == 1 || scStat->ScDperr == 1 ||
|
|
eiStat->BcTperr == 1 || eiStat->BcTcperr == 1 ||
|
|
// eiStat->UncEccErr == 1 || eiStat->EiParErr == 1 ||
|
|
eiStat->SeoHrdErr == 1 || scStat->ScScndErr == 1 ){
|
|
|
|
//
|
|
// A serious, uncorrectable error has occured, under no circumstances
|
|
// can it be simply dismissed.
|
|
//
|
|
|
|
goto FatalError;
|
|
|
|
}
|
|
|
|
//
|
|
// It is possible that the system has experienced a hard error and
|
|
// that nonetheless the error is recoverable. This is a system-specific
|
|
// decision - allow it to be handled as such.
|
|
//
|
|
|
|
UnhandledPlatformError = TRUE;
|
|
if( (Handled = HalpPlatformMachineCheck(
|
|
ExceptionRecord,
|
|
ExceptionFrame,
|
|
TrapFrame )) == TRUE ){
|
|
|
|
//
|
|
// The system-specific code has handled the error. Dismiss
|
|
// the error and continue execution.
|
|
//
|
|
|
|
HalpUpdateMces( TRUE, TRUE );
|
|
|
|
return TRUE;
|
|
|
|
}
|
|
|
|
//
|
|
// The system has experienced a fatal error that cannot be corrected.
|
|
// Print any possible relevant information and crash the system.
|
|
//
|
|
// N.B. - In the future some of these fatal errors could be potential
|
|
// recovered. Example, a user process gets a fatal error on one
|
|
// of its pages - we kill the user process, mark the page as bad
|
|
// and continue executing.
|
|
//
|
|
|
|
FatalError:
|
|
|
|
uncorrerr->Flags.ErrorStringValid = 1;
|
|
sprintf(uncorrerr->ErrorString,"Uncorrectable Error From "
|
|
"Processor Detected");
|
|
//
|
|
// Begin the error output if this is a processor error. If this is
|
|
// an unhandled platform error than that code is responsible for
|
|
// beginning the error output.
|
|
//
|
|
|
|
if( UnhandledPlatformError == FALSE ){
|
|
|
|
//
|
|
// Acquire ownership of the display. This is done here in case we take
|
|
// a machine check before the display has been taken away from the HAL.
|
|
// When the HAL begins displaying strings after it has lost the
|
|
// display ownership then the HAL will be careful not to scroll
|
|
// information off of the screen.
|
|
//
|
|
|
|
HalAcquireDisplayOwnership(NULL);
|
|
|
|
//
|
|
// Display the dreaded banner.
|
|
//
|
|
|
|
HalDisplayString( "\nFatal system hardware error.\n\n" );
|
|
|
|
}
|
|
|
|
//
|
|
// Display the EV5 logout frame.
|
|
//
|
|
|
|
HalpDisplayLogout21164( LogoutFrame );
|
|
|
|
//
|
|
// Bugcheck to dump the rest of the machine state, this will help
|
|
// if the machine check is software-related.
|
|
//
|
|
|
|
KeBugCheckEx( DATA_BUS_ERROR,
|
|
(ULONG)MachineCheckStatus->Correctable,
|
|
(ULONG)MachineCheckStatus->Retryable,
|
|
0,
|
|
(ULONG)PUncorrectableError );
|
|
|
|
}
|
|
|
|
#define MAX_ERROR_STRING 100
|
|
|
|
VOID
|
|
HalpDisplayLogout21164 (
|
|
IN PLOGOUT_FRAME_21164 LogoutFrame
|
|
)
|
|
|
|
/*++
|
|
|
|
Routine Description:
|
|
|
|
This function displays the logout frame for a 21164.
|
|
|
|
Arguments:
|
|
|
|
LogoutFrame - Supplies a pointer to the logout frame generated
|
|
by the 21164.
|
|
Return Value:
|
|
|
|
None.
|
|
|
|
--*/
|
|
|
|
{
|
|
UCHAR OutBuffer[ MAX_ERROR_STRING ];
|
|
|
|
sprintf( OutBuffer, "ICSR : %016Lx ICPERR_STAT : %016Lx\n",
|
|
LogoutFrame->Icsr.all, LogoutFrame->IcPerrStat.all );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
sprintf( OutBuffer, "MM_STAT : %016Lx DC_PERR_STAT : %016Lx\n",
|
|
LogoutFrame->MmStat.all,
|
|
LogoutFrame->DcPerrStat.all );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
|
|
sprintf( OutBuffer, "PS : %016Lx VA : %016Lx VA_FORM : %016Lx\n",
|
|
LogoutFrame->Ps,
|
|
LogoutFrame->Va,
|
|
LogoutFrame->VaForm );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
|
|
sprintf( OutBuffer, "ISR : %016Lx IPL : %016Lx INTID : %016Lx\n",
|
|
LogoutFrame->Isr.all,
|
|
LogoutFrame->Ipl,
|
|
LogoutFrame->IntId );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
|
|
sprintf( OutBuffer, "SC_STAT : %016Lx SC_CTL : %016Lx SC_ADDR : %016Lx\n",
|
|
LogoutFrame->ScStat.all,
|
|
LogoutFrame->ScCtl.all,
|
|
LogoutFrame->ScAddr.all );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
|
|
sprintf( OutBuffer, "EI_STAT : %016Lx EI_ADDR : %016Lx\n",
|
|
LogoutFrame->EiStat.all, LogoutFrame->EiAddr.all );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
|
|
sprintf( OutBuffer, "BC_TAG_ADDR : %016Lx FILL_SYN : %016Lx\n",
|
|
LogoutFrame->BcTagAddr.all, LogoutFrame->FillSyn.all );
|
|
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
|
|
sprintf( OutBuffer, "BC_CONTROL : %016Lx BC_CONFIG : %016Lx\n",
|
|
LogoutFrame->BcControl.all, LogoutFrame->BcConfig.all );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
|
|
sprintf( OutBuffer, "EXC_ADDR : %016Lx PAL_BASE : %016Lx\n",
|
|
LogoutFrame->ExcAddr, LogoutFrame->PalBase );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
//
|
|
// Print out interpretation of the error.
|
|
//
|
|
|
|
HalDisplayString( "\n" );
|
|
|
|
//
|
|
// Check for tag parity error.
|
|
//
|
|
|
|
if ( LogoutFrame->IcPerrStat.Dpe == 1 ||
|
|
LogoutFrame->IcPerrStat.Tpe == 1 ){
|
|
|
|
//
|
|
// Note: The excAddr may contain the address of the instruction
|
|
// the caused the parity error but it is not guaranteed:
|
|
//
|
|
sprintf( OutBuffer, "Icache %s parity error, Addr: %x\n",
|
|
LogoutFrame->IcPerrStat.Dpe ? "Data" : "Tag",
|
|
LogoutFrame->ExcAddr );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
} else if ( LogoutFrame->DcPerrStat.Lock == 1 ){
|
|
|
|
sprintf( OutBuffer, "Dcache %s parity error, Addr: %x\n",
|
|
LogoutFrame->DcPerrStat.Dp0 || LogoutFrame->DcPerrStat.Dp1 ?
|
|
"Data" : "Tag",
|
|
LogoutFrame->Va );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
} else if ( LogoutFrame->ScStat.ScTperr != 0 ) {
|
|
|
|
sprintf( OutBuffer,
|
|
"Scache Tag parity error, Addr: %x Tag: %x Cmd: %x\n",
|
|
LogoutFrame->ScAddr.ScAddr,
|
|
LogoutFrame->ScStat.ScTperr,
|
|
LogoutFrame->ScStat.CboxCmd);
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
|
|
} else if ( LogoutFrame->ScStat.ScDperr != 0 ) {
|
|
|
|
sprintf( OutBuffer,
|
|
"Scache Data parity error, Addr: %x Tag: %x Cmd: %x\n",
|
|
LogoutFrame->ScAddr.ScAddr,
|
|
LogoutFrame->ScStat.ScDperr,
|
|
LogoutFrame->ScStat.CboxCmd);
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
|
|
} else if ( LogoutFrame->EiStat.BcTperr == 1 ||
|
|
LogoutFrame->EiStat.BcTcperr == 1 ){
|
|
|
|
sprintf( OutBuffer,
|
|
"Bcache Tag Parity error, Addr: %x Tag: %x\n",
|
|
LogoutFrame->EiAddr.EiAddr,
|
|
LogoutFrame->BcTagAddr.Tag1);
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
}
|
|
|
|
//
|
|
// Check for timeout reset error:
|
|
//
|
|
|
|
if ( LogoutFrame->IcPerrStat.Tmr == 1 ){
|
|
|
|
sprintf( OutBuffer, "Timeout Reset Error\n" );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
}
|
|
|
|
//
|
|
// Check for fill ECC errors.
|
|
//
|
|
|
|
if( LogoutFrame->EiStat.UncEccErr == 1 ){
|
|
|
|
sprintf( OutBuffer, "Uncorrectable ECC error: %s\n",
|
|
LogoutFrame->EiStat.FilIrd ? "Icache Fill" : "Dcache Fill" );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
sprintf( OutBuffer,
|
|
"PA: %16Lx Longword0: %x Longword1: %x\n",
|
|
LogoutFrame->EiAddr.EiAddr,
|
|
LogoutFrame->FillSyn.Lo,
|
|
LogoutFrame->FillSyn.Hi );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
}
|
|
|
|
//
|
|
// Check for address/command parity error
|
|
//
|
|
|
|
if( LogoutFrame->EiStat.EiParErr == 1 ){
|
|
|
|
sprintf( OutBuffer, "Address/Command parity error, Addr=%x\n",
|
|
LogoutFrame->EiAddr.EiAddr );
|
|
|
|
HalDisplayString( OutBuffer );
|
|
|
|
}
|
|
|
|
//
|
|
// Check for multiple hard errors.
|
|
//
|
|
|
|
if ( LogoutFrame->ScStat.ScScndErr == 1 ){
|
|
|
|
HalDisplayString( "Multiple Scache parity errors detected.\n" );
|
|
}
|
|
|
|
if( LogoutFrame->EiStat.SeoHrdErr == 1 ){
|
|
|
|
HalDisplayString( "Multiple external/tag errors detected.\n" );
|
|
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
BOOLEAN
|
|
Halp21164CorrectedErrorInterrupt (
|
|
VOID
|
|
)
|
|
|
|
/*++
|
|
|
|
Routine Description:
|
|
|
|
This is the interrupt handler for the 21164 processor corrected error
|
|
interrupt.
|
|
|
|
Arguments:
|
|
|
|
None.
|
|
|
|
Return Value:
|
|
|
|
None.
|
|
|
|
--*/
|
|
|
|
{
|
|
//
|
|
// Handle any processor correctable errors.
|
|
//
|
|
|
|
|
|
//
|
|
// Log the error.
|
|
//
|
|
// simply assume this was a fill ecc correctable for now, print
|
|
// a debug message periodically
|
|
|
|
CorrectableErrors += 1;
|
|
|
|
#if 0 //jnfix
|
|
#if (DBG) || (HALDBG)
|
|
|
|
if( (CorrectableErrors % 32) == 0 ){
|
|
DbgPrint( "Correctable errors = %d\n", CorrectableErrors );
|
|
}
|
|
|
|
#endif //DBG || HALDBG
|
|
#endif //0 jnfix
|
|
|
|
//
|
|
// Acknowledge receipt of the correctable error by clearing
|
|
// the error in the MCES register.
|
|
//
|
|
|
|
HalpUpdateMces( FALSE, TRUE );
|
|
|
|
return TRUE;
|
|
|
|
}
|