Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1163 lines
30 KiB

  1. /*++
  2. Copyright (c) 2002 Microsoft Corporation
  3. Module Name:
  4. mcheck.c
  5. Abstract:
  6. This module implements machine check functions for the AMD64 platform.
  7. Author:
  8. David N. Cutler (davec) 18-May-2002
  9. Environment:
  10. Kernel mode.
  11. --*/
  12. #include <bugcodes.h>
  13. #include <halp.h>
  14. #include <stdlib.h>
  15. #include <stdio.h>
  16. #include <nthal.h>
  17. //
  18. // Define the repeated-error limits used by HalpGetMcaLog.
  19. //
  20. #define MAX_CACHE_LIMIT 3 // reports of the same error before the cache is written back and invalidated
  21. #define MIN_TIME_TO_CLEAR (2 * 1000 * 1000 * 100) // time stamp counter delta beyond which the saved error identity is reset
  22. //
  23. // Default MCA bank enable mask (written to each bank's control register).
  24. //
  25. #define MCA_DEFAULT_BANK_ENABLE 0xFFFFFFFFFFFFFFFF
  26. //
  27. // MCG_CTL enable mask (written to the global control register).
  28. //
  29. #define MCA_MCGCTL_ENABLE_LOGGING 0xffffffffffffffff
  30. //
  31. // MCA architecture related definitions.
  32. //
  33. #define MCA_NUMBER_REGISTERS 4 // number of registers per bank; also the MSR stride between banks
  34. //
  35. // Bit masks for MCG_CAP register.
  36. //
  37. #define MCA_COUNT_MASK 0xFF // number of banks
  38. #define MCG_CTL_PRESENT 0x100 // control register present
  39. //
  40. // Bit masks for MCG_STATUS register.
  41. //
  42. #define MCG_RESTART_RIP_VALID 0x1 // restart RIP valid
  43. #define MCG_ERROR_RIP_VALID 0x2 // error RIP valid
  44. #define MCG_MC_IN_PROGRESS 0x4 // machine check in progress
  45. //
  46. // Define machine check state variables.
  47. //
  48. BOOLEAN McaBlockErrorClearing = FALSE; // TRUE while HalHandleMcheck scans the banks; bank status is then not cleared
  49. PVOID McaDeviceContext; // context passed to the registered driver callback
  50. PDRIVER_MCA_EXCEPTION_CALLBACK McaDriverExceptionCallback; // driver callback for nonrestartable machine checks
  51. KERNEL_MCA_DELIVERY McaWmiCallback; // WMI corrected MC handler
  52. BOOLEAN McaInterfaceLocked; // updated only in DBG builds to track ownership of McaMutex
  53. FAST_MUTEX McaMutex; // acquired by HalpMcaLockInterface, released by HalpMcaUnlockInterface
  54. BOOLEAN McaNoBugCheck = FALSE; // TRUE suppresses the bug check in HalHandleMcheck
  55. ULONG McaEnableCmc; // CMC polling interval in seconds, or HAL_CMC_DISABLED
  56. UCHAR McaNumberOfBanks; // bank count read from MSR_MCG_CAP by HalpMcaInit
  57. KAFFINITY McaSavedAffinity = 0; // processor of the error last tracked by HalpGetMcaLog
  58. ULONG McaSavedBankNumber = 0; // bank number of that error
  59. ULONG64 McaSavedStatus = 0; // bank status value of that error
  60. ULONG McaStatusCount = 0; // number of times that same error has been reported again
  61. ULONG64 McaSavedTimeStamp = 0; // time stamp counter value at the last HalpGetMcaLog report
  62. //
  63. // External data defined outside this file: the active processor mask and
  64. // the registry key and value names that HalpMcaInit queries.
  65. extern KAFFINITY HalpActiveProcessors;
  66. extern WCHAR rgzSessionManager[];
  67. extern WCHAR rgzEnableMCA[];
  68. extern WCHAR rgzEnableCMC[];
  69. extern WCHAR rgzNoMCABugCheck[];
  70. //
  71. // Forward declarations of routines that are defined later in this file.
  72. //
  73. VOID
  74. HalpMcaInit (
  75. VOID
  76. );
  77. VOID
  78. HalpMcaLockInterface (
  79. VOID
  80. );
  81. NTSTATUS
  82. HalpMcaReadProcessorException (
  83. IN OUT PMCA_EXCEPTION Exception,
  84. IN BOOLEAN NonRestartableOnly
  85. );
  86. NTSTATUS
  87. HalpMcaReadRegisterInterface (
  88. IN ULONG BankNumber,
  89. IN OUT PMCA_EXCEPTION Exception
  90. );
  91. VOID
  92. HalpMcaUnlockInterface (
  93. VOID
  94. );
  //
  // Assign routines to named code sections (PAGELK, PAGE, INIT). The routines
  // placed in PAGE call PAGED_CODE() on entry.
  //
  95. #pragma alloc_text(PAGELK, HalpMcaCurrentProcessorSetConfig)
  96. #pragma alloc_text(PAGE, HalpGetMcaLog)
  97. #pragma alloc_text(INIT, HalpMcaInit)
  98. #pragma alloc_text(PAGE, HalpMcaLockInterface)
  99. #pragma alloc_text(PAGE, HalpMceRegisterKernelDriver)
  100. #pragma alloc_text(PAGE, HalpMcaRegisterDriver)
  101. #pragma alloc_text(PAGE, HalpMcaUnlockInterface)
  102. #pragma alloc_text(PAGE, HalpGetMceInformation)
  103. VOID
  104. HalpMcaInit (
  105. VOID
  106. )
  107. /*++
  108. Routine Description:
  109. This routine initializes the machine check configuration for the system.
  110. Arguments:
  111. None.
  112. Return Value:
  113. None.
  114. --*/
  115. {
  116. KAFFINITY ActiveProcessors;
  117. KAFFINITY Affinity;
  118. ULONG MCAEnabled;
  119. KIRQL OldIrql;
  120. RTL_QUERY_REGISTRY_TABLE Parameters[4];
  121. //
  122. // Initialize the fast mutext that is used to synchronize access to
  123. // machine check information.
  124. //
  125. ExInitializeFastMutex(&McaMutex);
  126. //
  127. // Clear registered driver information.
  128. //
  129. McaDriverExceptionCallback = NULL;
  130. McaDeviceContext = NULL;
  131. McaWmiCallback = NULL;
  132. //
  133. // Get the machine check configuration enables from the registry.
  134. //
  135. // N.B. It is assumed that all AMD64 chip implementations support MCE
  136. // and MCA.
  137. //
  138. // N.B. MCA is enabled by default. MCA can be disabled via the registry.
  139. //
  140. ASSERT((HalpFeatureBits & HAL_MCA_PRESENT) != 0);
  141. ASSERT((HalpFeatureBits & HAL_MCE_PRESENT) != 0);
  142. RtlZeroMemory(Parameters, sizeof(Parameters));
  143. MCAEnabled = TRUE;
  144. Parameters[0].Flags = RTL_QUERY_REGISTRY_DIRECT;
  145. Parameters[0].Name = &rgzEnableMCA[0];
  146. Parameters[0].EntryContext = &MCAEnabled;
  147. Parameters[0].DefaultType = REG_DWORD;
  148. Parameters[0].DefaultData = &MCAEnabled;
  149. Parameters[0].DefaultLength = sizeof(ULONG);
  150. McaNoBugCheck = FALSE;
  151. Parameters[1].Flags = RTL_QUERY_REGISTRY_DIRECT;
  152. Parameters[1].Name = &rgzNoMCABugCheck[0];
  153. Parameters[1].EntryContext = &McaNoBugCheck;
  154. Parameters[1].DefaultType = REG_DWORD;
  155. Parameters[1].DefaultData = &McaNoBugCheck;
  156. Parameters[1].DefaultLength = sizeof(ULONG);
  157. McaEnableCmc = 60; // default polling interval, in seconds
  158. Parameters[2].Flags = RTL_QUERY_REGISTRY_DIRECT;
  159. Parameters[2].Name = &rgzEnableCMC[0];
  160. Parameters[2].EntryContext = &McaEnableCmc;
  161. Parameters[2].DefaultType = REG_DWORD;
  162. Parameters[2].DefaultData = &McaEnableCmc;
  163. Parameters[2].DefaultLength = sizeof(ULONG);
  164. RtlQueryRegistryValues(RTL_REGISTRY_CONTROL | RTL_REGISTRY_OPTIONAL,
  165. rgzSessionManager,
  166. &Parameters[0],
  167. NULL,
  168. NULL);
  169. //
  170. // If MCA support is enabled, then initialize the MCA configuration.
  171. // Otherwise, disable MCA and MCE support.
  172. //
  173. if (MCAEnabled == FALSE) {
  174. HalpFeatureBits &= ~(HAL_MCA_PRESENT | HAL_MCE_PRESENT);
  175. McaEnableCmc = HAL_CMC_DISABLED; // disable CMC too
  176. } else {
  177. //
  178. // Make sure the value for CMCEnabled is valid. If less than 0, set it to
  179. // 0 (disabled). If greater than 0, make sure polling isn't too frequent.
  180. //
  181. if ( (LONG)McaEnableCmc <= 0 ) {
  182. McaEnableCmc = HAL_CMC_DISABLED;
  183. } else if ( McaEnableCmc < 15 ) {
  184. McaEnableCmc = 15;
  185. }
  186. //
  187. // Read the number of banks.
  188. //
  189. McaNumberOfBanks = (UCHAR)RDMSR(MSR_MCG_CAP) & MCA_COUNT_MASK;
  190. //
  191. // Initialize the machine check configuration for each processor in
  192. // the host system.
  193. //
  194. ActiveProcessors = HalpActiveProcessors;
  195. Affinity = 1;
  196. do {
  197. if (ActiveProcessors & Affinity) {
  198. ActiveProcessors &= ~Affinity;
  199. KeSetSystemAffinityThread(Affinity);
  200. OldIrql = KfRaiseIrql(HIGH_LEVEL);
  201. HalpMcaCurrentProcessorSetConfig();
  202. KeLowerIrql(OldIrql);
  203. }
  204. Affinity <<= 1;
  205. } while (ActiveProcessors);
  206. KeRevertToUserAffinityThread();
  207. }
  208. return;
  209. }
  210. VOID
  211. HalHandleMcheck (
  212. IN PKTRAP_FRAME TrapFrame,
  213. IN PKEXCEPTION_FRAME ExceptionFrame
  214. )
  215. /*++
  216. Routine Description:
  217. This function is called by the machine check exception dispatch code to
  218. process a machine check exception.
  219. N.B. Machine check in progress is not cleared by this function. If the
  220. machine check is subsequently restartable as the result of software
  221. fixup, then machine check in progress will be cleared by the machine
  222. check exception dispatch code. This makes the window between clearing
  223. the machine check and continuing execution as small as posssible. If
  224. a machine check occurs within this window, then a recursion onto the
  225. machine check stack will occur.
  226. Arguments:
  227. TrapFrame - Supplies a pointer to the machine check trap frame.
  228. ExceptionFrame - Supplies a pointer to the machine check exception
  229. frame.
  230. Return Value:
  231. None.
  232. --*/
  233. {
  234. ERROR_SEVERITY ErrorCode;
  235. MCA_EXCEPTION Exception;
  236. NTSTATUS Status;
  237. //
  238. // Block clearing of status state and attempt to find a nonrestartable
  239. // machine check.
  240. //
  241. ASSERT((RDMSR(MSR_MCG_STATUS) & MCG_MC_IN_PROGRESS) != 0);
  242. McaBlockErrorClearing = TRUE;
  243. Exception.VersionNumber = 1;
  244. Status = HalpMcaReadProcessorException(&Exception, TRUE);
  245. //
  246. // Check if a nonrestartable machine check was found.
  247. //
  248. if (Status == STATUS_SEVERITY_ERROR) {
  249. //
  250. // A nonrestartable machine check was located. If a driver has
  251. // registered for a callback, then call the driver to see if it
  252. // can resolve the machine check.
  253. //
  254. ErrorCode = ErrorFatal;
  255. if (McaDriverExceptionCallback != NULL) {
  256. ErrorCode = McaDriverExceptionCallback(McaDeviceContext,
  257. TrapFrame,
  258. ExceptionFrame,
  259. &Exception);
  260. }
  261. //
  262. // If an uncorrected error was encountered and bug checks are not being
  263. // suppressed, then bug check the system.
  264. //
  265. if ((ErrorCode != ErrorCorrected) && (McaNoBugCheck == FALSE)) {
  266. KeBugCheckEx(MACHINE_CHECK_EXCEPTION,
  267. Exception.u.Mca.BankNumber,
  268. (ULONG64)&Exception,
  269. (ULONG64)Exception.u.Mca.Status.QuadPart >> 32,
  270. (ULONG64)Exception.u.Mca.Status.QuadPart & 0xffffffff);
  271. }
  272. }
  273. //
  274. // The machine check was either restartable or a driver was registered
  275. // and the driver was able to recover the operation. Signal the clock
  276. // routine that it should call the routine to queue a DPC to log the
  277. // machine check information.
  278. //
  279. // NOTE: This used to check for the MCA logging driver being registered.
  280. // We no longer deliver corrected machine checks to the driver. They only
  281. // go to WMI.
  282. //
  283. McaBlockErrorClearing = FALSE;
  284. if (McaWmiCallback != NULL) {
  285. HalpClockMcaQueueDpc = 1;
  286. }
  287. return;
  288. }
  289. VOID
  290. HalpMcaCurrentProcessorSetConfig (
  291. VOID
  292. )
  293. /*++
  294. Routine Description:
  295. This function sets the machine check configuration for the current
  296. processor.
  297. Arguments:
  298. None.
  299. Return Value:
  300. None.
  301. --*/
  302. {
  303. ULONG Bank;
  304. ULONG64 MciCtl;
  305. //
  306. // If MCA is enabled, then initialize the MCA control register and all
  307. // bank control registers.
  308. //
  309. if ((HalpFeatureBits & HAL_MCA_PRESENT) != 0) {
  310. //
  311. // Enable logging all errors in the global control register.
  312. //
  313. ASSERT((RDMSR(MSR_MCG_CAP) & MCG_CTL_PRESENT) != 0);
  314. WRMSR(MSR_MCG_CTL, MCA_MCGCTL_ENABLE_LOGGING);
  315. //
  316. // Enable logging all errors for each bank.
  317. //
  318. for (Bank = 0; Bank < McaNumberOfBanks; Bank += 1) {
  319. MciCtl = MCA_DEFAULT_BANK_ENABLE;
  320. //
  321. // Enable machine checks for the bank.
  322. //
  323. WRMSR(MSR_MC0_CTL + (Bank * MCA_NUMBER_REGISTERS), MciCtl);
  324. }
  325. //
  326. // Enable MCE bit in CR4.
  327. //
  328. WriteCR4(ReadCR4() | CR4_MCE);
  329. }
  330. return;
  331. }
  332. NTSTATUS
  333. HalpGetMcaLog (
  334. IN OUT PMCA_EXCEPTION Exception,
  335. IN ULONG BufferSize,
  336. OUT PULONG Length
  337. )
  338. /*++
  339. Routine Description:
  340. This function returns machine check error information for a MCA bank
  341. that contains an error.
  342. Arguments:
  343. Exception - Supplies a pointer to a machine check exception log area.
  344. BufferSize - Supplies the size of the machine check exception log area.
  345. Length - Supplies a pointer to a variable that receives the machine
  346. information log.
  347. Return Value:
  348. STATUS_SUCCESS - if the error data for an MCA bank is copied into the
  349. exception buffer and the machine check is restartable.
  350. STATUS_SEVERITY_ERROR - if the error data for an MCA bank is copied
  351. into the exception buffer and the machine check is not restartable.
  352. STATUS_NOT_FOUND - if no bank had any error information present.
  353. STATUS_INVALID_PARAMETER - if the size of the specifed buffer is not
  354. large enough or the version number is not valid. The length of the
  355. required buffer is returned.
  356. --*/
  357. {
  358. KAFFINITY ActiveProcessors;
  359. KAFFINITY Affinity;
  360. NTSTATUS Status;
  361. ULONG64 TimeStamp;
  362. PAGED_CODE();
  363. //
  364. // If MCA support is not enabled, return a failure status.
  365. //
  366. if ((HalpFeatureBits & HAL_MCA_PRESENT) == 0) {
  367. return STATUS_NOT_FOUND;
  368. }
  369. //
  370. // Don't allow the logging driver to read machine check information.
  371. // Only WMI is allowed to retrieve this information.
  372. //
  373. if ( *(PULONG)Exception != HALP_KERNEL_TOKEN ) {
  374. return STATUS_NOT_FOUND;
  375. }
  376. //
  377. // If the buffer size is not equal to the MCA exception information
  378. // record size, then return a failure status.
  379. //
  380. if (BufferSize < sizeof(MCA_EXCEPTION)) {
  381. *Length = sizeof(MCA_EXCEPTION);
  382. return STATUS_INVALID_PARAMETER;
  383. }
  384. Exception->VersionNumber = 1;
  385. //
  386. // Scan through the machine check banks on each processor until error
  387. // information is located or there are no more banks to scan.
  388. //
  389. *Length = 0;
  390. Status = STATUS_NOT_FOUND;
  391. ActiveProcessors = HalpActiveProcessors;
  392. HalpMcaLockInterface();
  393. for (Affinity = 1; ActiveProcessors; Affinity <<= 1) {
  394. if (ActiveProcessors & Affinity) {
  395. ActiveProcessors &= ~Affinity;
  396. KeSetSystemAffinityThread(Affinity);
  397. //
  398. // Attempt to find machine check error status information
  399. // from the MCA banks on the current processor.
  400. //
  401. Status = HalpMcaReadProcessorException(Exception, FALSE);
  402. //
  403. // Check to determine if any machine check information was found.
  404. //
  405. if (Status != STATUS_NOT_FOUND) {
  406. //
  407. // If the relative time between this machine check event
  408. // and the previous machine check event is greater than
  409. // the minimum time, then reset the machine check identity
  410. // information.
  411. //
  412. TimeStamp = ReadTimeStampCounter();
  413. if ((TimeStamp - McaSavedTimeStamp) > MIN_TIME_TO_CLEAR) {
  414. McaStatusCount = 0;
  415. McaSavedAffinity = Affinity;
  416. McaSavedBankNumber = Exception->u.Mca.BankNumber;
  417. McaSavedStatus = Exception->u.Mca.Status.QuadPart;
  418. }
  419. McaSavedTimeStamp = TimeStamp;
  420. //
  421. // Check to determine if the same processor is reporting
  422. // the same status.
  423. //
  424. if ((Affinity == McaSavedAffinity) &&
  425. (McaSavedBankNumber == Exception->u.Mca.BankNumber) &&
  426. (McaSavedStatus == Exception->u.Mca.Status.QuadPart)) {
  427. //
  428. // Check to determine if the same error has occurred
  429. // more than the cache flush limit. Exceeding the
  430. // cache flush limit results in a writeback invalidate
  431. // of the cache on the current processor.
  432. //
  433. McaStatusCount += 1;
  434. if (McaStatusCount >= MAX_CACHE_LIMIT) {
  435. WritebackInvalidate();
  436. }
  437. } else {
  438. //
  439. // This is the first occurrence of the error on the
  440. // current processor. Reset the machine check identity
  441. // information.
  442. //
  443. McaStatusCount = 0;
  444. McaSavedAffinity = Affinity;
  445. McaSavedBankNumber = Exception->u.Mca.BankNumber;
  446. McaSavedStatus = Exception->u.Mca.Status.QuadPart;
  447. }
  448. //
  449. // Set the the length of the information, save the time
  450. // stamp, and break out of the scan.
  451. //
  452. //
  453. *Length = sizeof(MCA_EXCEPTION);
  454. break;
  455. }
  456. }
  457. }
  458. //
  459. // Restore the thread affinity, release the fast mutex, and return
  460. // the completion status.
  461. //
  462. KeRevertToUserAffinityThread();
  463. HalpMcaUnlockInterface();
  464. return Status;
  465. }
  466. VOID
  467. HalpMcaLockInterface (
  468. VOID
  469. )
  470. /*++
  471. Routine Description:
  472. This function acquires the MCA fast mutex.
  473. N.B. This function is exported via HalQueryMcaInterface information
  474. code.
  475. Arguments:
  476. None.
  477. Return Value:
  478. None.
  479. --*/
  480. {
  481. PAGED_CODE();
  482. ExAcquireFastMutex(&McaMutex);
  483. #if DBG
  484. ASSERT(McaInterfaceLocked == FALSE);
  485. McaInterfaceLocked = TRUE;
  486. #endif
  487. }
  488. VOID
  489. HalpMcaQueueDpc (
  490. VOID
  491. )
  492. /*++
  493. Routine Description:
  494. This function is called from the timer tick to tell WMI about a corrected
  495. machine check.
  496. Arguments:
  497. None.
  498. Return Value:
  499. None.
  500. --*/
  501. {
  502. ASSERT( McaWmiCallback != NULL );
  503. McaWmiCallback( (PVOID)HALP_KERNEL_TOKEN, McaAvailable, NULL );
  504. return;
  505. }
  506. NTSTATUS
  507. HalpMcaReadProcessorException (
  508. IN OUT PMCA_EXCEPTION Exception,
  509. IN BOOLEAN NonRestartableOnly
  510. )
  511. /*++
  512. Routine Description:
  513. This function returns error information from the MCA banks on the
  514. current processor.
  515. Arguments:
  516. Exception - Supplies a pointer to a MCA exception record.
  517. NonRestartableOnly - Supplies a boolean variable that determines the type
  518. of error information returned.
  519. Return Value:
  520. STATUS_SUCCESS - if the data for the bank registers is copied into the
  521. exception buffer and the machine check is restartable.
  522. STATUS_SEVERITY_ERROR - if the data for the bank registers is copied
  523. into the exception buffer and the machine check is not restartable.
  524. STATUS_NOT_FOUND - if no bank had any error information present.
  525. --*/
  526. {
  527. ULONG Bank;
  528. NTSTATUS Status;
  529. //
  530. // Scan the MCA banks on current processor and return the exception
  531. // information for the first bank reporting an error.
  532. //
  533. for (Bank = 0; Bank < McaNumberOfBanks; Bank += 1) {
  534. Status = HalpMcaReadRegisterInterface(Bank, Exception);
  535. //
  536. // If the status is unsuccessful, then the current bank has no
  537. // error information present.
  538. //
  539. if (Status != STATUS_UNSUCCESSFUL) {
  540. //
  541. // If the status is success, then the current bank has restartable
  542. // error information. Otherwise, if the status is severity error,
  543. // then the current bank has nonrestartable error information.
  544. //
  545. if (((Status == STATUS_SUCCESS) &&
  546. (NonRestartableOnly == FALSE)) ||
  547. (Status == STATUS_SEVERITY_ERROR)) {
  548. return Status;
  549. }
  550. }
  551. }
  552. return STATUS_NOT_FOUND;
  553. }
  554. NTSTATUS
  555. HalpMcaReadRegisterInterface (
  556. IN ULONG Bank,
  557. IN OUT PMCA_EXCEPTION Exception
  558. )
  559. /*++
  560. Routine Description:
  561. This function reads the MCA registers from the current processor
  562. and returns the result in the specified exception structure.
  563. N.B. This function is exported via HalQueryMcaInterface information
  564. code.
  565. Arguments:
  566. Bank - Supplies the MCA bank to be be read.
  567. Exception - Supplies a pointer to the exception information buffer.
  568. Return Value:
  569. STATUS_SUCCESS - if the data for the specified bank registers is copied
  570. into the exception buffer and the machine check is restartable.
  571. STATUS_SEVERITY_ERROR - if the data for the specified bank registers
  572. is copied into the exception buffer and the machine check is not
  573. restartable.
  574. STATUS_UNSUCCESSFUL - if the specified bank has no error information
  575. present.
  576. STATUS_NOT_FOUND - if the specified bank number exceeds the number of
  577. MCA banks.
  578. STATUS_INVALID_PARAMETER - if the exception record is of an unknown
  579. version.
  580. --*/
  581. {
  582. ULONG BankBase = Bank * MCA_NUMBER_REGISTERS;
  583. MCI_STATS BankStatus;
  584. ULONG64 McgStatus;
  585. NTSTATUS Status;
  586. //
  587. // Check for a valid MCA register bank number.
  588. //
  589. if (Bank >= McaNumberOfBanks) {
  590. return STATUS_NOT_FOUND;
  591. }
  592. //
  593. // Check if the exception buffer specifies a correct version number.
  594. //
  595. if (Exception->VersionNumber != 1) {
  596. return STATUS_INVALID_PARAMETER;
  597. }
  598. //
  599. // Check if any errors are present for the specified bank.
  600. //
  601. BankStatus.QuadPart = RDMSR(MSR_MC0_STATUS + BankBase);
  602. if (BankStatus.MciStatus.Valid == 0) {
  603. return STATUS_UNSUCCESSFUL;
  604. }
  605. //
  606. // Set the return status to indicate whether execution can be continued.
  607. // STATUS_SUCCESS means "An exception was found, and execution can be
  608. // continued." STATUS_SEVERITY_ERROR means "An exception was found, and
  609. // execution must not be continued."
  610. //
  611. // If a machine check exception is not in progress, then execution can be
  612. // continued. This happens when polling for hardware-corrected errors
  613. // finds an error that the hardware corrected without interrupting
  614. // execution. (Note that this case also applies to an error that was fatal
  615. // to an earlier boot. The system bugchecked, and initial polling on the
  616. // reboot is now finding the error.)
  617. //
  618. // If a machine check exception is in progress, then execution can be
  619. // restarted only if the error has been corrected and the necessary
  620. // restart information is intact (restart RIP valid and processor context
  621. // not corrupt).
  622. //
  623. // This code used to check only for the restart information being valid.
  624. // These bits do indicate whether there is valid context for restarting
  625. // from an error, but there has still been an error, and unless we plan
  626. // to correct the error, we should not continue. Currently we do not do
  627. // any correction or containment in software, so all uncorrected errors
  628. // are fatal.
  629. //
  630. Status = STATUS_SUCCESS;
  631. McgStatus = RDMSR(MSR_MCG_STATUS);
  632. if ( ((McgStatus & MCG_MC_IN_PROGRESS) != 0) &&
  633. ( (BankStatus.MciStatus.UncorrectedError == 1) ||
  634. ((McgStatus & MCG_RESTART_RIP_VALID) == 0) ||
  635. (BankStatus.MciStatus.ContextCorrupt == 1) ) ) {
  636. Status = STATUS_SEVERITY_ERROR;
  637. }
  638. //
  639. // Fill in the complete exception record.
  640. //
  641. Exception->ExceptionType = HAL_MCA_RECORD;
  642. Exception->TimeStamp.QuadPart = 0;
  643. Exception->TimeStamp.LowPart = SharedUserData->SystemTime.LowPart;
  644. Exception->TimeStamp.HighPart = SharedUserData->SystemTime.High1Time;
  645. Exception->ProcessorNumber = KeGetCurrentProcessorNumber();
  646. Exception->Reserved1 = 0;
  647. Exception->u.Mca.BankNumber = (UCHAR)Bank;
  648. memset(&Exception->u.Mca.Reserved2[0], 0, sizeof(Exception->u.Mca.Reserved2));
  649. Exception->u.Mca.Status = BankStatus;
  650. Exception->u.Mca.Address.QuadPart = 0;
  651. if (BankStatus.MciStatus.AddressValid != 0) {
  652. Exception->u.Mca.Address.QuadPart = RDMSR(MSR_MC0_ADDR + BankBase);
  653. }
  654. Exception->u.Mca.Misc = RDMSR(MSR_MC0_MISC + BankBase);
  655. //
  656. // If error clearing is not blocked, then clear the machine check in the
  657. // bank status register.
  658. //
  659. if (McaBlockErrorClearing == FALSE) {
  660. WRMSR(MSR_MC0_STATUS + BankBase, 0);
  661. }
  662. //
  663. // When the valid bit of status register is cleared, a new buffered
  664. // error may be written into the bank status registers. A serializing
  665. // instruction is required to permit the update to complete.
  666. //
  667. HalpSerialize();
  668. return Status;
  669. }
  670. NTSTATUS
  671. HalpMceRegisterKernelDriver(
  672. IN PKERNEL_ERROR_HANDLER_INFO DriverInfo,
  673. IN ULONG InfoSize
  674. )
  675. /*++
  676. Routine Description:
  677. This routine is called by the kernel (via HalSetSystemInformation)
  678. to register its presence. This is mostly for WMI callbacks registration.
  679. Arguments:
  680. DriverInfo: Contains kernel info about the callbacks and associated objects.
  681. Return Value:
  682. Unless a MCA driver is already registered OR one of the two callback
  683. routines are NULL, this routine returns Success.
  684. Implementation Notes:
  685. - the current implementation assumes the kernel registers its callbacks
  686. earlier than a driver will. The current kernel registration is done by
  687. WMI and should be done at WMI-Phase 0.
  688. - the registrations do not consider if the HAL supports or not the MCA
  689. functionalities. It simply registers the callbacks if no other callback was
  690. registered before. This allows us to allow some flexibility if a machine event
  691. functionality is enabled AFTER the hal initialization (e.g. HalpGetFeatureBits())
  692. through the mean of a registry key or driver event, for example.
  693. --*/
  694. {
  695. NTSTATUS status;
  696. PAGED_CODE();
  697. if ( DriverInfo == NULL ) {
  698. return STATUS_INVALID_PARAMETER;
  699. }
  700. //
  701. // Backward compatibility only.
  702. //
  703. if ( (DriverInfo->Version != 0) &&
  704. (DriverInfo->Version > KERNEL_ERROR_HANDLER_VERSION) ) {
  705. return STATUS_REVISION_MISMATCH;
  706. }
  707. //
  708. // Register Kernel MCA notification.
  709. //
  710. status = STATUS_UNSUCCESSFUL;
  711. HalpMcaLockInterface();
  712. if ( McaWmiCallback == NULL ) {
  713. McaWmiCallback = DriverInfo->KernelMcaDelivery;
  714. status = STATUS_SUCCESS;
  715. }
  716. HalpMcaUnlockInterface();
  717. return status;
  718. } // HalpMceRegisterKernelDriver
  719. NTSTATUS
  720. HalpMcaRegisterDriver (
  721. IN PMCA_DRIVER_INFO DriverInformation
  722. )
  723. /*++
  724. Routine Description:
  725. This function to register or deregister an exception callout.
  726. It is called via the interface HalSetSystemInformation.
  727. Arguments:
  728. DriverInformation - Supplies a pointer to the driver information.
  729. Return Value:
  730. STATUS_SUCCESS is returned if the driver is registered or deregistered.
  731. Otherwise, STATUS_UNSUCCESSFUL is returned.
  732. --*/
  733. {
  734. KIRQL OldIrql;
  735. NTSTATUS Status;
  736. PAGED_CODE();
  737. //
  738. // If MCA is enabled and the driver exception callback is not NULL, then
  739. // attempt to register the driver. Otherwise, attempt to deregister the
  740. // driver.
  741. //
  742. Status = STATUS_UNSUCCESSFUL;
  743. if (((HalpFeatureBits & HAL_MCA_PRESENT) != 0) &&
  744. (DriverInformation->ExceptionCallback != NULL)) {
  745. //
  746. // If a driver is not already registered, then register the driver.
  747. //
  748. HalpMcaLockInterface();
  749. if (McaDriverExceptionCallback == NULL) {
  750. McaDriverExceptionCallback = DriverInformation->ExceptionCallback;
  751. McaDeviceContext = DriverInformation->DeviceContext;
  752. Status = STATUS_SUCCESS;
  753. }
  754. HalpMcaUnlockInterface();
  755. } else if (DriverInformation->ExceptionCallback == NULL) {
  756. //
  757. // If the driver is deregistering itself, then deregister the driver.
  758. //
  759. HalpMcaLockInterface();
  760. if (McaDeviceContext == DriverInformation->DeviceContext) {
  761. McaDriverExceptionCallback = NULL;
  762. McaDeviceContext = NULL;
  763. Status = STATUS_SUCCESS;
  764. }
  765. HalpMcaUnlockInterface();
  766. }
  767. return Status;
  768. }
  769. VOID
  770. HalpMcaUnlockInterface (
  771. VOID
  772. )
  773. /*++
  774. Routine Description:
  775. This function releases the MCA fast mutex.
  776. N.B. This function is exported via HalQueryMcaInterface information
  777. code.
  778. Arguments:
  779. None.
  780. Return Value:
  781. None.
  782. --*/
  783. {
  784. PAGED_CODE();
  785. #if DBG
  786. ASSERT(McaInterfaceLocked == TRUE);
  787. McaInterfaceLocked = FALSE;
  788. #endif
  789. ExReleaseFastMutex(&McaMutex);
  790. }
  791. NTSTATUS
  792. HalpGetMceInformation(
  793. PHAL_ERROR_INFO ErrorInfo,
  794. PULONG ErrorInfoLength
  795. )
  796. /*++
  797. Routine Description:
  798. This routine is called by HaliQuerySystemInformation for the HalErrorInformation class.
  799. Arguments:
  800. ErrorInfo : pointer to HAL_ERROR_INFO structure.
  801. ErrorInfoLength : size of the valid memory structure pointed by ErrorInfo.
  802. Return Value:
  803. STATUS_SUCCESS if successful
  804. error status otherwise
  805. --*/
  806. {
  807. NTSTATUS status;
  808. ULONG savedVersion;
  809. PAGED_CODE();
  810. ASSERT( ErrorInfo != NULL );
  811. ASSERT( *ErrorInfoLength == sizeof(HAL_ERROR_INFO) );
  812. //
  813. // Backward compatibility only.
  814. //
  815. if ( (ErrorInfo->Version == 0) || (ErrorInfo->Version > HAL_ERROR_INFO_VERSION) ) {
  816. return STATUS_REVISION_MISMATCH;
  817. }
  818. ASSERT( ErrorInfo->Version == HAL_ERROR_INFO_VERSION );
  819. //
  820. // Zero the output structure, then in the few fields that are meaningful.
  821. //
  822. savedVersion = ErrorInfo->Version;
  823. RtlZeroMemory( ErrorInfo, sizeof(HAL_ERROR_INFO) );
  824. ErrorInfo->Version = savedVersion;
  825. ErrorInfo->McaMaxSize = sizeof(MCA_EXCEPTION);
  826. ErrorInfo->CmcMaxSize = sizeof(MCA_EXCEPTION);
  827. ErrorInfo->McaPreviousEventsCount = 1; // Set to 1 to get WMI to poll immediately
  828. if ( (HalpFeatureBits & HAL_MCA_PRESENT) != 0 ) {
  829. ErrorInfo->CmcPollingInterval = McaEnableCmc;
  830. } else {
  831. ErrorInfo->CmcPollingInterval = HAL_CMC_DISABLED;
  832. }
  833. ErrorInfo->CpePollingInterval = HAL_CPE_DISABLED;
  834. ErrorInfo->McaKernelToken = HALP_KERNEL_TOKEN;
  835. ErrorInfo->CmcKernelToken = HALP_KERNEL_TOKEN;
  836. *ErrorInfoLength = sizeof(HAL_ERROR_INFO);
  837. return STATUS_SUCCESS;
  838. } // HalpGetMceInformation