/*++

Copyright (c) 1996  Microsoft Corporation

Module Name:

    chbeat.c

Abstract:

    membership state heart beat code. Tracks node availability through
    exchanging heart beat messages with nodes that are marked as alive.

Author:

    Charlie Wickham (charlwi) 05-Mar-1997

Environment:

    Kernel Mode

Revision History:

--*/

#include "precomp.h"
#pragma hdrstop
#include "chbeat.tmh"

#include "clusvmsg.h"
#include "stdio.h"

/* External */

/* Static */

//
// heart beat structures - heart beats are driven by a timer and DPC
// routine. In order to synchronize the shutdown of the DPC, we also need two
// flags, an event and a spin lock.
//
// HeartBeatTimer       - periodic timer armed by CnpStartHeartBeats and
//                        cancelled by CnpStopHeartBeats.
// HeartBeatDpc         - DPC object bound to CnpHeartBeatDpc.
// HeartBeatDpcFinished - event set by the DPC when it observes that heart
//                        beats are disabled; CnpStopHeartBeats waits on it.
// HeartBeatEnabled     - TRUE between CnpStartHeartBeats and
//                        CnpStopHeartBeats.
// HeartBeatDpcRunning  - TRUE while the DPC is executing past its initial
//                        enabled check.
// HeartBeatLock        - lock taken around reads and writes of the two
//                        flags above.
//

KTIMER HeartBeatTimer;
KDPC HeartBeatDpc;
KEVENT HeartBeatDpcFinished;
BOOLEAN HeartBeatEnabled = FALSE;
BOOLEAN HeartBeatDpcRunning = FALSE;
CN_LOCK HeartBeatLock;

//
// The text below is excluded from compilation; it is design commentary.
//
#if 0

Heart Beating Explained

ClockTicks are incremented every HEART_BEAT_PERIOD millisecs. SendTicks are the
number of ticks that go by before sending HBs.

The check for received HB msgs is done in the tick just before HB msgs are
sent. Interface Lost HB ticks are in terms of heart beat check periods and
therefore are incremented only during the check period. An interface is failed
when the number of Interface Lost HB ticks have passed and no HB message has
been received on that interface.

Likewise, Node Lost HB Ticks are in terms of heart beat check periods and are
incremented during the check period. After all interfaces have failed on a
node, Node Lost HB ticks must pass without an interface going back online
before a node down event is issued. Note that a node's comm state is set to
offline when all interfaces have failed.
#endif

//
// Default tick counts. The send interval is in heartbeat timer ticks; the
// two "lost" thresholds are compared against missed-heartbeat counters that
// are only incremented during a check pass.
//
#define CLUSNET_HEART_BEAT_SEND_TICKS           2       // every 1.2 secs
#define CLUSNET_INTERFACE_LOST_HEART_BEAT_TICKS 3       // after 3 secs
#define CLUSNET_NODE_LOST_HEART_BEAT_TICKS      6       // after 6.6 secs

ULONG HeartBeatClockTicks;      // tick counter advanced by CnpHeartBeatDpc
ULONG HeartBeatSendTicks = CLUSNET_HEART_BEAT_SEND_TICKS;
ULONG HBInterfaceLostHBTicks = CLUSNET_INTERFACE_LOST_HEART_BEAT_TICKS;
ULONG HBNodeLostHBTicks = CLUSNET_NODE_LOST_HEART_BEAT_TICKS;

//
// Unicast Heartbeat Data
//
// Even with multicast heartbeats, unicast heartbeats must be supported
// for backwards compatibility.
//

//
// This array records all the nodes that need to have a HB sent to another
// node. This array is not protected by a lock since it is only used with the
// heartbeat DPC routine.
//
// One entry is filled in by CnpSendUcastHB for each interface that gets a
// unicast heartbeat in the current send pass:
//
//     NodeId    - id of the target node
//     NetworkId - id of the network the interface is on
//     SeqNumber - the interface's SequenceToSend
//     AckNumber - the interface's LastSequenceReceived
//

typedef struct _INTERFACE_HEARTBEAT_INFO {
    CL_NODE_ID NodeId;
    CL_NETWORK_ID NetworkId;
    ULONG SeqNumber;
    ULONG AckNumber;
} INTERFACE_HEARTBEAT_INFO, *PINTERFACE_HEARTBEAT_INFO;

// Initial element count of the array, and the growth step used when full.
#define InterfaceHBInfoInitialLength            16
#define InterfaceHBInfoLengthIncrement          4

PINTERFACE_HEARTBEAT_INFO InterfaceHeartBeatInfo = NULL;
ULONG InterfaceHBInfoCount;         // running count while sending HBs
ULONG InterfaceHBInfoCurrentLength; // current length of HB info array

//
// HBTime is set in CnpStartHeartBeats to a negative (relative) due time in
// 100ns units. MAX_DPC_SKEW is therefore a positive value equal to half of
// one heartbeat period.
//
LARGE_INTEGER HBTime;               // HB time in relative sys time
#define MAX_DPC_SKEW                ( -HBTime.QuadPart / 2 )

//
// Outerscreen mask. This is set by clussvc's membership manager in user
// mode. As it changes, MM drops down the set outerscreen Ioctl to update
// clusnet's notion of this mask. Clusnet uses this mask to determine the
// validity of a received heart beat. If the sending node is not part
// of the mask, then it is sent a poison packet and the received event
// is not passed on to other consumers. If it is a legitimate PP, then
// we generate the proper event.
//
// Note: MM type definitions and macros have been moved to cnpdef.h for
// general usage.
// typedef CX_CLUSTERSCREEN CX_OUTERSCREEN; CX_OUTERSCREEN MMOuterscreen; // Multicast Heartbeat Data // typedef struct _NETWORK_MCAST_HEARTBEAT_INFO { CL_NETWORK_ID NetworkId; PCNP_MULTICAST_GROUP McastGroup; CX_HB_NODE_INFO NodeInfo[ClusterDefaultMaxNodes+ClusterMinNodeId]; CX_CLUSTERSCREEN McastTarget; } NETWORK_MCAST_HEARTBEAT_INFO, *PNETWORK_MCAST_HEARTBEAT_INFO; #define NetworkHBInfoInitialLength 4 #define NetworkHBInfoLengthIncrement 4 PNETWORK_MCAST_HEARTBEAT_INFO NetworkHeartBeatInfo = NULL; ULONG NetworkHBInfoCount; // running count while sending HBs ULONG NetworkHBInfoCurrentLength; // current length of HB info array CL_NETWORK_ID MulticastBestNetwork = ClusterAnyNetworkId; ULONG CxMulticastEpoch = 0; // // Declarations for Clussvc to Clusnet Heartbeating. // ULONG ClussvcClusnetHbTimeoutTicks = 0; ClussvcHangAction ClussvcClusnetHbTimeoutAction = ClussvcHangActionDisable; ULONG ClussvcClusnetHbTickCount = 0; BOOLEAN ClussvcTerminateStopHbs = FALSE; PIO_WORKITEM ClussvcTerminateWorkItem = NULL; // Parameters for the Clussvc to Clusnet Heartbeating bugcheck. These are // for informational purposes only and should not otherwise be used. For // instance, the process object is dereferenced immediately after the // pointer is determined. 
PEPROCESS ClussvcProcessObject = NULL;
ULONG ClussvcClusnetHbTimeoutSeconds = 0;

/* Forward */

// Module load/unload entry points.
NTSTATUS
CxInitializeHeartBeat(
    void
    );

VOID
CxUnloadHeartBeat(
    VOID
    );

// Timer DPC that drives heartbeat sending and checking.
VOID
CnpHeartBeatDpc(
    PKDPC DpcObject,
    PVOID DeferredContext,
    PVOID Arg1,
    PVOID Arg2
    );

// Node table walk callbacks passed to CnpWalkNodeTable by the DPC.
BOOLEAN
CnpWalkNodesToSendHeartBeats(
    IN  PCNP_NODE   UpdateNode,
    IN  PVOID       UpdateContext,
    IN  CN_IRQL     NodeTableIrql
    );

BOOLEAN
CnpWalkNodesToCheckForHeartBeats(
    IN  PCNP_NODE   UpdateNode,
    IN  PVOID       UpdateContext,
    IN  CN_IRQL     NodeTableIrql
    );

// Per-interface callback passed to CnpWalkInterfacesOnNode.
VOID
CnpSendHBs(
    IN  PCNP_INTERFACE   UpdateInterface
    );

NTSTATUS
CxSetOuterscreen(
    IN  ULONG Outerscreen
    );

VOID
CnpReceivePoisonPacket(
    IN  PCNP_NETWORK   Network,
    IN  CL_NODE_ID SourceNodeId,
    IN  ULONG SeqNumber
    );

VOID
CnpUpdateMulticastEpoch(
    ULONG NewEpoch
    );

// Called from CnpHeartBeatDpc on every timer tick.
VOID
CnpCheckClussvcHang(
    VOID
    );

VOID
CnpLogClussvcHangAndTerminate(
    IN PDEVICE_OBJECT DeviceObject,
    IN PVOID          Context
    );

VOID
CnpLogClussvcHang(
    IN PDEVICE_OBJECT DeviceObject,
    IN PVOID          Context
    );

/* End Forward */


#ifdef ALLOC_PRAGMA

#pragma alloc_text(INIT, CxInitializeHeartBeat)
#pragma alloc_text(PAGE, CxUnloadHeartBeat)

#endif // ALLOC_PRAGMA


NTSTATUS
CxInitializeHeartBeat(
    void
    )
/*++

Routine Description:

    Init the mechanisms used to send and monitor heart beats: allocate
    the unicast and multicast heartbeat info arrays and initialize the
    heartbeat timer, DPC, DPC-finished event and heartbeat lock.

Arguments:

    None

Return Value:

    STATUS_INSUFFICIENT_RESOURCES if allocation fails.
    STATUS_SUCCESS otherwise.
--*/ { // allocate the interface info array InterfaceHBInfoCount = 0; InterfaceHBInfoCurrentLength = InterfaceHBInfoInitialLength; if (InterfaceHBInfoCurrentLength > 0) { InterfaceHeartBeatInfo = CnAllocatePool( InterfaceHBInfoCurrentLength * sizeof(INTERFACE_HEARTBEAT_INFO) ); if (InterfaceHeartBeatInfo == NULL) { return(STATUS_INSUFFICIENT_RESOURCES); } } // allocate the network info array NetworkHBInfoCount = 0; NetworkHBInfoCurrentLength = NetworkHBInfoInitialLength; if (NetworkHBInfoCurrentLength > 0) { NetworkHeartBeatInfo = CnAllocatePool( NetworkHBInfoCurrentLength * sizeof(NETWORK_MCAST_HEARTBEAT_INFO) ); if (NetworkHeartBeatInfo == NULL) { return(STATUS_INSUFFICIENT_RESOURCES); } RtlZeroMemory( NetworkHeartBeatInfo, NetworkHBInfoCurrentLength * sizeof(NETWORK_MCAST_HEARTBEAT_INFO) ); } KeInitializeTimer( &HeartBeatTimer ); KeInitializeDpc( &HeartBeatDpc, CnpHeartBeatDpc, NULL ); KeInitializeEvent( &HeartBeatDpcFinished, SynchronizationEvent, FALSE ); CnInitializeLock( &HeartBeatLock, CNP_HBEAT_LOCK ); MEMLOG( MemLogInitHB, 0, 0 ); return(STATUS_SUCCESS); } // CxInitializeHeartBeat VOID CxUnloadHeartBeat( VOID ) /*++ Routine Description: Called during clusnet driver unload. Free any data structures allocated to send and monitor heartbeats. Arguments: None Return Value: None --*/ { PAGED_CODE(); if (InterfaceHeartBeatInfo != NULL) { CnFreePool(InterfaceHeartBeatInfo); InterfaceHeartBeatInfo = NULL; } if (NetworkHeartBeatInfo != NULL) { CnFreePool(NetworkHeartBeatInfo); NetworkHeartBeatInfo = NULL; } return; } // CxUnloadHeartBeat NTSTATUS CnpStartHeartBeats( VOID ) /*++ Routine Description: Start heart beating with the nodes that are marked alive and have an interface marked either OnlinePending or Online. 
    Pre-allocates the clussvc termination workitem, arms the periodic
    heartbeat timer and marks heart beats as enabled.

Arguments:

    None

Return Value:

    STATUS_INSUFFICIENT_RESOURCES if the workitem allocation fails
    STATUS_SUCCESS otherwise

--*/

{
    BOOLEAN TimerInserted;
    CN_IRQL OldIrql;
    ULONG period = HEART_BEAT_PERIOD;   // only used as the trace argument

    //
    // Pre-allocate a workitem in case we need an emergency
    // termination of the cluster service due to a user-mode
    // hang.
    // No need to take the lock before the allocation and
    // assignment, since below is the first place the lock
    // is acquired as the service starts.
    CnAssert(ClussvcTerminateWorkItem == NULL);
    ClussvcTerminateWorkItem = IoAllocateWorkItem(CnDeviceObject);
    if (ClussvcTerminateWorkItem == NULL) {
        CnTrace(HBEAT_EVENT, HbTraceTerminateWorkItemAlloc,
            "[HB] Failed to pre-allocate clussvc termination "
            "workitem.\n"
            );
        return(STATUS_INSUFFICIENT_RESOURCES);
    }

    CnAcquireLock( &HeartBeatLock, &OldIrql );

    //
    // Negative value == relative due time, expressed in 100ns units
    // (HEART_BEAT_PERIOD is in milliseconds).
    //
    HBTime.QuadPart = Int32x32To64( HEART_BEAT_PERIOD, -10000 );

    // NOTE(review): TimerInserted is assigned but never read.
    TimerInserted = KeSetTimerEx(&HeartBeatTimer,
                                 HBTime,
                                 HEART_BEAT_PERIOD,
                                 &HeartBeatDpc);

    HeartBeatEnabled = TRUE;
    ClussvcTerminateStopHbs = FALSE;

    CnTrace(HBEAT_EVENT, HbTraceTimerStarted,
        "[HB] Heartbeat timer started. Period = %u ms.",
        period // LOGULONG
        );

    MEMLOG( MemLogHBStarted, HEART_BEAT_PERIOD, 0 );

    CnReleaseLock( &HeartBeatLock, OldIrql );

    return(STATUS_SUCCESS);

} // CnpStartHeartBeats

VOID
CnpStopHeartBeats(
    VOID
    )
/*++

Routine Description:

    Stop heart beating with other nodes in the cluster.

    If heart beats are enabled, clears the enabled flag, cancels the
    periodic timer, and - if the DPC is marked as running - waits for
    the DPC to signal that it has finished. In all cases the
    pre-allocated clussvc termination workitem is freed if it is still
    set.

Arguments:

    None

Return Value:

    None

--*/

{
    BOOLEAN TimerCanceled;
    CN_IRQL OldIrql;
    PIO_WORKITEM FreeWorkItem = NULL;

    CnAcquireLock( &HeartBeatLock, &OldIrql );

    if (HeartBeatEnabled) {
        HeartBeatEnabled = FALSE;

        //
        // Cancel the periodic timer. Contrary to what the DDK implies,
        // this does not cancel the DPC if it is still queued from the
        // last timer expiration. It only stops the timer from firing
        // again. This is true as of 8/99. See KiTimerListExpire() in
        // ntos\ke\dpcsup.c.
        //
        TimerCanceled = KeCancelTimer( &HeartBeatTimer );

        CnTrace(HBEAT_DETAIL, HbTraceTimerCancelled,
            "[HB] Heartbeat timer cancelled: %!bool!",
            TimerCanceled // LOGBOOLEAN
            );

        MEMLOG( MemLogHBStopped, 0, 0 );

        //
        // Remove the DPC associated with the timer from the system DPC
        // queue, if it is there. This actually does nothing, because a
        // timer DPC is only inserted into the system DPC queue if it is
        // bound to a specific processor. Unbound DPCs are executed inline
        // on the current processor in the kernel's timer expiration code.
        // Note that the object for a periodic timer is reinserted into the
        // timer queue before the DPC is executed. So, it is possible for the
        // timer and the associated DPC to be queued simultaneously. This is
        // true as of 8/99. See KiTimerListExpire() in ntos\ke\dpcsup.c.
        //
        // The bottom line is that there is no safe way to synchronize with
        // the execution of a timer DPC during driver unload. All we can
        // do is ensure that the DPC handler code recognizes that it should
        // abort execution immediately and hope that it does so before the
        // driver code is unloaded. We do this by setting the HeartBeatEnabled
        // flag to False above. If our DPC code happens to be executing at
        // this point in time on another processor, as denoted by
        // HeartBeatDpcRunning, we wait for it to finish.
        //
        if ( !KeRemoveQueueDpc( &HeartBeatDpc )) {

            CnTrace(HBEAT_DETAIL, HbTraceDpcRunning,
                "[HB] DPC not removed. HeartBeatDpcRunning = %!bool!",
                HeartBeatDpcRunning // LOGBOOLEAN
                );

            MEMLOG( MemLogHBDpcRunning, HeartBeatDpcRunning, 0 );

            if ( HeartBeatDpcRunning ) {

                //
                // Drop the lock for the wait: the DPC takes the same
                // lock before it clears HeartBeatDpcRunning and sets
                // the event.
                //
                CnReleaseLock( &HeartBeatLock, OldIrql );

                CnTrace(HBEAT_DETAIL, HbWaitForDpcToFinish,
                    "can't remove DPC; waiting on DPCFinished event"
                    );

                MEMLOG( MemLogWaitForDpcFinish, 0, 0 );

                KeWaitForSingleObject(&HeartBeatDpcFinished,
                                      Executive,
                                      KernelMode,
                                      FALSE,              // not alertable
                                      NULL);              // no timeout

                //
                // NOTE(review): the event is initialized as a
                // SynchronizationEvent in CxInitializeHeartBeat, so the
                // explicit clear is presumably defensive - confirm.
                //
                KeClearEvent( &HeartBeatDpcFinished );

                CnAcquireLock( &HeartBeatLock, &OldIrql);
            }
        }

        CnTrace(HBEAT_EVENT, HbTraceTimerStopped,
            "[HB] Heartbeat timer stopped."
            );
    }

    //
    // If the pre-allocated workitem was not used, we need to
    // free it to remove the reference on the clusnet device object.
    //
    // The pointer is captured and cleared under the lock; the free
    // itself is done after the lock has been released.
    //
    FreeWorkItem = ClussvcTerminateWorkItem;
    ClussvcTerminateWorkItem = NULL;

    CnReleaseLock( &HeartBeatLock, OldIrql );

    if (FreeWorkItem != NULL) {
        IoFreeWorkItem(FreeWorkItem);
    }

    return;

} // CnpStopHeartBeats

VOID
CnpSendMcastHBCompletion(
    IN NTSTATUS  Status,
    IN ULONG     BytesSent,
    IN PVOID     Context,
    IN PVOID     Buffer
)
/*++

Routine Description:

    Called when a mcast heartbeat send request completes
    successfully or unsuccessfully. Dereferences the
    McastGroup data structure.

Arguments:

    Status - status of request (not examined by this routine)

    BytesSent - not used

    Context - points to multicast group data structure

    Buffer - not used

Return value:

    None.

--*/
{
    PCNP_MULTICAST_GROUP mcastGroup = (PCNP_MULTICAST_GROUP) Context;

    CnAssert(mcastGroup != NULL);

    // Balances the CnpReferenceMulticastGroup call made in CnpSendMcastHB
    // when the network's heartbeat entry was claimed.
    CnpDereferenceMulticastGroup(mcastGroup);

    return;

} // CnpSendMcastHBCompletion

NTSTATUS
CnpSendMcastHB(
    IN  PCNP_INTERFACE   Interface
    )
/*++

Routine Description:

    Writes multicast heartbeat data into the NetworkHeartBeatInfo
    array for target Interface.

Notes:

    Called from DPC with Network and Node locks held.
    Returns with Network and Node locks held.
--*/ { ULONG i; BOOLEAN networkConnected; // find the network info structure for this network for (i = 0; i < NetworkHBInfoCount; i++) { if (NetworkHeartBeatInfo[i].NetworkId == Interface->Network->Id) { break; } } // start a new network info structure, if necessary if (i == NetworkHBInfoCount) { // before claiming an entry in the network info array, // make sure the array is large enough if (NetworkHBInfoCount >= NetworkHBInfoCurrentLength) { // need to allocate a new network info array PNETWORK_MCAST_HEARTBEAT_INFO tempInfo = NULL; PNETWORK_MCAST_HEARTBEAT_INFO freeInfo = NULL; ULONG tempLength; tempLength = NetworkHBInfoCurrentLength + NetworkHBInfoLengthIncrement; tempInfo = CnAllocatePool( tempLength * sizeof(NETWORK_MCAST_HEARTBEAT_INFO) ); if (tempInfo == NULL) { CnTrace( HBEAT_DETAIL, HbNetInfoArrayAllocFailed, "[HB] Failed to allocate network heartbeat info " "array of length %u. Cannot schedule heartbeat " "for node %u on network %u.", tempLength, Interface->Node->Id, Interface->Network->Id ); // cannot continue. the failure to send this // heartbeat will not be fatal if we recover // quickly. if we do not recover, this node // will be poisoned, which is probably best // since it is dangerously low on nonpaged pool. return(STATUS_INSUFFICIENT_RESOURCES); } else { // the allocation was successful. establish // the new array as the heartbeat info // array. 
RtlZeroMemory( tempInfo, tempLength * sizeof(NETWORK_MCAST_HEARTBEAT_INFO) ); freeInfo = NetworkHeartBeatInfo; NetworkHeartBeatInfo = tempInfo; NetworkHBInfoCurrentLength = tempLength; if (freeInfo != NULL) { if (NetworkHBInfoCount > 0) { RtlCopyMemory( NetworkHeartBeatInfo, freeInfo, NetworkHBInfoCount * sizeof(NETWORK_MCAST_HEARTBEAT_INFO) ); } CnFreePool(freeInfo); } CnTrace( HBEAT_DETAIL, HbNetInfoArrayLengthIncreased, "[HB] Increased network heartbeat info array " "to size %u.", NetworkHBInfoCurrentLength ); } } // increment the current counter NetworkHBInfoCount++; // initialize the information for this structure RtlZeroMemory( &NetworkHeartBeatInfo[i].McastTarget, sizeof(NetworkHeartBeatInfo[i].McastTarget) ); NetworkHeartBeatInfo[i].NetworkId = Interface->Network->Id; NetworkHeartBeatInfo[i].McastGroup = Interface->Network->CurrentMcastGroup; CnpReferenceMulticastGroup(NetworkHeartBeatInfo[i].McastGroup); } networkConnected = (BOOLEAN)(!CnpIsNetworkLocalDisconn(Interface->Network)); CnTrace(HBEAT_DETAIL, HbTraceScheduleMcastHBForInterface, "[HB] Scheduling multicast HB for node %u on network %u " "(I/F state = %!ifstate!) " "(interface media connected = %!bool!).", Interface->Node->Id, // LOGULONG Interface->Network->Id, // LOGULONG Interface->State, // LOGIfState networkConnected ); // fill in the network info for this node/interface NetworkHeartBeatInfo[i].NodeInfo[Interface->Node->Id].SeqNumber = Interface->SequenceToSend; NetworkHeartBeatInfo[i].NodeInfo[Interface->Node->Id].AckNumber = Interface->LastSequenceReceived; CnpClusterScreenInsert( NetworkHeartBeatInfo[i].McastTarget.ClusterScreen, INT_NODE(Interface->Node->Id) ); return(STATUS_SUCCESS); } // CnpSendMcastHB NTSTATUS CnpSendUcastHB( IN PCNP_INTERFACE Interface ) /*++ Routine Description: Writes unicast heartbeat data into the InterfaceHeartBeatInfo array for target Interface. Notes: Called from DPC with Network and Node locks held. Returns with Network and Node locks held. 
--*/

{
    BOOLEAN networkConnected;

    // before filling an entry in the heartbeat info array,
    // make sure the array is large enough.
    if (InterfaceHBInfoCount >= InterfaceHBInfoCurrentLength) {

        // need to allocate a new heartbeat info array
        PINTERFACE_HEARTBEAT_INFO tempInfo = NULL;
        PINTERFACE_HEARTBEAT_INFO freeInfo = NULL;
        ULONG                     tempLength;

        tempLength = InterfaceHBInfoCurrentLength
            + InterfaceHBInfoLengthIncrement;
        tempInfo = CnAllocatePool(
                       tempLength * sizeof(INTERFACE_HEARTBEAT_INFO)
                       );
        if (tempInfo == NULL) {

            CnTrace(
                HBEAT_DETAIL, HbInfoArrayAllocFailed,
                "[HB] Failed to allocate heartbeat info "
                "array of length %u. Cannot schedule heartbeat "
                "for node %u on network %u.",
                tempLength,
                Interface->Node->Id,
                Interface->Network->Id
                );

            // cannot continue. the failure to send this
            // heartbeat will not be fatal if we recover
            // quickly. if we do not recover, this node
            // will be poisoned, which is probably best
            // since it is dangerously low on nonpaged pool.
            return(STATUS_INSUFFICIENT_RESOURCES);

        } else {

            // the allocation was successful. establish
            // the new array as the heartbeat info
            // array.
            //
            // the new array is not zeroed here; every field of an
            // entry is written below before the entry is counted.
            freeInfo = InterfaceHeartBeatInfo;
            InterfaceHeartBeatInfo = tempInfo;
            InterfaceHBInfoCurrentLength = tempLength;

            if (freeInfo != NULL) {

                if (InterfaceHBInfoCount > 0) {
                    RtlCopyMemory(
                        InterfaceHeartBeatInfo,
                        freeInfo,
                        InterfaceHBInfoCount * sizeof(INTERFACE_HEARTBEAT_INFO)
                        );
                }

                CnFreePool(freeInfo);
            }

            CnTrace(
                HBEAT_DETAIL, HbInfoArrayLengthIncreased,
                "[HB] Increased heartbeat info array to size %u.",
                InterfaceHBInfoCurrentLength
                );
        }
    }

    // networkConnected is only used as a trace argument
    networkConnected = (BOOLEAN)(!CnpIsNetworkLocalDisconn(Interface->Network));

    CnTrace(HBEAT_DETAIL, HbTraceScheduleHBForInterface,
        "[HB] Scheduling HB for node %u on network %u (I/F state = %!ifstate!) "
        "(interface media connected = %!bool!).",
        Interface->Node->Id, // LOGULONG
        Interface->Network->Id, // LOGULONG
        Interface->State, // LOGIfState
        networkConnected
        );

    // append an entry describing the heartbeat to send on this interface
    InterfaceHeartBeatInfo[ InterfaceHBInfoCount ].NodeId = Interface->Node->Id;
    InterfaceHeartBeatInfo[ InterfaceHBInfoCount ].SeqNumber =
        Interface->SequenceToSend;
    InterfaceHeartBeatInfo[ InterfaceHBInfoCount ].AckNumber =
        Interface->LastSequenceReceived;
    InterfaceHeartBeatInfo[ InterfaceHBInfoCount ].NetworkId = Interface->Network->Id;

    ++InterfaceHBInfoCount;

    return(STATUS_SUCCESS);

} // CnpSendUcastHB


VOID
CnpSendHBs(
    IN  PCNP_INTERFACE   Interface
    )

/*++

Routine Description:

    If Interface is in the correct state then stuff an entry in
    the heartbeat info array. Expand the heartbeat info array
    if necessary.

    Passed to CnpWalkInterfacesOnNode by CnpWalkNodesToSendHeartBeats.
    Releases the interface's network lock before returning.

Arguments:

    Interface - target interface for heartbeat message

Return Value:

    None

--*/

{
    // TRUE once a multicast heartbeat has been scheduled for an interface
    // from which a multicast has been received; suppresses the unicast.
    BOOLEAN mcastOnly = FALSE;

    if ( Interface->State >= ClusnetInterfaceStateUnreachable ) {

        // increment the sequence number
        (Interface->SequenceToSend)++;

        // check if we should include this interface in a
        // multicast heartbeat. first we verify that the
        // network is multicast capable. then, we include it
        // if either of the following conditions are true:
        // - we have received a multicast heartbeat from the
        //   target interface
        // - the discovery count (the number of discovery mcasts
        //   left to send to the target interface) is greater
        //   than zero
        if (CnpIsNetworkMulticastCapable(Interface->Network)) {

            if (CnpInterfaceQueryReceivedMulticast(Interface)) {

                // write the mcast heartbeat data. if not
                // successful, attempt a unicast heartbeat.
                if (CnpSendMcastHB(Interface) == STATUS_SUCCESS) {
                    mcastOnly = TRUE;
                }

            } else if (Interface->McastDiscoverCount > 0) {

                // write the mcast heartbeat data for a
                // discovery. if successful, decrement the
                // discovery count.
                //
                // mcastOnly stays FALSE here, so a unicast heartbeat
                // is scheduled as well.
                if (CnpSendMcastHB(Interface) == STATUS_SUCCESS) {
                    --Interface->McastDiscoverCount;

                    // if the discovery count has reached zero,
                    // set the rediscovery countdown. this is
                    // the number of heartbeat periods until we
                    // try discovery again.
                    if (Interface->McastDiscoverCount == 0) {
                        Interface->McastRediscoveryCountdown =
                            CNP_INTERFACE_MCAST_REDISCOVERY;
                    }
                }
            } else if (Interface->McastRediscoveryCountdown > 0) {

                // decrement the rediscovery countdown. if we
                // reach zero, we will start multicast discovery
                // on the next heartbeat to this interface.
                if (--Interface->McastRediscoveryCountdown == 0) {
                    Interface->McastDiscoverCount =
                        CNP_INTERFACE_MCAST_DISCOVERY;
                }
            }
        }

        // write unicast heartbeat data
        //
        // the return status is ignored; on allocation failure the
        // heartbeat for this interface is simply not scheduled.
        if (!mcastOnly) {
            CnpSendUcastHB(Interface);
        }
    }

    CnReleaseLock(&Interface->Network->Lock, Interface->Network->Irql);

    return;

} // CnpSendHBs

VOID
CnpCheckForHBs(
    IN  PCNP_INTERFACE   Interface
    )

/*++

Routine Description:

    Check if heart beats have been received for this interface

    Passed to CnpWalkInterfacesOnNode by
    CnpWalkNodesToCheckForHeartBeats. The interface's network lock is
    released before returning, either by CnpFailInterface or directly
    by this routine.

Arguments:

    Interface - interface to check

Return Value:

    None

--*/

{
    ULONG   MissedHBCount;
    BOOLEAN NetworkLockReleased = FALSE;

    if ( Interface->State >= ClusnetInterfaceStateUnreachable
         && !CnpIsNetworkLocalDisconn(Interface->Network) ) {

        // NOTE(review): a result of 1 is treated as "heartbeat seen", so
        // the receive path presumably resets MissedHBs to zero - confirm.
        MissedHBCount = InterlockedIncrement( &Interface->MissedHBs );

        if ( MissedHBCount == 1 ) {

            //
            // a HB was received in time for this node. Clear the status
            // info associated with this interface, but also mark the node
            // as having an interface that is ok. Note that we do not
            // use HBs on restricted nets to determine node health.
            //
            if (!CnpIsNetworkRestricted(Interface->Network)) {
                Interface->Node->HBWasMissed = FALSE;
            }

            CnTrace(HBEAT_DETAIL, HbTraceHBReceivedForInterface,
                "[HB] A HB was received from node %u on net %u in this "
                "period.",
                Interface->Node->Id, // LOGULONG
                Interface->Network->Id // LOGULONG
                );

        } else {

            CnTrace(HBEAT_EVENT, HbTraceMissedIfHB,
                "[HB] HB MISSED for node %u on net %u, missed count %u.",
                Interface->Node->Id, // LOGULONG
                Interface->Network->Id, // LOGULONG
                MissedHBCount // LOGULONG
                );

            MEMLOG4(
                MemLogMissedIfHB,
                (ULONG_PTR)Interface, MissedHBCount,
                Interface->Node->Id,
                Interface->Network->Id
                );

            if ( MissedHBCount >= HBInterfaceLostHBTicks
                 && Interface->State >= ClusnetInterfaceStateOnlinePending ) {

                //
                // interface is either online pending or online, so move it
                // to unreachable. CnpFailInterface will also mark the node
                // unreachable if all of the node's interfaces are unreachable.
                // CnpFailInterface releases the network object lock as part
                // of its duties.
                //
                CnTrace(HBEAT_DETAIL, HbTraceFailInterface,
                    "[HB] Moving I/F for node %u on net %u to failed state, "
                    "previous I/F state = %!ifstate!.",
                    Interface->Node->Id, // LOGULONG
                    Interface->Network->Id, // LOGULONG
                    Interface->State // LOGIfState
                    );

                //
                // continuation log entries go before the main entry since
                // we scan the log backwards, i.e., we'll hit FailingIf
                // before we hit FailingIf1.
                //
                MEMLOG4(
                    MemLogFailingIf,
                    (ULONG_PTR)Interface, Interface->State,
                    Interface->Node->Id,
                    Interface->Network->Id
                    );

                CnpFailInterface( Interface );
                NetworkLockReleased = TRUE;

                //
                // issue a net interface unreachable event to let consumers
                // know what is happening
                //
                // NOTE(review): Interface->State is read here after
                // CnpFailInterface, so the traced "previous" state may
                // already be the new state - confirm.
                //
                CnTrace(HBEAT_EVENT, HbTraceInterfaceUnreachableEvent,
                    "[HB] Issuing InterfaceUnreachable event for node %u "
                    "on net %u, previous I/F state = %!ifstate!.",
                    Interface->Node->Id, // LOGULONG
                    Interface->Network->Id, // LOGULONG
                    Interface->State // LOGIfState
                    );

                CnIssueEvent(ClusnetEventNetInterfaceUnreachable,
                             Interface->Node->Id,
                             Interface->Network->Id);
            }
        }
    }

    // release the network lock unless CnpFailInterface already did
    if ( !NetworkLockReleased ) {
        CnReleaseLock(&Interface->Network->Lock, Interface->Network->Irql);
    }

    return;

} // CnpCheckForHBs

BOOLEAN
CnpWalkNodesToSendHeartBeats(
    IN  PCNP_NODE   Node,
    IN  PVOID       UpdateContext,
    IN  CN_IRQL     NodeTableIrql
    )

/*++

Routine Description:

    Support routine called for each node in the node table. If node is
    alive, then we walk its interfaces, performing the appropriate
    action.

    Releases the node's lock before returning.

Arguments:

    Node - node to schedule heartbeats for

    UpdateContext - not used

    NodeTableIrql - not used

Return Value:

    TRUE always; the node table lock is still held on return.

--*/

{
    //
    // If this node is alive and not the local node, then walk its
    // interfaces, supplying the appropriate routine to use at this time
    //

    if ( Node->MMState == ClusnetNodeStateAlive &&
         Node != CnpLocalNode ) {

        CnTrace(HBEAT_DETAIL, HbTraceScheduleHBForNode,
            "[HB] Scheduling HBs for node %u (state = %!mmstate!).",
            Node->Id, // LOGULONG
            Node->MMState // LOGMmState
            );

        MEMLOG( MemLogSendHBWalkNode, Node->Id, Node->MMState );
        CnpWalkInterfacesOnNode( Node, (PVOID)CnpSendHBs );
    }

    CnReleaseLock( &Node->Lock, Node->Irql );

    return TRUE;       // the node table lock is still held

} // CnpWalkNodesToSendHeartBeats

BOOLEAN
CnpWalkNodesToCheckForHeartBeats(
    IN  PCNP_NODE   Node,
    IN  PVOID       UpdateContext,
    IN  CN_IRQL     NodeTableIrql
    )

/*++

Routine Description:

    heart beat checking routine called for each node in the node table
    (except for the local node). If node is alive, then we walk its
    interfaces, performing the appropriate action.
    Releases the node's lock before returning.

Arguments:

    Node - node to check for received heartbeats

    UpdateContext - not used

    NodeTableIrql - not used

Return Value:

    TRUE always; the node table lock is still held on return.

--*/

{
    BOOLEAN NodeWasReachable;
    ULONG MissedHBCount;

    if ( Node->MMState == ClusnetNodeStateAlive &&
         Node != CnpLocalNode ) {

        //
        // this node is alive, so walk its interfaces. Assume the
        // worst by setting the HB Missed flag to true and
        // have the interfaces prove that this is wrong. Also make
        // note of the current unreachable flag setting. If it changes
        // this time
        //
        // NodeWasReachable is only used as a trace/MEMLOG argument below.
        //
        NodeWasReachable = !CnpIsNodeUnreachable( Node );
        Node->HBWasMissed = TRUE;

        CnTrace(HBEAT_DETAIL, HbTraceCheckNodeForHeartbeats,
            "[HB] Checking for HBs from node %u. WasReachable = %!bool!, "
            "state = %!mmstate!.",
            Node->Id, // LOGULONG
            NodeWasReachable, // LOGBOOLEAN
            Node->MMState // LOGMmState
            );

        MEMLOG( MemLogCheckHBNodeReachable, Node->Id, NodeWasReachable );
        MEMLOG( MemLogCheckHBWalkNode, Node->Id, Node->MMState );

        // CnpCheckForHBs clears HBWasMissed if any non-restricted
        // interface saw a heartbeat in this period.
        CnpWalkInterfacesOnNode( Node, (PVOID)CnpCheckForHBs );

        if ( Node->HBWasMissed ) {

            //
            // no HBs received on any of this node's IFs. if membership
            // still thinks this node is alive and the node has been
            // unreachable, then note that this node is toast in HB
            // info array. This will cause a node down event to be
            // generated for this node.
            //
            MissedHBCount = InterlockedIncrement( &Node->MissedHBs );

            CnTrace(HBEAT_EVENT, HbTraceNodeMissedHB,
                "[HB] Node %u has missed %u HBs on all interfaces, "
                "current state = %!mmstate!.",
                Node->Id, // LOGULONG
                MissedHBCount, // LOGULONG
                Node->MMState // LOGMmState
                );

            MEMLOG( MemLogCheckHBMissedHB, MissedHBCount, Node->MMState );

            //
            // if this node is either a member or in the process of
            // joining AND it's missed too many HBs AND we haven't issued a
            // node down, then issue a node down.
            //
            if ( ( Node->MMState == ClusnetNodeStateAlive
                   ||
                   Node->MMState == ClusnetNodeStateJoining
                 )
                 && MissedHBCount >= HBNodeLostHBTicks
                 && !Node->NodeDownIssued
               )
            {
                Node->NodeDownIssued = TRUE;
                CnIssueEvent( ClusnetEventNodeDown, Node->Id, 0 );

                CnTrace(HBEAT_EVENT, HbTraceNodeDownEvent,
                    "[HB] Issuing NodeDown event for node %u.",
                    Node->Id // LOGULONG
                    );

                MEMLOG( MemLogNodeDownIssued, Node->Id, TRUE );
            }
        }
    } else {
        MEMLOG( MemLogCheckHBWalkNode, Node->Id, Node->MMState );
    }

    CnReleaseLock( &Node->Lock, Node->Irql );

    return TRUE;       // the node table lock is still held

} // CnpWalkNodesToCheckForHeartBeats

VOID
CnpHeartBeatDpc(
    PKDPC DpcObject,
    PVOID DeferredContext,
    PVOID Arg1,
    PVOID Arg2
    )

/*++

Routine Description:

    DPC routine for the heartbeat timer. Unless heart beats have been
    disabled, each invocation either sends heartbeats (building the
    multicast and unicast info arrays by walking the node table, then
    sending one message per entry), checks for missed heartbeats, or
    just advances the tick count. It then checks for a clussvc hang.

    Cooperates with CnpStopHeartBeats through HeartBeatEnabled,
    HeartBeatDpcRunning and the HeartBeatDpcFinished event.

Arguments:

    DpcObject, DeferredContext, Arg1, Arg2 - not used

Return Value:

    None

--*/

{
    PINTERFACE_HEARTBEAT_INFO     pNodeHBInfo;
    PNETWORK_MCAST_HEARTBEAT_INFO pMcastHBInfo;
    CN_IRQL                       OldIrql;
    BOOLEAN                       StopSendRecvHbs;

#ifdef MEMLOGGING
    static LARGE_INTEGER LastSysTime;
    LARGE_INTEGER CurrentTime;
    LARGE_INTEGER TimeDelta;

    //
    // try to determine the skew between when we asked to be run and
    // the time we actually did run
    //

    KeQuerySystemTime( &CurrentTime );

    if ( LastSysTime.QuadPart != 0 ) {

        //
        // add in HBTime which is negative due to relative sys time
        //

        TimeDelta.QuadPart = ( CurrentTime.QuadPart - LastSysTime.QuadPart ) +
            HBTime.QuadPart;

        // log when the skew exceeds half a heartbeat period either way
        if ( TimeDelta.QuadPart > MAX_DPC_SKEW ||
             TimeDelta.QuadPart < -MAX_DPC_SKEW
           )
        {
            LONG skew = (LONG)(TimeDelta.QuadPart/10000);  // convert to ms

            MEMLOG( MemLogDpcTimeSkew, TimeDelta.LowPart, 0 );

            CnTrace(HBEAT_EVENT, HbTraceLateDpc,
                "[HB] Timer fired %d ms late.",
                skew // LOGSLONG
                );
        }
    }

    LastSysTime.QuadPart = CurrentTime.QuadPart;

#endif // MEMLOGGING

    CnAcquireLock( &HeartBeatLock, &OldIrql );

    // heart beats were stopped: signal any waiter and do no work
    if ( !HeartBeatEnabled ) {
        CnTrace(HBEAT_DETAIL, HbTraceSetDpcEvent,
            "DPC: setting HeartBeatDpcFinished event"
            );

        MEMLOG( MemLogSetDpcEvent, 0, 0 );

        KeSetEvent( &HeartBeatDpcFinished, 0, FALSE );

        CnReleaseLock( &HeartBeatLock, OldIrql );

        return;
    }

    HeartBeatDpcRunning = TRUE;

    //
    // Check if we need to stop sending heartbeats. This
    // occurs when clusnet detects that clussvc is not
    // operating correctly. In case system work queues
    // are blocked up (but not DPCs), we stop sending
    // heartbeats so that other nodes initiate failover.
    //
    StopSendRecvHbs = ClussvcTerminateStopHbs;

    CnReleaseLock( &HeartBeatLock, OldIrql );

    if (!StopSendRecvHbs) {

        if ( HeartBeatClockTicks == 0 ||
             HeartBeatClockTicks == HeartBeatSendTicks) {

            //
            // time to send HBs. Clear the count of target interfaces
            // and walk the node table finding the nodes that are
            // marked alive.
            //

            NetworkHBInfoCount = 0;
            InterfaceHBInfoCount = 0;
            CnpWalkNodeTable( CnpWalkNodesToSendHeartBeats, NULL );

            //
            // run down the list of networks and send out any multicast
            // heartbeats.
            //
            // The post-decrement leaves the count wrapped around when
            // the loop ends; it is reset to zero before the next send
            // pass.
            //

            pMcastHBInfo = NetworkHeartBeatInfo;
            while ( NetworkHBInfoCount-- ) {

                CnTrace(
                    HBEAT_EVENT, HbTraceSendMcastHB,
                    "[HB] Sending multicast HB on net %u.\n",
                    pMcastHBInfo->NetworkId
                    );

                // McastGroup doubles as the completion context so that
                // CnpSendMcastHBCompletion can dereference it.
                CxSendMcastHeartBeatMessage(
                    pMcastHBInfo->NetworkId,
                    pMcastHBInfo->McastGroup,
                    pMcastHBInfo->McastTarget,
                    CxMulticastEpoch,
                    pMcastHBInfo->NodeInfo,
                    CnpSendMcastHBCompletion,
                    pMcastHBInfo->McastGroup
                    );

                ++pMcastHBInfo;
            }

            //
            // now run down the list of interfaces that we compiled and
            // send any unicast packets
            //

            pNodeHBInfo = InterfaceHeartBeatInfo;
            while ( InterfaceHBInfoCount-- ) {

                CnTrace(HBEAT_EVENT, HbTraceSendHB,
                    "[HB] Sending HB to node %u on net %u, seqno %u, ackno %u.",
                    pNodeHBInfo->NodeId, // LOGULONG
                    pNodeHBInfo->NetworkId, // LOGULONG
                    pNodeHBInfo->SeqNumber, // LOGULONG
                    pNodeHBInfo->AckNumber // LOGULONG
                    );

                CxSendHeartBeatMessage(pNodeHBInfo->NodeId,
                                       pNodeHBInfo->SeqNumber,
                                       pNodeHBInfo->AckNumber,
                                       pNodeHBInfo->NetworkId);

                MEMLOG( MemLogSendingHB, pNodeHBInfo->NodeId, pNodeHBInfo->NetworkId );

                ++pNodeHBInfo;
            }

            //
            // finally, up the tick count, progressing to the next potential
            // work item
            //

            HeartBeatClockTicks++;

        } else if ( HeartBeatClockTicks >= ( HeartBeatSendTicks - 1 )) {

            //
            // walk the node table looking for lack of heart beats on
            // a node's set of interfaces.
            //
            CnpWalkNodeTable( CnpWalkNodesToCheckForHeartBeats, NULL );
            // restart the cycle; the next tick is a send tick
            HeartBeatClockTicks = 0;

        } else {

            HeartBeatClockTicks++;
        }
    }

    // Check for clussvc hangs.
    CnpCheckClussvcHang();

    //
    // indicate that we're no longer running and if we're shutting down
    // then set the event that the shutdown thread is waiting on
    //

    CnAcquireLock( &HeartBeatLock, &OldIrql );
    HeartBeatDpcRunning = FALSE;

    if ( !HeartBeatEnabled ) {
        KeSetEvent( &HeartBeatDpcFinished, 0, FALSE );

        CnTrace(HBEAT_DETAIL, HbTraceSetDpcEvent2,
            "DPC: setting HeartBeatDpcFinished event (2)"
            );

        MEMLOG( MemLogSetDpcEvent, 0, 0 );
    }

    CnReleaseLock( &HeartBeatLock, OldIrql );

} // CnpHeartBeatDpc

PCNP_INTERFACE
CnpFindInterfaceLocked(
    IN  PCNP_NODE Node,
    IN  PCNP_NETWORK Network
    )

/*++

Routine Description:

    Given node and network structure pointers, find the interface
    structure. Similar to CnpFindInterface except that we're passing
    in pointers instead of IDs.
Arguments: Node - pointer to node struct that sent the packet Network - pointer to Network struct on which packet was received Return Value: Pointer to Interface on which packet was recv'd, otherwise NULL --*/ { PLIST_ENTRY IfEntry; PCNP_INTERFACE Interface; CnVerifyCpuLockMask(CNP_NODE_OBJECT_LOCK, // Required 0, // Forbidden CNP_NETWORK_OBJECT_LOCK_MAX // Maximum ); for (IfEntry = Node->InterfaceList.Flink; IfEntry != &(Node->InterfaceList); IfEntry = IfEntry->Flink ) { Interface = CONTAINING_RECORD(IfEntry, CNP_INTERFACE, NodeLinkage); if ( Interface->Network == Network ) { break; } } if ( IfEntry == &Node->InterfaceList ) { return NULL; } else { return Interface; } } // CnpFindInterfaceLocked VOID CnpReceiveHeartBeatMessage( IN PCNP_NETWORK Network, IN CL_NODE_ID SourceNodeId, IN ULONG SeqNumber, IN ULONG AckNumber, IN BOOLEAN Multicast, IN ULONG MulticastEpoch ) /*++ Routine Description: We received a heartbeat from a node on a network. Reset the missed HB count on that network's interface. Arguments: Network - pointer to network block on which the packet was received SourceNodeId - node number that issued the packet SeqNumber - sending nodes' sequence num AckNumber - last seq number sent by us that was seen at the sending node Multicast - indicates whether this heartbeat was received in a multicast MulticastEpoch - indicates multicast epoch number from heartbeat packet Return Value: None --*/ { PCNP_NODE Node; PCNP_INTERFACE Interface; CX_OUTERSCREEN CurrentOuterscreen; // // Take a snapshot of the current outerscreen so that our // information doesn't change between decisions. // CurrentOuterscreen.UlongScreen = MMOuterscreen.UlongScreen; // // we ignore all packets until we're part of the cluster // if ( !CnpClusterScreenMember( CurrentOuterscreen.ClusterScreen, INT_NODE( CnLocalNodeId ) ) ) { return; } // // We ignore multicast packets whose epoch is earlier than ours. 
// This prevents replay attacks, because the multicast key may // not have been regenerated since the last time a node joined (and // heartbeat sequence numbers were reset to one). // if (Multicast && MulticastEpoch < CxMulticastEpoch) { CnTrace(HBEAT_ERROR, HbTraceHBFromExpiredEpoch, "[HB] Discarding HB from old epoch. Source Node %u, " "Pkt Epoch %u, Current Epoch %u.", SourceNodeId, // LOGULONG MulticastEpoch, // LOGULONG CxMulticastEpoch // LOGULONG ); return; } // // convert the Node ID into a pointer and find the interface // on which the packet was received. // Node = CnpFindNode( SourceNodeId ); CnAssert( Node != NULL ); Interface = CnpFindInterfaceLocked( Node, Network ); if ( Interface == NULL ) { // // somehow this network object went away while we were // receiving some data on it. Just ignore this msg // CnTrace(HBEAT_ERROR, HbTraceHBFromUnknownNetwork, "[HB] Discarding HB from node %u on an unknown network.", Node->Id // LOGULONG ); MEMLOG( MemLogNoNetID, Node->Id, (ULONG_PTR)Network ); goto error_exit; } // // determine if this is guy is legit. If not in the outerscreen, // then send a poison packet and we're done // if ( !CnpClusterScreenMember( CurrentOuterscreen.ClusterScreen, INT_NODE( SourceNodeId ) ) ) { // // Don't bother sending poison packets on restricted networks. They // will be ignored. // if (CnpIsNetworkRestricted(Interface->Network)) { goto error_exit; } CnTrace(HBEAT_ERROR, HbTraceHBFromBanishedNode, "[HB] Discarding HB from banished node %u on net %u " "due to outerscreen %04X. Sending poison packet back.", Node->Id, // LOGULONG Interface->Network->Id, // LOGULONG CurrentOuterscreen.UlongScreen // LOGULONG ); CcmpSendPoisonPacket( Node, NULL, 0, Network, NULL); // // The node lock was released. // return; } // // Check that the incoming seq num is something we expect to // guard against replay attacks. 
// if ( SeqNumber <= Interface->LastSequenceReceived) { CnTrace( HBEAT_ERROR, HbTraceHBOutOfSequence, "[HB] Discarding HB from node %u on net %u with stale seqno %u. " "Last seqno %u. Multicast: %!bool!.", Node->Id, // LOGULONG Interface->Network->Id, // LOGULONG SeqNumber, // LOGULONG Interface->LastSequenceReceived, // LOGULONG Multicast ); MEMLOG( MemLogOutOfSequence, SourceNodeId, SeqNumber ); goto error_exit; } // Update the interface's last received seq number // which will be sent back as the ack number. Interface->LastSequenceReceived = SeqNumber; // // Compare our seq number to the ack number in the packet. // If more than two off then the source node is not recv'ing // our heartbeats, but we're receiving theirs. This network is // not usable. We ignore this msg to guarantee that we will // declare the network down if the condition persists. // // In addition, if we are sending multicast heartbeats to this // interface, revert to unicasts in case there is a multicast // problem. // if (( Interface->SequenceToSend - AckNumber ) > 2 ) { CnTrace(HBEAT_ERROR, HbTraceHBWithStaleAck, "[HB] Discarding HB from node %u with stale ackno %u. " "My seqno %u. 
Multicast: %!bool!.", Node->Id, // LOGULONG AckNumber, // LOGULONG Interface->SequenceToSend, // LOGULONG Multicast ); MEMLOG( MemLogSeqAckMismatch, (ULONG_PTR)Interface, Interface->State ); if (CnpInterfaceQueryReceivedMulticast(Interface)) { CnpInterfaceClearReceivedMulticast(Interface); Interface->McastDiscoverCount = CNP_INTERFACE_MCAST_DISCOVERY; CnpMulticastChangeNodeReachability( Network, Node, FALSE, // not reachable TRUE, // raise event NULL // OUT new mask ); } goto error_exit; } MEMLOG4( MemLogReceivedPacket, SeqNumber, AckNumber, SourceNodeId, Interface->Network->Id ); CnTrace(HBEAT_EVENT, HbTraceReceivedHBpacket, "[HB] Received HB from node %u on net %u, seqno %u, ackno %u, " "multicast: %!bool!.", SourceNodeId, // LOGULONG Interface->Network->Id, // LOGULONG SeqNumber, // LOGULONG AckNumber, // LOGULONG Multicast ); // Reset the interface's and node's Missed HB count // to indicate that things are somewhat normal. // InterlockedExchange(&Interface->MissedHBs, 0); // // Don't reset node miss count on restricted nets. // if (!CnpIsNetworkRestricted(Interface->Network)) { InterlockedExchange(&Node->MissedHBs, 0); } // // if local interface was previously disconnected (e.g. received // a WMI NDIS status media disconnect event), reconnect it now. // if (CnpIsNetworkLocalDisconn(Interface->Network)) { CxReconnectLocalInterface(Interface->Network->Id); } // // move interface to online if necessary // if ( Interface->State == ClusnetInterfaceStateOnlinePending || Interface->State == ClusnetInterfaceStateUnreachable ) { CnAcquireLockAtDpc( &Interface->Network->Lock ); Interface->Network->Irql = DISPATCH_LEVEL; CnTrace(HBEAT_DETAIL, HbTraceInterfaceOnline, "[HB] Moving interface for node %u on network %u to online " "state.", Node->Id, // LOGULONG Interface->Network->Id // LOGULONG ); // // Initiate multicast discovery. 
// Interface->McastDiscoverCount = CNP_INTERFACE_MCAST_DISCOVERY; Interface->McastRediscoveryCountdown = 0; MEMLOG( MemLogOnlineIf, Node->Id, Interface->State ); CnpOnlineInterface( Interface ); CnTrace(HBEAT_EVENT, HbTraceInterfaceUpEvent, "[HB] Issuing InterfaceUp event for node %u on network %u.", Node->Id, // LOGULONG Interface->Network->Id // LOGULONG ); CnIssueEvent(ClusnetEventNetInterfaceUp, Node->Id, Interface->Network->Id); } // // Indicate that a multicast has been received from this interface. // This allows us to include this interface in our multicasts. // if (Multicast) { IF_CNDBG(CN_DEBUG_HBEATS) { CNPRINT(("[HB] Received multicast heartbeat on " "network %d from source node %d, seq %d, " "ack %d.\n", Network->Id, SourceNodeId, SeqNumber, AckNumber )); } if (!CnpInterfaceQueryReceivedMulticast(Interface)) { CnpInterfaceSetReceivedMulticast(Interface); CnpMulticastChangeNodeReachability( Network, Node, TRUE, // reachable TRUE, // raise event NULL // OUT new mask ); } // There is no point in sending discovery packets to this // interface. Interface->McastDiscoverCount = 0; Interface->McastRediscoveryCountdown = 0; // If the source node's multicast epoch is greater than // ours, update. We can make the initial comparison without // acquiring the lock. if (MulticastEpoch > CxMulticastEpoch) { CnpUpdateMulticastEpoch(MulticastEpoch); } } CnReleaseLock( &Node->Lock, Node->Irql ); // // when the first HB is recv'ed, a node may be in either the // join or alive state (the sponser, for instance, moves from // dead to alive). We need to clear the Node down issued flag // for either case. If the MM State is joining, then a node up // event must be issued as well. Note that we ignore HBs for // node health purposes on restricted nets. 
// if ( ( (Node->MMState == ClusnetNodeStateJoining) || (Node->MMState == ClusnetNodeStateAlive) ) && Node->NodeDownIssued && !CnpIsNetworkRestricted(Interface->Network) ) { Node->NodeDownIssued = FALSE; MEMLOG( MemLogNodeDownIssued, Node->Id, FALSE ); if ( Node->MMState == ClusnetNodeStateJoining ) { CnTrace(HBEAT_EVENT, HbTraceNodeUpEvent, "[HB] Issuing NodeUp event for node %u.", Node->Id // LOGULONG ); MEMLOG( MemLogNodeUp, Node->Id, 0 ); CnIssueEvent( ClusnetEventNodeUp, Node->Id, 0 ); } } return; error_exit: CnReleaseLock( &Node->Lock, Node->Irql ); return; } // CnpReceiveHeartBeatMessage NTSTATUS CxSetOuterscreen( IN ULONG Outerscreen ) { // // based on the number of valid nodes, make sure any extranious // bits are not set // CnAssert( ClusterDefaultMaxNodes <= 32 ); CnAssert( ( Outerscreen & ( 0xFFFFFFFE << ( 32 - ClusterDefaultMaxNodes - 1 ))) == 0); IF_CNDBG( CN_DEBUG_HBEATS ) CNPRINT(("[CCMP] Setting outerscreen to %04X\n", ((Outerscreen & 0xFF)<< 8) | ((Outerscreen >> 8) & 0xFF))); MMOuterscreen.UlongScreen = Outerscreen; CnTrace(HBEAT_EVENT, HbTraceSetOuterscreen, "[HB] Setting outerscreen to %04X", Outerscreen // LOGULONG ); MEMLOG( MemLogOuterscreen, Outerscreen, 0 ); return STATUS_SUCCESS; } // CxSetOuterscreen VOID CnpTerminateClusterService( IN PVOID Parameter ) { PWORK_QUEUE_ITEM workQueueItem = Parameter; ULONG sourceNodeId = *((PULONG)(workQueueItem + 1)); WCHAR sourceNodeStringId[ 16 ]; swprintf(sourceNodeStringId, L"%u", sourceNodeId ); // // only way we can get here right now is if a poison packet was received. // CnWriteErrorLogEntry(CLNET_NODE_POISONED, STATUS_SUCCESS, NULL, 0, 1, sourceNodeStringId ); if ( ClussvcProcessHandle ) { // // there is still a race condition between the cluster service shutting // down and closing this handle and it being used here. This really // isn't a problem since the user mode portion is going away anyway. // Besides, there isn't alot we can do if this call doesn't work anyway. 
// ZwTerminateProcess( ClussvcProcessHandle, STATUS_CLUSTER_POISONED ); } CnFreePool( Parameter ); } // CnpTerminateClusterService VOID CnpReceivePoisonPacket( IN PCNP_NETWORK Network, IN CL_NODE_ID SourceNodeId, IN ULONG SeqNumber ) { PCNP_NODE Node; PCNP_INTERFACE Interface; PWORK_QUEUE_ITEM WorkItem; // // give the node and the network pointers, find the interface on which // this packet was received // Node = CnpFindNode( SourceNodeId ); if ( Node == NULL ) { CnTrace(HBEAT_ERROR, HbTraceNoPoisonFromUnknownNode, "[HB] Discarding poison packet from unknown node %u.", SourceNodeId // LOGULONG ); return; } Interface = CnpFindInterfaceLocked( Node, Network ); if ( Interface == NULL ) { // // somehow this network object went away while we were // receiving some data on it. Just ignore this msg // CnTrace(HBEAT_ERROR, HbTracePoisonFromUnknownNetwork, "[HB] Discarding poison packet from node %u on unknown network.", Node->Id // LOGULONG ); MEMLOG( MemLogNoNetID, Node->Id, (ULONG_PTR)Network ); CnReleaseLock( &Node->Lock, Node->Irql ); return; } // // Check that the incoming seq num is something we expect to // guard against replay attacks. // if ( SeqNumber <= Interface->LastSequenceReceived) { CnTrace(HBEAT_ERROR , HbTracePoisonOutOfSeq, "[HB] Discarding poison packet from node %u with stale seqno %u. " "Current seqno %u.", SourceNodeId, // LOGULONG SeqNumber, // LOGULONG Interface->LastSequenceReceived // LOGULONG ); MEMLOG( MemLogOutOfSequence, SourceNodeId, SeqNumber ); CnReleaseLock( &Node->Lock, Node->Irql ); return; } // // Ignore poison packets from restricted networks // if (CnpIsNetworkRestricted(Network)) { CnTrace(HBEAT_ERROR , HbTracePoisonFromRestrictedNet, "[HB] Discarding poison packet from node %u on restricted " "network %u.", SourceNodeId, // LOGULONG Network->Id // LOGULONG ); CnReleaseLock( &Node->Lock, Node->Irql ); return; } // // We always honor a recv'ed poison packet. 
// CnReleaseLock( &Node->Lock, Node->Irql ); CnTrace(HBEAT_EVENT, HbTracePoisonPktReceived, "[HB] Received poison packet from node %u. Halting this node.", SourceNodeId // LOGULONG ); MEMLOG( MemLogPoisonPktReceived, SourceNodeId, 0 ); CnIssueEvent( ClusnetEventPoisonPacketReceived, SourceNodeId, 0 ); // // Shutdown all cluster network processing. // CnHaltOperation(NULL); // // allocate a work queue item so we can whack the cluster service // process. allocate extra space at the end and stuff the source node ID // out there. Yes, I know it is groady... // WorkItem = CnAllocatePool( sizeof( WORK_QUEUE_ITEM ) + sizeof( CL_NODE_ID )); if ( WorkItem != NULL ) { *((PULONG)(WorkItem + 1)) = SourceNodeId; ExInitializeWorkItem( WorkItem, CnpTerminateClusterService, WorkItem ); ExQueueWorkItem( WorkItem, CriticalWorkQueue ); } return; } // CnpReceivePoisonPacket VOID CnpLogClussvcHangAndTerminate( IN PDEVICE_OBJECT DeviceObject, IN PVOID Context ) /*++ Routine Description: This routine logs an entry into system event log about clussvc hang, and terminates the clussvc process. Arguments: None Return Value: None --*/ { WCHAR myStr[40]; swprintf(myStr, L"%u", ((ClussvcClusnetHbTimeoutTicks * HEART_BEAT_PERIOD)/1000)); CnWriteErrorLogEntry( CLNET_CLUSSVC_HUNG_TERMINATE, STATUS_SUCCESS, NULL, 0, 1, myStr ); if (ClussvcProcessHandle) { ZwTerminateProcess(ClussvcProcessHandle, STATUS_CLUSTER_NODE_DOWN); } IoFreeWorkItem((PIO_WORKITEM)Context); }//CnpLogClussvcHangAndTerminate VOID CnpLogClussvcHang( IN PDEVICE_OBJECT DeviceObject, IN PVOID Context ) /*++ Routine Description: This routine logs an entry into system event log about clussvc hang. 
Arguments: None Return Value: None --*/ { WCHAR myStr[40]; swprintf(myStr, L"%u", ((ClussvcClusnetHbTimeoutTicks * HEART_BEAT_PERIOD)/1000)); CnWriteErrorLogEntry( CLNET_CLUSSVC_HUNG, STATUS_SUCCESS, NULL, 0, 1, myStr ); IoFreeWorkItem((PIO_WORKITEM)Context); }//CnpLogClussvcHang VOID CnpCheckClussvcHang( VOID ) /*++ Routine Description: Check for HB ticks from Clussvc, if not disabled, and Tick count has reached max then take appropriate action depending on the configured value. Arguments: None Return Value: None --*/ { ULONG newValue; // Check if heartbeating is disabled, then return. if((ClussvcClusnetHbTickCount == 0) || (ClussvcClusnetHbTimeoutAction == ClussvcHangActionDisable)) { return; } // Decrement the counter by 1. newValue = InterlockedDecrement(&ClussvcClusnetHbTickCount); // If this is 1->0 transition we need to do something. if(newValue != 0) return; CnTrace(HBEAT_ERROR , HbTraceClussvcHang, "[HB] Clussvc to Clusnet HB Timeout, Timeout=%u DPC ticks, Action=%u.", ClussvcClusnetHbTimeoutTicks, ClussvcClusnetHbTimeoutAction ); IF_CNDBG( CN_DEBUG_HBEATS ) { CNPRINT(( "[HB] Clussvc to Clusnet HB Timeout, Timeout=%u DPC ticks, Action=%u\n", ClussvcClusnetHbTimeoutTicks, (ULONG)ClussvcClusnetHbTimeoutAction )); } CnAssert(ClussvcClusnetHbTimeoutAction< ClussvcHangActionMax); switch(ClussvcClusnetHbTimeoutAction) { case ClussvcHangActionLog: // Just log a message and reset ClussvcClusnetHbTickCount to ClussvcClusnetHbTimeoutTicks // Use DelayedWorkQueue { PIO_WORKITEM WorkItem; WorkItem = IoAllocateWorkItem(CnDeviceObject); if ( WorkItem != NULL ) { IoQueueWorkItem( WorkItem, CnpLogClussvcHang, DelayedWorkQueue, (PVOID)WorkItem ); } InterlockedExchange(&ClussvcClusnetHbTickCount, ClussvcClusnetHbTimeoutTicks); } break; case ClussvcHangActionBugCheckMachine: // Bugcheck the machine. { KeBugCheckEx( USER_MODE_HEALTH_MONITOR, (ULONG_PTR)((ClussvcProcessHandle != NULL) ? 
ClussvcProcessObject : NULL), (ULONG_PTR)(ClussvcClusnetHbTimeoutSeconds), 0, 0 ); } break; case ClussvcHangActionTerminateService: default: // Terminate Cluster Service. Handling is similar to the case as if clusnet has // received a poison packet. Using Critical work queue. { KIRQL irql; // If we have already run through this terminate path, // then we do not do it again. The workitem will already // be on the critical work queue (even if it has not yet // executed). CnAcquireLock(&HeartBeatLock, &irql); if (ClussvcTerminateWorkItem != NULL) { PIO_WORKITEM WorkItem; // Swap out the workitem. WorkItem = ClussvcTerminateWorkItem; ClussvcTerminateWorkItem = NULL; // Stop outgoing heartbeats. ClussvcTerminateStopHbs = TRUE; CnReleaseLock(&HeartBeatLock, irql); // Issue halt event so clusdisk stops reservations. CnIssueEvent(ClusnetEventHalt, 0, 0); // Stop normal clusnet activity. CnHaltOperation(NULL); // Queue the critical workitem to terminate the // service process. IoQueueWorkItem( WorkItem, CnpLogClussvcHangAndTerminate, CriticalWorkQueue, (PVOID)WorkItem ); } else { CnReleaseLock(&HeartBeatLock, irql); } } break; } }//CnpCheckClussvcHang VOID CnpWalkInterfacesAfterRegroup( IN PCNP_INTERFACE Interface ) /*++ Routine Description: Reset counters for each interface after a regroup Arguments: None Return Value: None --*/ { InterlockedExchange(&Interface->MissedHBs, 0); CnReleaseLock(&Interface->Network->Lock, Interface->Network->Irql); } // CnpWalkInterfacesAfterRegroup BOOLEAN CnpWalkNodesAfterRegroup( IN PCNP_NODE Node, IN PVOID UpdateContext, IN CN_IRQL NodeTableIrql ) /*++ Routine Description: Called for each node in the node table. Regroup has finished so we clear the node's missed Heart beat count and its node down issued flag. No node should be unreachable at this point. If we find one, kick off another regroup. Arguments: standard... 
Return Value: None --*/ { // // check for inconsistent settings of Comm and MM state // if ( ( Node->MMState == ClusnetNodeStateAlive || Node->MMState == ClusnetNodeStateJoining ) && Node->CommState == ClusnetNodeCommStateUnreachable ) { CnTrace(HBEAT_EVENT, HbTraceNodeDownEvent2, "[HB] Issuing NodeDown event for node %u.", Node->Id // LOGULONG ); MEMLOG( MemLogInconsistentStates, Node->Id, Node->MMState ); CnIssueEvent( ClusnetEventNodeDown, Node->Id, 0 ); } CnpWalkInterfacesOnNode( Node, (PVOID)CnpWalkInterfacesAfterRegroup ); InterlockedExchange(&Node->MissedHBs, 0); // // clear this only for nodes in the alive state. Once a node is marked // dead, the flag is re-init'ed to true (this is used during a join to // issue only one node up event). // if ( Node->MMState == ClusnetNodeStateAlive ) { Node->NodeDownIssued = FALSE; MEMLOG( MemLogNodeDownIssued, Node->Id, FALSE ); } CnReleaseLock( &Node->Lock, Node->Irql ); return TRUE; // the node table lock is still held } // CnpWalkNodesAfterRegroup VOID CxRegroupFinished( ULONG NewEventEpoch, ULONG NewRegroupEpoch ) /*++ Routine Description: called when regroup has finished. Walk the node list and perform the cleanup in the walk routine. Arguments: None Return Value: None --*/ { MEMLOG( MemLogRegroupFinished, NewEventEpoch, 0 ); CnTrace(HBEAT_EVENT, HbTraceRegroupFinished, "[HB] Regroup finished, new event epoch = %u, " "new regroup epoch = %u.", NewEventEpoch, // LOGULONG NewRegroupEpoch // LOGULONG ); CnAssert( NewEventEpoch >= EventEpoch ); EventEpoch = NewEventEpoch; if (NewRegroupEpoch > CxMulticastEpoch) { CnpUpdateMulticastEpoch(NewRegroupEpoch); } CnpWalkNodeTable( CnpWalkNodesAfterRegroup, NULL ); } // CxRegroupFinished VOID CnpUpdateMulticastEpoch( ULONG NewEpoch ) /*++ Routine Description: The Multicast Epoch must be monotonically increasing and agreed upon by all nodes. It is based on the regroup epoch (not to be confused with the ClusNet event epoch, which is local to each node). 
It is conceivable for a stale regroup epoch update to occur; thus, only update if the new value is greater than the current value. Arguments: NewEpoch - new epoch number Return value: None --*/ { KIRQL irql; CnAcquireLock(&HeartBeatLock, &irql); if (NewEpoch > CxMulticastEpoch) { CnTrace(HBEAT_EVENT, HbTraceUpdateMulticastEpoch, "[HB] Updating multicast epoch from %u to %u.", CxMulticastEpoch, NewEpoch ); CxMulticastEpoch = NewEpoch; } CnReleaseLock(&HeartBeatLock, irql); } // CnpUpdateMulticastEpoch /* end chbeat.c */