|
|
/*++
Copyright (c) 1996-1999 Microsoft Corporation
Module Name:
member.c
Abstract:
Cluster membership management routines for the Node Manager.
Author:
Mike Massa (mikemas) 12-Mar-1996
Revision History:
--*/
#include "nmp.h"
#include <clusrtl.h>
//
// Data
//
//
// TRUE once NmpMembershipInit has successfully initialized the membership
// engine; reset to FALSE by NmpMembershipShutdown.
//
BOOLEAN     NmpMembershipCleanupOk = FALSE;

//
// Set of nodes currently considered up. Nodes are added by NmpMarkNodeUp
// and removed by NmpMultiNodeDownEventHandler.
//
BITSET      NmpUpNodeSet = 0;

//
// Entries for threads waiting to be told about a leader change. The list
// head is initialized in NmpMembershipInit and drained when a new leader
// is elected in NmpMultiNodeDownEventHandler.
//
LIST_ENTRY  NmpLeaderChangeWaitList = {NULL, NULL};
//
// Routines
//
VOID
NmpMarkNodeUp(
    CL_NODE_ID  NodeId
    )
/*++

Routine Description:

    Records the specified node as a member of the up node set.

Arguments:

    NodeId - ID of the node to add to NmpUpNodeSet.

Return Value:

    None.

Notes:

    Called with the NmpLock held.

--*/
{
    BitsetAdd(NmpUpNodeSet, NodeId);

}  // NmpMarkNodeUp
VOID
NmpNodeUpEventHandler(
    IN PNM_NODE  Node
    )
/*++

Routine Description:

    Handles a node up notification. The node is always added to the up
    node set. If the node is a remote node that is in the joining state,
    the join bookkeeping is updated to note that the joiner is up.

Arguments:

    Node - The node that has come up.

Return Value:

    None.

Notes:

    Called with the NmpLock held.

--*/
{
    NmpMarkNodeUp(Node->NodeId);

    //
    // Nothing more to do for the local node (the join code declares it
    // up) or for a node that is not in the middle of a join.
    //
    if ((Node == NmLocalNode) || (Node->State != ClusterNodeJoining)) {
        return;
    }

    ClRtlLogPrint(LOG_UNUSUAL,
        "[NMJOIN] Joining node %1!u! is now participating in the cluster membership.\n",
        Node->NodeId
        );

    //
    // Sanity check the global join state against this joiner.
    //
    CL_ASSERT(NmpJoinerNodeId == Node->NodeId);
    CL_ASSERT(Node->State == ClusterNodeJoining);
    CL_ASSERT(NmpJoinTimer == 0);
    CL_ASSERT(NmpJoinAbortPending == FALSE);
    CL_ASSERT(NmpJoinerUp == FALSE);

    NmpJoinerUp = TRUE;

    return;

}  // NmpNodeUpEventHandler
VOID
NmpNodeDownEventHandler(
    IN PNM_NODE  Node
    )
/*++

Routine Description:

    Handles the failure of a single node by handing a one-member node
    set to the multi-node down handler. The status returned by that
    handler is not examined.

Arguments:

    Node - The node that has gone down.

Return Value:

    None.

--*/
{
    BITSET  singleNodeSet = BitsetFromUnit(Node->NodeId);

    NmpMultiNodeDownEventHandler(singleNodeSet);

}  // NmpNodeDownEventHandler
DWORD
NmpMultiNodeDownEventHandler(
    IN BITSET  DownedNodeSet
    )
/*++

Routine Description:

    Processes the failure of one or more nodes. In order, this routine:

        - removes the downed nodes from the up node set
        - aborts, or marks as out of synch, any join in progress
        - elects a new leader if the current leader went down and
          wakes all threads queued on NmpLeaderChangeWaitList
        - marks each downed node as down and takes its communication
          offline
        - posts the extended and per-node node down events
        - terminates RPCs to, and updates network connectivity for,
          each downed node
        - if this node became the leader, recomputes network and
          interface states

Arguments:

    DownedNodeSet - Set of IDs of the nodes that went down. This routine
                    works on its own copy of the set; the ID of an
                    aborted joiner may be added to it when the sponsor
                    has failed.

Return Value:

    ERROR_SUCCESS always.

Notes:

    Acquires the NmpLock for the duration of the processing and releases
    it before returning.

--*/
{
    CL_NODE_ID                    i;
    PNM_NODE                      node;
    DWORD                         status;
    BOOLEAN                       iAmNewLeader = FALSE;
    PNM_LEADER_CHANGE_WAIT_ENTRY  waitEntry;
    PLIST_ENTRY                   listEntry;


    ClRtlLogPrint(LOG_NOISE,
        "[NM] Down node set: %1!04X!.\n",
        DownedNodeSet
        );

    NmpAcquireLock();

    //
    // Compute the new up node set
    //
    BitsetSubtract(NmpUpNodeSet, DownedNodeSet);

    ClRtlLogPrint(LOG_NOISE,
        "[NM] New up node set: %1!04X!.\n",
        NmpUpNodeSet
        );

    //
    // Check for failure of a joining node.
    //
    if (NmpJoinerNodeId != ClusterInvalidNodeId) {

        if (NmpJoinerNodeId == NmLocalNodeId) {
            //
            // The joining node is the local node. Halt.
            //
            ClRtlLogPrint(LOG_NOISE,
                "[NMJOIN] Aborting join because of change in membership.\n"
                );
            CsInconsistencyHalt(ERROR_CLUSTER_JOIN_ABORTED);
        }
        else if ( (BitsetIsMember(NmpJoinerNodeId, DownedNodeSet))
                  ||
                  ( (BitsetIsMember(NmpSponsorNodeId, DownedNodeSet)) &&
                    (!BitsetIsMember(NmpJoinerNodeId, NmpUpNodeSet))
                  )
                )
        {
            //
            // The joining node is down or the sponsor is down and the joiner
            // is not yet an active member. Cleanup the join state. If the
            // sponsor is down and the joiner is an active member, we will
            // clean up when we detect that the joiner has perished.
            //
            // Note that "active member" is tested against NmpUpNodeSet.
            // Testing the joiner against DownedNodeSet here would abort
            // the join, and force the joiner down, whenever the sponsor
            // failed, even if the joiner was already an active member.
            //
            ClRtlLogPrint(LOG_NOISE,
                "[NMJOIN] Aborting join of node %1!u! sponsored by node %2!u!\n",
                NmpJoinerNodeId,
                NmpSponsorNodeId
                );

            //
            // Reset joiner state if sponsor died
            //
            if (BitsetIsMember(NmpSponsorNodeId, DownedNodeSet)) {
                node = NmpIdArray[NmpJoinerNodeId];
                node->State = ClusterNodeDown;

                // [GorN 4/3/2000]
                // Without a node down, cluadmin won't refresh the state.
                // If this code is to be changed to emit CLUSTER_NODE_CHANGE_EVENT or
                // some other event, NmpUpdateJoinAbort has to be changed as well,
                // so that we will have the same join cleanup behavior
                BitsetAdd(DownedNodeSet, NmpJoinerNodeId);
            }

            NmpJoinerNodeId = ClusterInvalidNodeId;
            NmpSponsorNodeId = ClusterInvalidNodeId;
            NmpJoinTimer = 0;
            NmpJoinAbortPending = FALSE;
            NmpJoinSequence = 0;
            NmpJoinerUp = FALSE;
            NmpJoinerOutOfSynch = FALSE;
        }
        else {
            //
            // Mark that the joiner is out of synch with the cluster
            // state. The sponsor will eventually abort the join.
            //
            ClRtlLogPrint(LOG_NOISE,
                "[NMJOIN] Joiner node %1!u! is now out of synch with the cluster state.\n",
                NmpJoinerNodeId
                );
            NmpJoinerOutOfSynch = TRUE;
        }
    }

    //
    // Check if the leader node went down
    //
    if (BitsetIsMember(NmpLeaderNodeId, DownedNodeSet)) {
        BOOL  isEventSet;

        //
        // Elect a new leader - active node with the smallest ID.
        //
        for (i = ClusterMinNodeId; i <= NmMaxNodeId; i++) {
            if (BitsetIsMember(i, NmpUpNodeSet)) {
                NmpLeaderNodeId = i;
                break;
            }
        }

        //
        // The loop must have found an up node.
        //
        CL_ASSERT(i <= NmMaxNodeId);

        if (NmpLeaderNodeId == NmLocalNodeId) {
            //
            // The local node is the new leader.
            //
            ClRtlLogPrint(LOG_NOISE,
                "[NM] This node is the new leader.\n"
                );

            iAmNewLeader = TRUE;
        }
        else {
            ClRtlLogPrint(LOG_NOISE,
                "[NM] Node %1!u! is the new leader.\n",
                NmpLeaderNodeId
                );
        }

        //
        // Wake up any threads waiting for an RPC call to the leader to
        // complete.
        //
        while (!IsListEmpty(&NmpLeaderChangeWaitList)) {
            listEntry = RemoveHeadList(&NmpLeaderChangeWaitList);

            //
            // NULL out the entry's links to indicate that it has been
            // dequeued. The users of the notification feature depend
            // on this action.
            //
            listEntry->Flink = NULL;
            listEntry->Blink = NULL;

            //
            // Wake up the waiting thread. The cast relies on the list
            // linkage being the first field of the wait entry
            // (NOTE(review): confirm against the structure definition).
            //
            waitEntry = (PNM_LEADER_CHANGE_WAIT_ENTRY) listEntry;
            isEventSet = SetEvent(waitEntry->LeaderChangeEvent);
            CL_ASSERT(isEventSet != 0);
        }
    }

    //
    // First recovery pass - clean up node states and disable communication
    //
    for (i = ClusterMinNodeId; i <= NmMaxNodeId; i++) {
        node = NmpIdArray[i];

        if ( (node != NULL) && (BitsetIsMember(i, DownedNodeSet)) ) {
            node->State = ClusterNodeDown;

            status = ClusnetOfflineNodeComm(
                         NmClusnetHandle,
                         node->NodeId
                         );

            CL_ASSERT(
                (status == ERROR_SUCCESS) ||
                (status == ERROR_CLUSTER_NODE_ALREADY_DOWN)
                );
        }
    }

    //
    // Inform the rest of the service that these nodes are gone
    //
    ClusterEventEx(
        CLUSTER_EVENT_NODE_DOWN_EX,
        EP_CONTEXT_VALID,
        ULongToPtr(DownedNodeSet)
        );

    //
    // Second recovery pass - clean up network states and issue old-style
    // node down events
    //
    for (i = ClusterMinNodeId; i <= NmMaxNodeId; i++) {
        node = NmpIdArray[i];

        if ( (node != NULL) && (BitsetIsMember(i, DownedNodeSet)) ) {
            //
            // Issue an individual node down event.
            //
            ClusterEvent(CLUSTER_EVENT_NODE_DOWN, node);

            //
            // Now do Intracluster RPC cleanup...
            //
            NmpTerminateRpcsToNode(node->NodeId);

            //
            // Update the network and interface information.
            //
            NmpUpdateNetworkConnectivityForDownNode(node);

            //
            // Log an event. Only the current leader does so.
            //
            if (NmpLeaderNodeId == NmLocalNodeId) {
                LPCWSTR  nodeName = OmObjectName(node);

                CsLogEvent1(
                    LOG_UNUSUAL,
                    NM_EVENT_NODE_DOWN,
                    nodeName
                    );
            }
        }
    }

    //
    // If this node is the new leader, schedule a state computation for all
    // networks. State reports may have been received before this node
    // assumed leadership duties.
    //
    if (iAmNewLeader) {
        NmpRecomputeNT5NetworkAndInterfaceStates();
    }

    NmpReleaseLock();

    return(ERROR_SUCCESS);

}  // NmpMultiNodeDownEventHandler
DWORD
NmpNodeChange(
    IN DWORD       NodeId,
    IN NODESTATUS  NewStatus
    )
/*++

Routine Description:

    Node state change callback handed to the membership engine by
    NmpMembershipInit. Looks up the node object for NodeId and dispatches
    to the node down or node up handler.

Arguments:

    NodeId    - ID of the node whose status changed. Asserted to lie
                between ClusterMinNodeId and NmMaxNodeId.

    NewStatus - NODE_DOWN or NODE_UP.

Return Value:

    ERROR_SUCCESS always.

Notes:

    The NmpLock is held while the handler runs.

--*/
{
    PNM_NODE  node;


    CL_ASSERT(
        (NodeId >= ClusterMinNodeId) &&
        (NodeId <= NmMaxNodeId)
        );

    NmpAcquireLock();

    node = NmpIdArray[NodeId];

    CL_ASSERT(node != NULL);

    if (node == NULL) {
        //
        // No node object is registered under this ID. Nothing to do.
        //
        NmpReleaseLock();
        return(ERROR_SUCCESS);
    }

    if (NewStatus != NODE_DOWN) {
        CL_ASSERT(NewStatus == NODE_UP);
        NmpNodeUpEventHandler(node);
    }
    else {
        NmpNodeDownEventHandler(node);
    }

    NmpReleaseLock();

    return(ERROR_SUCCESS);

}  // NmpNodeChange
VOID
NmpHoldIoEventHandler(
    VOID
    )
/*++

Routine Description:

    Hold I/O callback handed to the membership engine. Logs the request.
    FmHoldIO is called only when the service is built with
    HOLD_IO_IS_SAFE_NOW defined.

Arguments:

    None.

Return Value:

    None.

--*/
{
    ClRtlLogPrint(LOG_NOISE,
        "[NM] Holding I/O.\n"
        );

#if defined(HOLD_IO_IS_SAFE_NOW)

    FmHoldIO();

#endif

}  // NmpHoldIoEventHandler
VOID
NmpResumeIoEventHandler(
    VOID
    )
/*++

Routine Description:

    Resume I/O callback handed to the membership engine. Logs the
    request. FmResumeIO is called only when the service is built with
    HOLD_IO_IS_SAFE_NOW defined.

Arguments:

    None.

Return Value:

    None.

--*/
{
    ClRtlLogPrint(LOG_NOISE,
        "[NM] Resuming I/O.\n"
        );

#if defined(HOLD_IO_IS_SAFE_NOW)

    FmResumeIO();

#endif

}  // NmpResumeIoEventHandler
BOOL
NmpCheckQuorumEventHandler(
    VOID
    )
/*++

Routine Description:

    Quorum check callback handed to the membership engine. Arbitrates
    for the quorum resource, provided the network check passes.

Arguments:

    None.

Return Value:

    The result of FmArbitrateQuorumResource if NmpCheckForNetwork
    succeeded, FALSE otherwise.

--*/
{
    BOOL  ownQuorum;


    //
    // daviddio 06/19/2000
    //
    // Before asking FM to arbitrate, determine if we have any
    // viable network interfaces. If not, return failure to MM
    // and allow other cluster nodes to arbitrate. The SCM
    // will restart the cluster service, so that if no nodes
    // successfully arbitrate, we will get another shot.
    //
    if (!NmpCheckForNetwork()) {
        ClRtlLogPrint(LOG_CRITICAL,
            "[NM] Abdicating quorum because no valid network "
            "interfaces were detected.\n"
            );
        return(FALSE);
    }

    ClRtlLogPrint(LOG_NOISE,
        "[NM] Checking if we own the quorum resource.\n"
        );

    ownQuorum = FmArbitrateQuorumResource();

    if (!ownQuorum) {
        ClRtlLogPrint(LOG_NOISE,
            "[NM] We do not own the quorum resource, status %1!u!.\n",
            GetLastError()
            );

        //[GN] ClusnetHalt( NmClusnetHandle ); => NmpHaltEventHandler
        //
    }
    else {
        ClRtlLogPrint(LOG_NOISE,
            "[NM] We own the quorum resource.\n"
            );
    }

    return(ownQuorum);

}  // NmpCheckQuorumEventHandler
void
NmpMsgCleanup1(
    IN DWORD  DeadNodeId
    )
/*++

Routine Description:

    Phase 1 message cleanup callback handed to the membership engine.
    Only logs the ID of the dead node; no other work is done here.

Arguments:

    DeadNodeId - ID of the node that went down.

Return Value:

    None.

--*/
{
    ClRtlLogPrint(LOG_NOISE,
        "[NM] Phase 1 message cleanup - node %1!u!.\n",
        DeadNodeId
        );

}  // NmpMsgCleanup1
void
NmpMsgCleanup2(
    IN BITSET  DownedNodeSet
    )
/*++

Routine Description:

    Phase 2 message cleanup callback handed to the membership engine.
    Clears NmpCleanupIfJoinAborted when the current joiner is among the
    downed nodes, then posts a synchronous extended node down event for
    the set.

Arguments:

    DownedNodeSet - Set of IDs of the nodes that went down.

Return Value:

    None.

Notes:

    The NmpLock is held only while the join cleanup flag is examined.
    It is released before the event is posted.

--*/
{
    ClRtlLogPrint(LOG_NOISE,
        "[NM] Phase 2 message cleanup - node %1!04X!.\n",
        DownedNodeSet
        );

    NmpAcquireLock();

    if (NmpCleanupIfJoinAborted) {
        if (NmpJoinerNodeId != ClusterInvalidNodeId) {
            if (BitsetIsMember(NmpJoinerNodeId, DownedNodeSet)) {
                //
                // Since the joiner is in the DownedNodeSet mask
                // the node down will be delivered on this node by a
                // regroup engine. No need for NmpUpdateAbortJoin to
                // issue a node down.
                //
                NmpCleanupIfJoinAborted = FALSE;

                ClRtlLogPrint(LOG_NOISE,
                    "[NM] NmpCleanupIfJoinAborted is set to false. Joiner - %1!u!.\n",
                    NmpJoinerNodeId
                    );
            }
        }
    }

    NmpReleaseLock();

    //
    // Inform the rest of the service that these nodes are gone
    //
    ClusterSyncEventEx(
        CLUSTER_EVENT_NODE_DOWN_EX,
        EP_CONTEXT_VALID,
        ULongToPtr(DownedNodeSet)
        );

}  // NmpMsgCleanup2
VOID
NmpHaltEventHandler(
    IN DWORD  HaltCode
    )
/*++

Routine Description:

    Halt callback handed to the membership engine.

    If the halt code is MM_STOP_REQUESTED, the node is stopped
    gracefully: clusnet is halted, a successful termination event is
    logged, the service is announced as stopped, and the process exits
    with ERROR_SUCCESS.

    For any other halt code, clusnet is halted, the code is mapped with
    MMMapHaltCodeToDosError, and the result is passed to
    CsInconsistencyHalt.

Arguments:

    HaltCode - The halt code supplied by the membership engine.

Return Value:

    None.

--*/
{
    //
    // Do a graceful stop if we are shutting down
    //
    if (HaltCode == MM_STOP_REQUESTED) {
        DWORD  Status = ERROR_SUCCESS;

        ClRtlLogPrint(LOG_UNUSUAL,
            "[NM] Prompt shutdown is requested by a membership engine\n"
            );

        ClusnetHalt( NmClusnetHandle );

        CsLogEvent(LOG_NOISE, SERVICE_SUCCESSFUL_TERMINATION);

        CsServiceStatus.dwCurrentState = SERVICE_STOPPED;
        CsServiceStatus.dwControlsAccepted = 0;
        CsServiceStatus.dwCheckPoint = 0;
        CsServiceStatus.dwWaitHint = 0;
        CsServiceStatus.dwWin32ExitCode = Status;
        CsServiceStatus.dwServiceSpecificExitCode = Status;

        CsAnnounceServiceStatus();

        ExitProcess(Status);
    }
    else {
        ClRtlLogPrint(LOG_CRITICAL,
            "[NM] Halting this node due to membership or communications error. Halt code = %1!u!\n",
            HaltCode
            );

        ClusnetHalt( NmClusnetHandle );

        //
        // Adjust membership code to win32 error code. (If mapping exists)
        //
        HaltCode = MMMapHaltCodeToDosError( HaltCode );

        CsInconsistencyHalt(HaltCode);
    }

}  // NmpHaltEventHandler
void
NmpJoinFailed(
    void
    )
/*++

Routine Description:

    Join failure callback handed to the membership engine by
    NmpMembershipInit. Intentionally does nothing.

Arguments:

    None.

Return Value:

    None.

--*/
{
}  // NmpJoinFailed
DWORD
NmpGumUpdateHandler(
    IN DWORD  Context,
    IN BOOL   SourceNode,
    IN DWORD  BufferLength,
    IN PVOID  Buffer
    )
/*++

Routine Description:

    Handles GUM updates for membership events. Only the
    NmUpdateJoinComplete update is processed; any other update type is
    logged and discarded.

Arguments:

    Context      - Supplies the update context. This is the message type.

    SourceNode   - Supplies whether or not the update originated on this
                   node. Not used by this routine.

    BufferLength - Supplies the length of the update. Not used by this
                   routine.

    Buffer       - Supplies a pointer to the update buffer.

Return Value:

    The status of NmpUpdateJoinComplete for a join complete update.
    ERROR_SUCCESS for an unrecognized update.

--*/
{
    if (Context != NmUpdateJoinComplete) {
        ClRtlLogPrint(LOG_UNUSUAL,
            "[NM] Discarding unknown gum request %1!u!\n",
            Context
            );
        return(ERROR_SUCCESS);
    }

    return(NmpUpdateJoinComplete(Buffer));

}  // NmpGumUpdateHandler
DWORD NmpMembershipInit( VOID ) { DWORD status;
ClRtlLogPrint(LOG_NOISE,"[NM] Initializing membership...\n");
InitializeListHead(&NmpLeaderChangeWaitList);
//
// Initialize membership engine.
//
status = MMInit( NmLocalNodeId, NmMaxNodes, NmpNodeChange, NmpCheckQuorumEventHandler, NmpHoldIoEventHandler, NmpResumeIoEventHandler, NmpMsgCleanup1, NmpMsgCleanup2, NmpHaltEventHandler, NmpJoinFailed, NmpMultiNodeDownEventHandler );
if (status != MM_OK) { status = MMMapStatusToDosError(status); ClRtlLogPrint(LOG_CRITICAL, "[NM] Membership initialization failed, status %1!u!.\n", status ); return(status); }
NmpMembershipCleanupOk = TRUE;
ClRtlLogPrint(LOG_NOISE,"[NM] Membership initialization complete.\n");
return(ERROR_SUCCESS);
} // NmpMembershipInit
VOID
NmpMembershipShutdown(
    VOID
    )
/*++

Routine Description:

    Shuts down the membership engine if NmpMembershipInit previously
    initialized it successfully. Otherwise does nothing.

Arguments:

    None.

Return Value:

    None.

--*/
{
    if (!NmpMembershipCleanupOk) {
        return;
    }

    ClRtlLogPrint(LOG_NOISE,"[NM] Shutting down membership...\n");

    MMShutdown();

    NmpMembershipCleanupOk = FALSE;

    ClRtlLogPrint(LOG_NOISE,"[NM] Membership shutdown complete.\n");

    return;

}  // NmpMembershipShutdown
|