|
|
#ifdef __TANDEM
#pragma columns 79
#pragma page "srgpsm.c - T9050 - Regroup Module state machine routines"
#endif
/* @@@ START COPYRIGHT @@@
** Tandem Confidential: Need to Know only ** Copyright (c) 1995, Tandem Computers Incorporated ** Protected as an unpublished work. ** All Rights Reserved. ** ** The computer program listings, specifications, and documentation ** herein are the property of Tandem Computers Incorporated and shall ** not be reproduced, copied, disclosed, or used in whole or in part ** for any reason without the prior express written permission of ** Tandem Computers Incorporated. ** ** @@@ END COPYRIGHT @@@ **/
/*---------------------------------------------------------------------------
* This file (srgpsm.c) contains regroup state machine routines. *---------------------------------------------------------------------------*/
#ifdef __cplusplus
extern "C" { #endif /* __cplusplus */
#include <wrgp.h>
/*---------- arbitration algorithm ------------ */
/* Maximum time allowed for quorum-device arbitration, in seconds.
 * Initialized to the cluster-wide default; may be overridden by
 * configuration at startup. */
DWORD MmQuorumArbitrationTimeout = CLUSTER_QUORUM_DEFAULT_ARBITRATION_TIMEOUT; // seconds
/* Target duration for an arbitration attempt; nodes that win faster than
 * this sleep out the remainder so all attempts take roughly equal time
 * (see DiskArbitrationThread below). */
DWORD MmQuorumArbitrationEqualizer = 7; // seconds
/* Arbitration timeout converted from seconds to regroup clock ticks:
 * seconds * 1000ms / 300ms-per-tick == seconds * 100 / 30. */
#define RGP_ARBITRATION_TIMEOUT ((MmQuorumArbitrationTimeout * 100)/30) // tick == 300ms
#define AVERAGE_ARBITRATION_TIME_IN_SECONDS (MmQuorumArbitrationEqualizer)
/* Forward declarations for routines defined later in this file.
 * NOTE(review): regroup_restart() is defined below with static storage
 * class while this declaration is non-static -- confirm the
 * _priv/_resident macros reconcile the linkage. */
void enter_first_cleanup_stage(); void regroup_restart(); int ClusterEmpty(cluster_t c);
/* Thread routine that runs the quorum-device arbitration (defined below). */
DWORD DiskArbitrationThread( IN LPVOID param ) ;
/************************************************************************
 * regroup_test_arbitrate_advance
 * ==============================
 *
 * Description:
 *
 *    Decides whether the arbitration stage can be left behind.
 *
 * Parameters:
 *
 *    None
 *
 * Returns:
 *
 *    int - nonzero when advancement is allowed: either no node was
 *          pruned from the original membership (so no arbitration was
 *          needed), or at least one member of the surviving group has
 *          already reached stage 4, which proves our group owns the
 *          quorum.
 *
 ************************************************************************/
_priv _resident static int regroup_test_arbitrate_advance()
{
   cluster_t stage4_survivors;
   int nodes_before = ClusterNumMembers(rgp->rgpinfo.cluster);
   int nodes_after  = ClusterNumMembers(rgp->rgppkt.pruning_result);

   if (nodes_before != nodes_after)
   {
      //
      // Some nodes were pruned. Advance only if somebody in the
      // surviving group has already entered stage4 (i.e. our group
      // owns the quorum).
      //
      ClusterIntersection( stage4_survivors,
                           rgp->rgppkt.knownstage4,
                           rgp->rgppkt.pruning_result );
      return ClusterNumMembers(stage4_survivors) != 0;
   }

   /* Full membership survived -- nothing to arbitrate. */
   return 1;
}
/************************************************************************
 * regroup_start_arbitrate
 * =======================
 *
 * Description:
 *
 *    Called when stage 4 is reached. If no node was pruned from the
 *    original membership, arbitration is unnecessary and we advance
 *    directly to the first cleanup stage. Otherwise a single node is
 *    delegated to arbitrate for the quorum device: a surviving quorum
 *    owner if one exists, else the lowest-id survivor. Only the
 *    delegated node spawns DiskArbitrationThread; all others just wait
 *    in this stage.
 *
 * Parameters:
 *
 *    None
 *
 * Returns:
 *
 *    int - 0 when no arbitration is needed (we already advanced to
 *          cleanup); nonzero to remain in this stage while arbitration
 *          proceeds. The FALSE on the thread-creation failure path is
 *          unreachable in practice because RGP_ERROR halts the node.
 *
 ************************************************************************/
_priv _resident static int regroup_start_arbitrate() { int orig_numnodes = ClusterNumMembers(rgp->rgpinfo.cluster); int current_numnodes = ClusterNumMembers(rgp->rgppkt.pruning_result);
if( orig_numnodes == current_numnodes ) { enter_first_cleanup_stage(); return 0; // No Arbitration needed. Proceed to clean up stage //
} else { cluster_t arbitrators; int n_arbitrators; node_t arbitrator; HANDLE thread; DWORD threadId; ULONG epoch;
/* Capture the current event epoch and claim the arbitration under the
 * regroup lock; a second call in the same incident is a no-op. */
RGP_LOCK;
epoch = rgp->OS_specific_control.EventEpoch;
if(rgp->arbitration_started) { RGP_UNLOCK; return 1; // stay in this stage for awhile
}
rgp->arbitration_ticks = 0; rgp->arbitration_started = 1;
RGP_UNLOCK;
/* Which surviving nodes currently claim quorum ownership? */
ClusterIntersection( arbitrators, rgp->rgppkt.pruning_result, rgp->rgppkt.quorumowner );
n_arbitrators = ClusterNumMembers(arbitrators);
if(n_arbitrators == 0) { //
// If there are no quorum owners in this group //
// Let's take the guy with the lowest id //
//
arbitrator = rgp_select_tiebreaker(rgp->rgppkt.pruning_result); } else { //
// Otherwise we will take the quorum owner guy
// with the lowest id
//
arbitrator = rgp_select_tiebreaker(arbitrators);
if(n_arbitrators > 1) { RGP_TRACE( "RGP !!! More than one quorum owner", EXT_NODE(arbitrator), /* TRACE */ GetCluster( rgp->rgpinfo.cluster ), /* TRACE */ GetCluster( rgp->rgppkt.pruning_result ),/* TRACE */ GetCluster( rgp->rgppkt.knownstage2 ) ); /* TRACE */ // Do we need to kill all other arbitrators?
// No.
// ClusterDelete(arbitrators, arbitrator);
// ClusterUnion(
// rgp->poison_targets,
// rgp->poison_targets,
// arbitrators
// );
// rgp_broadcast(RGP_UNACK_POISON);
} }
rgp->tiebreaker = arbitrator;
//
// Now we have an arbitrating node
// We will run a thread that will run arbitration algorithm
//
RGP_TRACE( "RGP Arbitration Delegated to", EXT_NODE(arbitrator), /* TRACE */ GetCluster( rgp->rgpinfo.cluster ), /* TRACE */ GetCluster( rgp->rgppkt.pruning_result ), /* TRACE */ GetCluster( rgp->rgppkt.knownstage2 ) ); /* TRACE */
// Fix Bug #460991
// regroup_restart on stage 4 or later will reset ArbitratingNode
// and if all the nodes are present after restart ApproxArbitrationWinner
// will be not set properly. Assign it here.
/* Non-arbitrator nodes record the delegate and simply wait in this
 * stage; only the arbitrator itself continues to spawn the thread. */
rgp->OS_specific_control.ApproxArbitrationWinner = rgp->OS_specific_control.ArbitratingNode = (DWORD)EXT_NODE(arbitrator); if(arbitrator != rgp->mynode) { return 1; }
/* The captured epoch is passed to the thread so it can detect that a
 * newer regroup incident has restarted and abandon its work. */
thread = CreateThread( NULL, // security attributes
0, // stack_size = default
DiskArbitrationThread, ULongToPtr(epoch), 0, // runs immediately
&threadId ); if(thread == NULL) { //
// Force Others to regroup //
//
RGP_LOCK;
rgp_event_handler( RGP_EVT_BANISH_NODE, EXT_NODE(rgp->mynode) );
RGP_UNLOCK;
//
// Kill this node
//
RGP_ERROR(RGP_ARBITRATION_FAILED);
return FALSE; }
CloseHandle(thread); } return TRUE; }
/************************************************************************
 * DiskArbitrationThread
 * =====================
 *
 * Description:
 *
 *    Worker thread that arbitrates for the quorum device on behalf of
 *    this node. 'param' carries the event epoch captured when the
 *    thread was created; the epoch is re-checked at every step so the
 *    thread abandons its work if a new regroup incident has restarted
 *    in the meantime. On success it advances the node to the first
 *    cleanup stage; on failure it banishes and halts this node.
 *
 * Parameters:
 *
 *    param - starting EventEpoch, smuggled through the LPVOID.
 *
 * Returns:
 *
 *    DWORD - always 0 (thread exit code).
 *
 ************************************************************************/
DWORD DiskArbitrationThread( IN LPVOID param ) { cluster_t current_participants; DWORD status; int participant_count; int delay; ULONG_PTR startingEpoch = (ULONG_PTR) param; BOOL EpochsEqual; int orig_numnodes; int current_numnodes; LONGLONG Time1, Time2; ClusterCopy(current_participants, rgp->rgppkt.pruning_result); orig_numnodes = ClusterNumMembers(rgp->rgpinfo.cluster); current_numnodes = ClusterNumMembers(current_participants);
RGP_LOCK;
EpochsEqual = ( startingEpoch == rgp->OS_specific_control.EventEpoch );
RGP_UNLOCK;
if(!EpochsEqual) return 0;
/* Handicap delay: the fewer survivors this group has relative to half
 * of the original membership, the longer it waits before arbitrating,
 * giving larger groups first crack at the quorum device. Groups with
 * at least half the nodes start immediately (delay clamps to 0). */
delay = (orig_numnodes+1)/2 - current_numnodes;
if(delay < 0) delay = 0;
Sleep(delay * 6000);
RGP_LOCK;
/* Re-check the epoch under the lock and, still inside it, mark
 * arbitration as in progress so a restart can account for us. */
EpochsEqual = ( startingEpoch == rgp->OS_specific_control.EventEpoch ); if (EpochsEqual) { rgp->OS_specific_control.ArbitrationInProgress += 1; }
RGP_UNLOCK;
if(!EpochsEqual) return 0;
/* Time the arbitration callback so fast winners can be slowed down. */
GetSystemTimeAsFileTime((LPFILETIME)&Time1); status = (*(rgp->OS_specific_control.QuorumCallback))(); GetSystemTimeAsFileTime((LPFILETIME)&Time2);
if (status != 0 && startingEpoch == rgp->OS_specific_control.EventEpoch) { // If we won the arbitration and we are in the same epoch (approx check)
// we need to figure out whether we need to slow down a little
Time2 -= Time1;
// Convert to seconds
/* FILETIME is in 100ns units: /10 -> us, /1000 -> ms, /1000 -> s. */
Time2 = Time2 / 10 / 1000 / 1000; //
// [HACKHACK] GorN Oct/30/1999
// We had a weird timejump in the middle of the arbitration
// Arbitration was completed before it started, we slept for
// too long and regroup timed us out. Let's guard against it.
//
if ( (Time2 >= 0) && (Time2 < AVERAGE_ARBITRATION_TIME_IN_SECONDS) ) { //
// Don't need to be better than the average
// If we are so fast, let's slow down
//
Time2 = AVERAGE_ARBITRATION_TIME_IN_SECONDS - Time2; RGP_TRACE( "RGP sleeping", (ULONG)Time2, /* TRACE */ 0, /* TRACE */ 0, /* TRACE */ 0 ); /* TRACE */ Sleep( (ULONG)(Time2 * 1000) ); } }
RGP_LOCK;
rgp->OS_specific_control.ArbitrationInProgress -= 1;
EpochsEqual = ( startingEpoch == rgp->OS_specific_control.EventEpoch );
if(!EpochsEqual) { RGP_UNLOCK; return 0; }
if(status) { //
// We own the quorum device
// Let's proceed to the next stage
//
enter_first_cleanup_stage(); RGP_UNLOCK; //
// All the rest will see that we are in cleanup stage and
// will proceed to it too
//
} else { //
// Force Others to regroup //
//
rgp_event_handler( RGP_EVT_BANISH_NODE, EXT_NODE(rgp->mynode) ); RGP_UNLOCK;
//
// Kill this node
//
RGP_ERROR(RGP_ARBITRATION_FAILED); }
return 0; }
/************************************************************************
* rgp_check_packet * rgp_print_packet * ================= * * Description: * * Forward declarations of functions used in rgp_sanity_check macro * ************************************************************************/ void rgp_print_packet(rgp_pkt_t* pkt, char* label, int code); int rgp_check_packet(rgp_pkt_t* pkt);
/************************************************************************
 * rgp_sanity_check
 * ================
 *
 * Description:
 *
 *    Validates a regroup packet and, when rgp_check_packet reports a
 *    problem (nonzero code), prints the packet together with the given
 *    label and the failure code. Used to catch unreasonable values in
 *    the powerfail, knownstages, pruning_result, and
 *    connectivity_matrix fields.
 *
 * Parameters:
 *
 *    __pkt -
 *       rgp_pkt_t* packet to be checked
 *    __label -
 *       char* label that will be printed together with the packet
 *
 * Returns:
 *
 *    VOID
 *
 ************************************************************************/
#define rgp_sanity_check(__pkt,__label)                                 \
   do {                                                                 \
      int __code = rgp_check_packet(__pkt);                             \
      if ( __code != 0 )                                                \
      {                                                                 \
         rgp_print_packet(__pkt, __label, __code);                      \
      }                                                                 \
   } while ( 0 )
/*---------------------------------------------------------------------------*/
/************************************************************************
* split_brain_avoidance_algorithm * =============================== * * Description: * * This algorithm ensures that, after a regroup incident completes, * at most one group of nodes will survive regardless of connectivity * failures. * * Parameters: * * None * * Returns: * * void - no return value; The algorithm results in either this node * halting (with the RGP_AVOID_SPLIT_BRAIN halt code) or this group * being the only group that survives. * * Algorithm: * * The algorithm is described in detail in the Sierra Tech Memo S.84, * "Modifications in Regroup Algorithm for Sierra". * * The algorithm looks at the set of nodes currently visible from the * local cluster and compares it to the set of nodes alive before * the regroup incident started (outerscreen). The decision to survive * or halt depends on the number of nodes in the current group compared * to the number of nodes in the original group. * * Case 1: * If the current group contains > half the original number, this * group survives. * * Case 2: * If the current group contains < half the original number, this * node (and group) halts. * * Case 3: * If the current group contains exactly half the original number AND * the current group has at least two members, then this group * survives if and only if it contains the tie-breaker node (selected * when the cluster is formed and after each regroup incident). * * Case 4: * If the current group contains exactly half the original number AND * the current group has exactly one member, then we will call the * QuromSelect procedure to check if the Quorum Disk is accessible * from this node. If the procedure returns value TRUE we survive; * else we halt. * * ************************************************************************/ _priv _resident static void split_brain_avoidance_algorithm() { int orig_numnodes, current_numnodes;
RGP_TRACE( "RGP SpltBrainAlg", EXT_NODE(rgp->tiebreaker), /* TRACE */ GetCluster( rgp->rgpinfo.cluster ), /* TRACE */ GetCluster( rgp->outerscreen ), /* TRACE */ GetCluster( rgp->rgppkt.knownstage2 ) ); /* TRACE */
/* Sanity checks:
* 1. The current set of nodes must be a subset of the original set * of nodes. * 2. My node must be in the current set. This was checked * when stage2 was entered. No need to check again. */ if (!ClusterSubsetOf(rgp->rgpinfo.cluster, rgp->rgppkt.knownstage2)) RGP_ERROR(RGP_INTERNAL_ERROR);
orig_numnodes = ClusterNumMembers(rgp->rgpinfo.cluster); current_numnodes = ClusterNumMembers(rgp->rgppkt.knownstage2);
if (orig_numnodes == current_numnodes) /* All nodes are alive. No split brain possibility. */ return;
else if (orig_numnodes == 2) /* Special 2-node case */ { if ((*(rgp->OS_specific_control.QuorumCallback))()) return; /* we have access to Quorum disk. We survive. */ else { #if defined( NT )
ClusnetHalt( NmClusnetHandle ); #endif
RGP_ERROR(RGP_AVOID_SPLIT_BRAIN); } } /* Special 2-node case */
else /* Multi (>2) node case */ { if ((current_numnodes << 1) > orig_numnodes) /* Our group has more than half the nodes => we are the majority.
* We can survive. Other group(s) will kill themselves. */ return; else if ((current_numnodes << 1) < orig_numnodes) /* Our group has less than half the nodes => there may be a
* larger group alive. We must halt and allow that group to * survive. */ RGP_ERROR(RGP_AVOID_SPLIT_BRAIN); else { /* Our group has exactly half the number of processors;
* We survive if we contain the tie-breaker node and halt otherwise. */ if (ClusterMember(rgp->rgppkt.knownstage2, rgp->tiebreaker)) return; else RGP_ERROR(RGP_AVOID_SPLIT_BRAIN); } } /* Multi (>2) node case */
}
/************************************************************************
 * regroup_restart * =============== * * Description: * * Starts a new regroup incident. * * Parameters: * * None * * Returns: * * void - no return value * * Algorithm: * * Sets the regroup state to RGP_ACTIVATED, pauses all IO and * initializes the stage masks and connectivity matrix. * ************************************************************************/ _priv _resident static void regroup_restart() { cluster_t old_ignorescreen; UnpackIgnoreScreen(&rgp->rgppkt, old_ignorescreen);
RGP_TRACE( "RGP (re)starting", rgp->rgppkt.seqno, /* TRACE */ rgp->rgppkt.reason, /* TRACE */ rgp->rgppkt.activatingnode, /* TRACE */ rgp->rgppkt.causingnode ); /* TRACE */
RGP_TRACE( "RGP masks ", RGP_MERGE_TO_32( rgp->outerscreen, /* TRACE */ rgp->innerscreen ), /* TRACE */ RGP_MERGE_TO_32( rgp->rgppkt.knownstage1, /* TRACE */ rgp->rgppkt.knownstage2 ), /* TRACE */ RGP_MERGE_TO_32( rgp->rgppkt.knownstage3, /* TRACE */ rgp->rgppkt.knownstage4 ), /* TRACE */ RGP_MERGE_TO_32( rgp->rgppkt.knownstage5, /* TRACE */ rgp->rgppkt.pruning_result ) ); /* TRACE */
/* We are about to start a new pass of the regroup algorithm.
* This does not necessarily mean we have finished the previous * pass; i.e., in an abort situation we may be starting over. * This may occur when some other node fails during the current * pass through the algorithm leaving us hung up at one of the * intermediate stages. */
//
// GN. When we do MM_LEAVE. Our state is COLDLOADED.
// Bailing out of regroup_restart here would prevent us from
// forming a regroup packet that would initate a banishing regroup incident
//
/* To avoid split brained nodes from corrupting data in storage
* devices, we request the transport subsystem to hold all IO requests * in a queue and not transfer them over SNet. We will allow IO to * be resumed when regroup can guarantee that there can no longer be * split brains. This will be done when the final group is determined * and regroup enters the RGP_PHASE1_CLEANUP stage. */
rgp_hold_all_io();
/* The following is a bit of history from the NSK regroup algorithm from
* pre-Sierra systems based on the InterProcessor Bus (IPB). Some of * the particulars mentioned here have changed, but the principle remains. * * Previously, we used to mark all the known stages as zero, except for * stage1. We used to mark only ourselves as in stage1. So, even if our * bus reception logic is screwed up, and we are not receiving packets * from anybody including ourselves, we would mark ourselves as being in * stage1. And after (what used to be) six ticks, we would proceed into * stage2 and mark ourselves as being in stage2. This would cause stage1 * and stage2 to be equal, and our world would constitute just * ourselves. Thus we would go through regroup eliminating everybody * else. However, since we are not receiving packets from anybody else, * we would miss our own iamalive packets, and we too will soon die of * %4032. Thus the symptoms would constitute everybody else dying of * (%4040 + some node number), and that node dying with a %4032 halt. * See TPR S 88070112309628 for more details. * * To avoid this situation, we now do not mark ourselves as in a * particular stage until we get our own regroup packets indicating we * are in that stage. Thus, in regroup_restart, all the stages are * cleared. Previously, regroupbroadcaststatus in sendqueuedmessages * used to send directly from the regroup_control structures. * regroupbroadcaststatus has been modified to construct the unsequenced * packets on its stack. It would first copy the state from the * regroup_control structure, and then would LOR in our node into a known * stage, if requested to do so. When we receive that packet, we would * merge that information into our state, and thus we would be * guaranteed that our bus sending and reception logic is working, and * that we can legitimately mark ourselves as being in that stage. This * whole change avoids problems where bus sending logic works, but bus * reception logic is screwed up for both buses in a node. */
rgp->sendstage = 0; /* Don't let anyone know I am in stage 1 until
* I have seen a regroup clock tick; this is to * cause this node to halt if it is not getting * clock ticks. I will halt when the other nodes * advance without me and send me a status packet * indicating this or send me a poison packet * after declaring me down. */
/* Clear the tick counter and all per-stage check-in masks for the new
 * pass; they refill as nodes' packets arrive. */
rgp->rgpcounter = 0; ClusterInit(rgp->rgppkt.knownstage1); ClusterInit(rgp->rgppkt.knownstage2); ClusterInit(rgp->rgppkt.knownstage3); ClusterInit(rgp->rgppkt.knownstage4); ClusterInit(rgp->rgppkt.knownstage5); ClusterInit(rgp->rgppkt.pruning_result);
MatrixInit(rgp->rgppkt.connectivity_matrix); MatrixInit(rgp->internal_connectivity_matrix); /* Just for ease of debugging, to send in our poison packets, we keep
* the known nodes mask at the start of regroup. poison packets contain * known nodes at the beginning of regroup and at the end of it. */
ClusterCopy(rgp->initnodes, rgp->rgpinfo.cluster); ClusterInit(rgp->endnodes);
#if defined( NT )
//
// increment the event epoch so we can detect stale events
// from clusnet
//
++rgp->OS_specific_control.EventEpoch; #endif
if ( (rgp->rgppkt.stage >= RGP_CLOSING) && (rgp->rgppkt.stage <= RGP_PHASE2_CLEANUP) && ClusterCompare(rgp->rgppkt.knownstage1, rgp->rgppkt.knownstage2) ) { //
// If we were interrupted by this restart after we closed
// 1st stage regroup window, then no nodes can be added to group w/o joining.
//
// Thus we will add missing nodes into our ignorescreen.
// This will force the regroup not to wait for them in stage1
cluster_t tmp;
ClusterDifference(tmp, rgp->rgpinfo.cluster, rgp->innerscreen); ClusterUnion(rgp->ignorescreen, rgp->ignorescreen, tmp); }
if ( ClusterMember(rgp->ignorescreen, rgp->mynode) ) { // We shouldn't have get here, but since we are here
// Let's shield us from the outside world
RGP_TRACE( "Self Isolation", 0, 0, 0, 0 ); ClusterCopy(rgp->ignorescreen, rgp->rgpinfo.cluster); ClusterDelete(rgp->ignorescreen, rgp->mynode); }
if ( !ClusterEmpty(rgp->ignorescreen) ) { // if we are ignoring somebody we have
// to be cautious. I.e. we will stay longer in the
// first stage to give a chance to everybody to learn about
// our ignorescreen
rgp->cautiousmode = 1; } if ( !ClusterCompare(old_ignorescreen, rgp->ignorescreen) ) { // Ignore screen is changed, reset restart counter //
RGP_TRACE( "Ignorescreen->", GetCluster(old_ignorescreen), GetCluster(rgp->ignorescreen), 0, 0 ); rgp->restartcount = 0; } PackIgnoreScreen(&rgp->rgppkt, rgp->ignorescreen);
/* Reset per-incident arbitration state so stage 4 can arbitrate afresh. */
rgp->arbitration_started = 0;
rgp->OS_specific_control.ArbitratingNode = MM_INVALID_NODE; if ( !rgp_is_perturbed() ) { ResetEvent( rgp->OS_specific_control.Stabilized ); }
/* Rebuild the quorum-owner mask: only our own ownership is known
 * locally; other owners are learned from their packets. */
ClusterInit(rgp->rgppkt.quorumowner); if( QuorumOwner == (DWORD)EXT_NODE(rgp->mynode) ) { ClusterInsert(rgp->rgppkt.quorumowner, rgp->mynode); }
if (rgp->rgppkt.stage == RGP_COLDLOADED) { if (!rgp->OS_specific_control.ShuttingDown) { //
// Currently, RGP_RELOADFAILED calls ExitProcess
// During clean shutdown we would like to send the regroup packet
// out triggering a regroup. So we don't want to die.
//
// Since we are not resetting state to RGP_ACTIVATED, this
// node will not be able to participate in the regroup.
//
RGP_ERROR(RGP_RELOADFAILED); } } else { rgp->rgppkt.stage = RGP_ACTIVATED; }
}
/************************************************************************
 * regroup_test_stage2_advance
 * ===========================
 *
 * Description:
 *
 *    Determines whether this node may advance from stage 1 to
 *    regroup stage 2.
 *
 * Parameters:
 *
 *    None
 *
 * Returns:
 *
 *    int - 1 if stage 2 can be entered and 0 if not.
 *
 * Algorithm:
 *
 *    Stage 2 can be entered when any of the following holds:
 *
 *    (a) at least one regroup clock tick has occurred and every node
 *        we are waiting for has checked in;
 *    (b) we are not in cautious mode, all nodes but one have checked
 *        in, and more than the minimum number of stage-1 ticks
 *        have elapsed;
 *    (c) RGP_MUST_ENTER_STAGE2 ticks have elapsed, regardless of who
 *        has checked in.
 *
 ************************************************************************/
_priv _resident static int regroup_test_stage2_advance()
{
   cluster_t missing;      /* nodes that have not checked in yet */
   int       num_missing;  /* how many such nodes                */

   /* Never advance before the first regroup clock tick. */
   if (rgp->rgpcounter == 0)
   {
      return(0);
   }

   /* Unconditional advance once the hard tick limit is reached. */
   if (rgp->rgpcounter >= RGP_MUST_ENTER_STAGE2)
   {
      RGP_TRACE( "RGP S->2cautious", rgp->rgpcounter,          /* TRACE */
                 rgp->cautiousmode,                            /* TRACE */
                 GetCluster( rgp->outerscreen ),               /* TRACE */
                 GetCluster( rgp->rgppkt.knownstage1 ) );      /* TRACE */
      return(1);
   }

   /* Between 1 and RGP_MUST_ENTER_STAGE2 ticks: examine the stage-1
    * mask. Compute the nodes of the old configuration not yet
    * recognized, then drop any node on the ignore screen -- we cannot
    * receive packets from those, so there is no point waiting. */
   ClusterDifference(missing, rgp->outerscreen, rgp->rgppkt.knownstage1);
   ClusterDifference(missing, missing, rgp->ignorescreen);
   num_missing = ClusterNumMembers(missing);

   if (num_missing == 0)
   {
      /* All present and accounted for -- a false alarm, power failure,
       * or connectivity failure; advance at once. */
      RGP_TRACE( "RGP S->2 all in ", rgp->rgpcounter,          /* TRACE */
                 GetCluster( rgp->outerscreen ), 0, 0 );       /* TRACE */
      return(1);
   }

   /* A single straggler may be abandoned early, but only outside
    * cautious mode (no recent power failure, not an aborted rerun)
    * and only after more than Min_Stage1_ticks. That minimum is one
    * tick beyond the allowed consecutive missed-IamAlive period, so
    * any disconnected cluster will have detected the missing
    * IamAlives, started its own regroup, and paused IO -- preventing
    * split-brain data corruption. */
   if (!(rgp->cautiousmode) &&
       (num_missing == 1) &&
       (rgp->rgpcounter > rgp->rgpinfo.Min_Stage1_ticks))
   {
      RGP_TRACE( "RGP S->2 1 miss ", rgp->rgpcounter,             /* TRACE */
                 GetCluster( rgp->outerscreen ),                  /* TRACE */
                 GetCluster( rgp->rgppkt.knownstage1 ), 0 );      /* TRACE */
      return(1);
   }

   return(0);  /* sorry, cannot advance yet */
}
/************************************************************************
 * regroup_stage3_advance * =========================== * * Description: * * This function is called after the split brain avoidance algorithm * is run and the tie-breaker is selected in stage 2. It checks if * we can proceed to stage 3 (RGP_PRUNING) and advances to stage 3 * if possible. * * Parameters: * * None * * Returns: * * int - 1 if the regroup stage has been advanced to RGP_PRUNING; * 0 if the stage cannot be advanced yet. * * Algorithm: * * The algorithm depends on whether we are the tie-breaker or not. * * On the tie-breaker node, we first check if there are any * disconnects in the cluster. If there aren't any, there is no need * for pruning. We can then set pruning_result to knownstage2, * advance to the RGP_PRUNING stage and return 1. If there are * disconnects, we must wait a certain number of ticks to collect * connectivity info from all nodes. If the number of ticks have not * passed, return 0. If the required number of ticks have elapsed, * we must call the pruning algorithm to get the list of potential * groups. After that, the select_cluster() routine is called to * pick one from the set of possible clusters. After this is done, * pruning_result is set to the selected cluster and we return 1. * * On a non-tiebreaker node, nothing is done till a stage3 packet is * received from the tie-breaker node or another node which got a * stage 3 packet. If a stage 3 packet has not been received, we * simply return 0. If a stage 3 packet is received, RGP_PRUNING * stage is entered and we return 1. * ************************************************************************/ _priv _resident int regroup_stage3_advance() { int stage_advanced = 0, numgroups, groupnum;
if (rgp->tiebreaker == rgp->mynode) { if (connectivity_complete(rgp->rgppkt.connectivity_matrix)) {
/* No disconnects. All nodes in knownstage2 survive. */ rgp->rgppkt.stage = RGP_PRUNING;
ClusterCopy(rgp->rgppkt.pruning_result, rgp->rgppkt.knownstage2); stage_advanced = 1;
RGP_TRACE( "RGP S->3 NoPrune", rgp->rgpcounter, 0, 0, 0 ); }
/* There are disconnects; must wait for connectivity
* information to be complete. The info is deemed * complete after a fixed number of ticks have * elapsed. */
else if (rgp->pruning_ticks >= RGP_CONNECTIVITY_TICKS) { /* connectivity info collection complete; enter stage 3 */
RGP_TRACE( "RGP Con. matrix1", RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[0], /*TRACE*/ rgp->rgppkt.connectivity_matrix[1] ), /*TRACE*/ RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[2], /*TRACE*/ rgp->rgppkt.connectivity_matrix[3] ), /*TRACE*/ RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[4], /*TRACE*/ rgp->rgppkt.connectivity_matrix[5] ), /*TRACE*/ RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[6], /*TRACE*/ rgp->rgppkt.connectivity_matrix[7])); /*TRACE*/ RGP_TRACE( "RGP Con. matrix2", RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[8], /*TRACE*/ rgp->rgppkt.connectivity_matrix[9] ), /*TRACE*/ RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[10], /*TRACE*/ rgp->rgppkt.connectivity_matrix[11]), /*TRACE*/ RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[12], /*TRACE*/ rgp->rgppkt.connectivity_matrix[13]), /*TRACE*/ RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[14], /*TRACE*/ rgp->rgppkt.connectivity_matrix[15]));/*TRACE*/
/* Enumerate every maximal fully-connected candidate group. */
numgroups = find_all_fully_connected_groups( rgp->rgppkt.connectivity_matrix, rgp->tiebreaker, rgp->potential_groups);
/* Pick the surviving group: either via the select_cluster callback
 * registered at rgp_start() time, or -- when none was given -- via
 * regroup's own rgp_select_cluster_ex, keyed on the lowest-id
 * surviving quorum owner (RGP_NULL_NODE if no owner survived). */
if ((void *)rgp->select_cluster == RGP_NULL_PTR) { node_t keynode; cluster_t temp; ClusterIntersection( temp, rgp->rgppkt.knownstage2, rgp->rgppkt.quorumowner ); if ( ClusterEmpty(temp) ) { keynode = RGP_NULL_NODE; } else { keynode = rgp_select_tiebreaker(temp); } RGP_TRACE( "RGP keynode ng ", keynode, numgroups, 0, 0); /*TRACE*/ /* No callback specified; use regroup's own routine. */ groupnum = rgp_select_cluster_ex( rgp->potential_groups, numgroups, keynode); } else { /* Call routine specified at rgp_start() time. */ groupnum = (*(rgp->select_cluster))( rgp->potential_groups, numgroups); }
if (groupnum >= 0) ClusterCopy(rgp->rgppkt.pruning_result, rgp->potential_groups[groupnum]); else /* No group can survive. Can't halt yet.
* Need to tell everyone else. */ ClusterInit(rgp->rgppkt.pruning_result);
rgp->rgppkt.stage = RGP_PRUNING;
stage_advanced = 1;
RGP_TRACE( "RGP S->3 Pruned ", rgp->rgpcounter, /* TRACE */ GetCluster( rgp->rgppkt.knownstage2 ), /* TRACE */ GetCluster( rgp->rgppkt.pruning_result ), /* TRACE */ numgroups ); /* TRACE */
} /* connectivity info collection complete; enter stage 3 */
} /* tie-breaker node */
else
{ /* not tie-breaker node */
if (ClusterNumMembers(rgp->rgppkt.knownstage3) != 0) { /* We got a stage 3 packet from someone. Enter stage 3. */ rgp->rgppkt.stage = RGP_PRUNING;
stage_advanced = 1;
RGP_TRACE( "RGP Got S3 pkt ", rgp->rgpcounter, /* TRACE */ GetCluster( rgp->rgppkt.knownstage2 ), /* TRACE */ GetCluster( rgp->rgppkt.pruning_result ), /* TRACE */ GetCluster( rgp->rgppkt.knownstage3 ) ); /* TRACE */ }
} /* not tie-breaker node */
return(stage_advanced); }
/************************************************************************
 * enter_first_cleanup_stage * ========================= * * Description: * * This function performs the actions required when entering the * first of the message clean up stages. * * Parameters: * * None * * Returns: * * void - no return value * * Algorithm: * * There are many actions to be performed after the final cluster * is selected. The actions are described in comments throughout * this routine. * ************************************************************************/ _priv _resident void enter_first_cleanup_stage() { cluster_t banishees; node_t failer;
rgp->rgppkt.stage = RGP_PHASE1_CLEANUP;
RGP_TRACE( "RGP S->4 ", rgp->rgpcounter, 0, 0, 0 );
/* The packets we send now will not indicate we are in the phase 1
* cleanup stage yet. We indicate we are in this stage only after * we have completed the clean up action associated with the stage. * This is done in rgp_event_handler, under the * RGP_EVT_PHASE1_CLEANUP_DONE event. */ rgp->sendstage = 0;
/* Now, we can resume IO since we have passed the split brain danger.
* New split brain situations will result in regroup restarting and * pausing IO again. */
rgp_resume_all_io();
/* Compute in banishees the set of nodes being lost from the old
* configuration. */
ClusterDifference(banishees, rgp->rgpinfo.cluster, rgp->rgppkt.pruning_result);
/* Install the new configuration into the masks. */
ClusterCopy(rgp->outerscreen, rgp->rgppkt.pruning_result);
#if defined( NT )
/* Push the new outerscreen down to clusnet so it filters traffic from
 * banished nodes at the driver level. */
ClusnetSetOuterscreen( NmClusnetHandle, (ULONG)*((PUSHORT)rgp->outerscreen) ); #endif
ClusterCopy(rgp->innerscreen, rgp->rgppkt.pruning_result); ClusterCopy(rgp->endnodes, rgp->rgppkt.pruning_result); ClusterCopy(rgp->rgpinfo.cluster, rgp->rgppkt.pruning_result);
/* Select a new tiebreaker because the previous one may have been */ /* pruned out. Note: tiebreaker_selected has already been set in S2. */ rgp->tiebreaker = rgp_select_tiebreaker(rgp->rgppkt.pruning_result); /* F40 Bug FixID KCY0833 */
/* Mark the state of the banishees as dead and invoke the
* node down callback routine. */ for (failer = 0; failer < (node_t) rgp->num_nodes; failer++) if (ClusterMember(banishees, failer) || rgp->node_states[failer].status == RGP_NODE_COMING_UP // fix bug#265069
) { rgp->node_states[failer].status = RGP_NODE_DEAD; rgp->node_states[failer].pollstate = AWAITING_IAMALIVE; rgp->node_states[failer].lostHBs = 0;
#if !defined(NT)
(*(rgp->nodedown_callback))(EXT_NODE(failer)); #else
ClusnetSetNodeMembershipState(NmClusnetHandle, EXT_NODE( failer ), ClusnetNodeStateDead);
//
// On NT we do the nodedown callback at the end of stage 5.
// This allows the cleanup phases to complete before we let
// the "upper" layers know that a node went down.
//
if ( ClusterMember(rgp->OS_specific_control.CPUUPMASK,failer) ) ClusterInsert( rgp->OS_specific_control.NeedsNodeDownCallback, failer );
#endif // !defined(NT)
}
/* If some nodes have been lost from the configuration, then I will
* queue regroup status packets to them. This is a best efforts * attempt to ensure that they get quickly taken out if they * do in fact continue to run. */
ClusterUnion(rgp->status_targets, banishees, rgp->status_targets);
//
// In NT, we are using rgp->rgppkt.hadpowerfail to transmit
// quorum ownership information
//
#if !defined(NT)
/* I should inform the message system of any node that experienced a
* power on recovery. The message system can use this to clear error * counters so that a link will not be declared down due to errors * which may have been caused by the power failure. */
for (failer = 0; failer < (node_t) rgp->num_nodes; failer++) if ((ClusterMember(rgp->rgppkt.hadpowerfail, failer)) && !(ClusterMember(banishees, failer))) /* This survivor had a power failure. */ rgp_had_power_failure( EXT_NODE(failer) );
#endif // NT
/* Tell the OS to start clean up operations for the failed nodes. */ rgp_start_phase1_cleanup(); }
/************************************************************************
 * evaluatestageadvance
 * ====================
 *
 * Description:
 *
 *    This function evaluates whether additional state transitions are
 *    possible as a result of the info just received.
 *
 * Parameters:
 *
 *    None
 *
 * Returns:
 *
 *    void - no return value
 *
 * Algorithm:
 *
 *    To evaluate whether we can advance through the stages, a loop is
 *    used with a case entry for each stage. If an entry decides not to
 *    advance to the next stage, it must return from the function. If
 *    it does advance, it should not return but remain in the loop
 *    since it is possible to have cascaded stage transitions
 *    especially in a two node system. Thus, the loop is exited when no
 *    more stage transitions are possible.
 *
 ************************************************************************/
_priv _resident static void evaluatestageadvance()
{
   cluster_t temp_cluster;  /* scratch mask for intersection tests */
   node_t node;             /* loop index when scanning for a causing node */
   node_t i;                /* NOTE(review): not referenced in this routine */

   for (;;) /* loop until someone exits by returning */
   {
      switch (rgp->rgppkt.stage)
      {
         /* Never regrouped since load: a regroup incident at this point
          * is fatal unless we are deliberately shutting down. */
         case RGP_COLDLOADED :
         {
            if (!rgp->OS_specific_control.ShuttingDown)
            {
               RGP_ERROR(RGP_RELOADFAILED);
            }
            return;
         }

         case RGP_ACTIVATED :
         { /* evaluate whether to go to stage RGP_CLOSING */
            if (!regroup_test_stage2_advance())
               return;

            /* If we never saw our own stage-1 packet, our send or receive
             * apparatus is broken; halt rather than proceed. */
            if (!ClusterMember(rgp->rgppkt.knownstage1, rgp->mynode))
               RGP_ERROR(RGP_MISSED_POLL_TO_SELF);

            rgp->rgppkt.stage = RGP_CLOSING;
            rgp->rgpcounter = 0;
            rgp->tiebreaker_selected = 0;

            /* If we abort the regroup, and there's somebody that everybody
             * banished on this regroup, the following line keeps him from
             * joining up on the next regroup. */
            ClusterCopy(rgp->innerscreen, rgp->rgppkt.knownstage1);
            break;
         } /* evaluate whether to go to stage RGP_CLOSING */

         case RGP_CLOSING :
         { /* evaluate whether to go to stage RGP_PRUNING */
            /* Once a tiebreaker is chosen, only the stage-3 advance test
             * decides whether we can move on. */
            if (rgp->tiebreaker_selected)
            {
               if (regroup_stage3_advance())
                  break;   /* try to advance further */
               else
                  return;  /* cannot advance any more */
            }

            if (!ClusterCompare(rgp->rgppkt.knownstage1, rgp->rgppkt.knownstage2))
               return;

            //
            // In NT, we no longer use the split-brain avoidance algorithm.
            // We use a cluster-wide arbitration algorithm instead.
            //
#if !defined(NT)
            /* When the known stage 1 and known stage 2 sets are the
             * same, we have the complete set of nodes that are
             * connected to us. It is time to execute the split-
             * brain avoidance algorithm. If we are a splinter group
             * cut off from the main group, we will not survive this
             * algorithm. */
            split_brain_avoidance_algorithm();
#endif // NT

            /* We are the lucky survivors of the split brain avoidance
             * algorithm. Now, we must proceed to elect a new tie-breaker
             * since the current tie-breaker may no longer be with us. */
            rgp->tiebreaker = rgp_select_tiebreaker(rgp->rgppkt.knownstage2);
            rgp->tiebreaker_selected = 1;

            RGP_TRACE( "RGP S2 tiebr sel", rgp->rgpcounter,       /* TRACE */
                       EXT_NODE(rgp->tiebreaker),                 /* TRACE */
                       0, 0 );                                    /* TRACE */
            rgp->pruning_ticks = 0;
            break;
         } /* evaluate whether to go to stage 3 */

         case RGP_PRUNING :
         { /* evaluate whether to go to RGP_PHASE1_CLEANUP stage */
            /* Arbitration already launched: either finish it and enter
             * the first cleanup stage, or keep waiting. */
            if (rgp->arbitration_started)
            {
               if (regroup_test_arbitrate_advance())
               {
                  enter_first_cleanup_stage();
                  break;
               }
               else
               {
                  return; // Stay in this stage //
               }
            }

            if (rgp->has_unreachable_nodes)
            {
               RGP_TRACE( "RGP Unreach Node",
                          GetCluster( rgp->rgppkt.pruning_result ),  /* TRACE */
                          GetCluster( rgp->unreachable_nodes ),
                          0, 0 );                                    /* TRACE */

               /* Must check if the unreachable nodes are in the
                * selected final group. If so, we must restart regroup. */
               ClusterIntersection(temp_cluster,
                                   rgp->unreachable_nodes,
                                   rgp->rgppkt.pruning_result);

               /* Clear the unreachable node mask and flag after examining
                * them. If we restart, we will start with a clean slate. */
               rgp->has_unreachable_nodes = 0;
               ClusterInit(rgp->unreachable_nodes);

               if (ClusterNumMembers(temp_cluster) != 0)
               {
                  /* We have a node unreachable event to a node selected
                   * to survive. We must regenerate the connectivity
                   * matrix and re-run the node pruning algorithm.
                   * Start a new regroup incident.
                   * All restarts are in cautious mode. */
                  rgp->cautiousmode = 1;
                  rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
                  rgp->rgppkt.reason = RGP_EVT_NODE_UNREACHABLE;
                  rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);

                  /* For causingnode, pick the first unreachable node
                   * in temp_cluster. */
                  for (node = 0; node < (node_t) rgp->num_nodes; node++)
                  {
                     if (ClusterMember(temp_cluster, node))
                     {
                        rgp->rgppkt.causingnode = (uint8) EXT_NODE(node);
                        break;
                     }
                  }
                  regroup_restart();
                  return;
               }
            }

            if (!ClusterCompare(rgp->rgppkt.knownstage2, rgp->rgppkt.knownstage3))
               return;

            /* All nodes in the connected cluster have been notified
             * of the pruning decision (entered stage 3). If we are
             * selected to survive, we can now enter stage 4. If we are
             * not in the selected group (pruning_result), we must halt.
             * Wait for at least one node in PRUNING_RESULT to get into
             * stage 4 before halting. This ensures that the algorithm
             * does not stall in stage 3 with all pruned out nodes
             * halting before ANY of the survivors finds that all nodes
             * entered stage 3. */
            if (!ClusterMember(rgp->rgppkt.pruning_result, rgp->mynode))
            {
               /* Wait for at least one node in PRUNING_RESULT
                * to get into stage 4 before halting. Since only
                * nodes in PRUNING_RESULT get into stage 4, it is
                * sufficient to check if knownstage4 has any members. */
               if (ClusterNumMembers(rgp->rgppkt.knownstage4) != 0)
                  RGP_ERROR(RGP_PRUNED_OUT);
               return;
            }

            // proceed to second stage of pruning - arbitration
            if( regroup_start_arbitrate() )
            {
               return;  // stay in this stage
            }
            else
            {
               break;   // either proceed to the next, or restart
            }
            /* NOTE(review): the break below is unreachable; both branches
             * above leave the case. Kept for fidelity. */
            break;
         } /* evaluate whether to go to RGP_PHASE1_CLEANUP stage */

         case RGP_PHASE1_CLEANUP :
         { /* evaluate whether to go to RGP_PHASE2_CLEANUP stage */
            if (!ClusterCompare(rgp->rgppkt.pruning_result, rgp->rgppkt.knownstage4))
               return;

            rgp->rgppkt.stage = RGP_PHASE2_CLEANUP;
            RGP_TRACE( "RGP S->5 ", rgp->rgpcounter, 0, 0, 0 );

            /* The packets we send now will not indicate we are in the phase 2
             * cleanup stage yet. We indicate we are in this stage only after
             * we have completed the clean up action associated with the stage.
             * This is done in rgp_event_handler, under the
             * RGP_EVT_PHASE2_CLEANUP_DONE event. */
            rgp->sendstage = 0;
            rgp_start_phase2_cleanup();
            break;
         } /* evaluate whether to go to RGP_PHASE2_CLEANUP stage */

         case RGP_PHASE2_CLEANUP :
         { /* evaluate whether to go to RGP_STABILIZED stage */
            if (!ClusterCompare(rgp->rgppkt.knownstage4, rgp->rgppkt.knownstage5))
               return;

            RGP_LOCK;
            //
            // [HACKHACK] This is not necessary anymore, since we
            // are holding the lock in message.c when delivering
            // regroup packet received event
            //
            /* NOTE(review): if this race branch is ever taken, break exits
             * the switch with RGP_LOCK still held and the loop re-enters
             * the switch; the HACKHACK note above says the race can no
             * longer occur — confirm before removing. */
            if (RGP_PHASE2_CLEANUP != rgp->rgppkt.stage)
            {
               RGP_TRACE( "RGP S->6 (race) ", rgp->rgpcounter, rgp->rgppkt.stage, 0, 0 );
               break;
            }

            rgp->rgppkt.stage = RGP_STABILIZED;
            RGP_TRACE( "RGP S->6 ", rgp->rgpcounter, 0, 0, 0 );

            rgp->rgpcounter = 0;
            rgp->restartcount = 0;

            /* Reset the regroup flags which have not yet been cleared. */
            rgp->cautiousmode = 0;

            /* Clear the mask indicating nodes which own the quorum resrc. */
            ClusterInit(rgp->rgppkt.quorumowner);

            /* Copy the sequence number into the rgpinfo area. */
            rgp->rgpinfo.seqnum = rgp->rgppkt.seqno;

            SetEvent( rgp->OS_specific_control.Stabilized );

            if (rgp->OS_specific_control.ArbitratingNode != MM_INVALID_NODE)
            {  // Somebody was arbitrating //
               rgp->OS_specific_control.ApproxArbitrationWinner =
                  rgp->OS_specific_control.ArbitratingNode;
               if (rgp->OS_specific_control.ArbitratingNode ==
                   (DWORD)EXT_NODE(rgp->mynode))
               {
                  //
                  // [HackHack] To close 422405
                  // when 421828 is fixed, please uncomment the following line
                  //
                  // QuorumOwner = rgp->OS_specific_control.ArbitratingNode;
               }
               else
               {
                  if (QuorumOwner != MM_INVALID_NODE)
                  {
                     ClRtlLogPrint(LOG_UNUSUAL,
                        "[MM] : clearing quorum owner var (winner is %1!u!), %.\n",
                        rgp->OS_specific_control.ArbitratingNode );
                  }
                  QuorumOwner = MM_INVALID_NODE;
               }
            }

            rgp_cleanup_complete();

#if defined(NT)
            //
            // On NT we deferred doing the node down callback until all the
            // cleanup phases have been complete.
            //
            ClusterCopy( rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster );
            (*(rgp->nodedown_callback))( rgp->OS_specific_control.NeedsNodeDownCallback );
            //
            // Clear the down node mask
            //
            ClusterInit(rgp->OS_specific_control.NeedsNodeDownCallback);
            //
            // finally, tell clusnet that regroup has finished
            //
            ClusnetRegroupFinished(NmClusnetHandle,
                                   rgp->OS_specific_control.EventEpoch,
                                   rgp->rgppkt.seqno);
            rgp->last_stable_seqno = rgp->rgppkt.seqno;
            RGP_UNLOCK;
#endif
            return;
         } /* evaluate whether to go to RGP_STABILIZED stage */

         case RGP_STABILIZED :
            return; /* stabilized, so I am all done */

         default :
            RGP_ERROR(RGP_INTERNAL_ERROR); /* unknown stage */

      } /* switch (rgp->rgppkt.stage) */
   } /* loop until someone exits by returning */
}
/************************************************************************
 * rgp_event_handler
 * =================
 *
 * Description:
 *
 *    The state machine and the heart of the regroup algorithm.
 *
 * Parameters:
 *
 *    int event -
 *       which event happened
 *
 *    node_t causingnode -
 *       node causing the event: node which sent a regroup status
 *       packet or whose IamAlives are missed; if the causing node is
 *       not relevant information, RGP_NULL_NODE can be passed and
 *       is ignored. *This node ID is in external format.*
 *
 * Returns:
 *
 *    void - no return value
 *
 * Algorithm:
 *
 *    The state machine is the heart of the regroup algorithm.
 *    It is organized as a switch statement with the regroup stage as
 *    the case label and the regroup event as the switch variable.
 *    Events could cause regroup to start a new incident, to advance
 *    through stages or to update information without advancing to
 *    another stage. This routine also arranges for regroup status
 *    packets to be sent to all relevant nodes including our own
 *    node.
 *
 ************************************************************************/
_priv _resident void RGP_EVENT_HANDLER_EX(int event, node_t causingnode, void *arg)
{
   rgp_pkt_t *rcvd_pkt_p;         /* incoming packet (RECEIVED_PACKET only) */
   cluster_t ignorescreen_rcvd;   /* sender's ignore screen, unpacked */
   uint8 oldstage;                /* stage before evaluatestageadvance() */
   int send_status_pkts = 0;      /* set when a broadcast is warranted */

   /* Note: arg is only used when event == RGP_EVENT_RECEIVED_PACKET.
      It is the ptr to the packet */

   /* Trace unusual invocations of this routine. */
   if (event != RGP_EVT_RECEIVED_PACKET && event != RGP_EVT_CLOCK_TICK)
      RGP_TRACE( "RGP Event       ", event, causingnode,
                 rgp->rgppkt.stage, rgp->rgpcounter );       /* TRACE */

   switch (event)
   {
      case RGP_EVT_NODE_UNREACHABLE :
      { /* All paths to a node are unreachable */

         /* Ignore the event if the unreachable node has been eliminated
          * from our outerscreen. The message system probably doesn't
          * know it yet. */
         if (ClusterMember(rgp->outerscreen, INT_NODE(causingnode)))
         {
            /* Store this event and check after node pruning (when
             * entering the RGP_PRUNING stage). If a regroup incident
             * is in progress and we haven't entered the RGP_PRUNING
             * stage yet, this will happen in the current incident.
             * If not, it will happen in the next regroup incident
             * which will surely start soon due to this disconnect.
             *
             * We do not start a regroup incident for this event. We will
             * wait for IamAlives to be missed for starting a new regroup
             * incident. This is due to the requirement that, in case
             * of a total disconnect resulting in multiple groups, we must
             * stay in stage 1 till we can guarantee that the other group(s)
             * has started regroup and paused IO. We assume that the
             * regroup incident started at the IamAlive check tick and
             * use the periodic nature of the IamAlive sends and
             * IamAlive checks to limit the stage1 pause to the period
             * of IamAlive sends (+ 1 tick to drain IO). If we started
             * a regroup incident due to the node unreachable event, we
             * have to stay in stage1 longer. */
            rgp->has_unreachable_nodes = 1;
            ClusterInsert(rgp->unreachable_nodes, INT_NODE(causingnode));
            break;
         }
         /* NOTE(review): when the node is NOT in the outerscreen, control
          * falls through into RGP_EVT_PHASE1_CLEANUP_DONE below (the break
          * above sits inside the if). That next case is guarded by a stage
          * check, so this is likely benign, but confirm it is intended. */
      } /* All paths to a node are unreachable */

      case RGP_EVT_PHASE1_CLEANUP_DONE :
      {
         /* The following checks are needed in case we restarted
          * regroup and asked for phase1 cleanup multiple times.
          * We must make sure that all such requests have been
          * completed. */
         if ( (rgp->rgppkt.stage == RGP_PHASE1_CLEANUP) &&
              (rgp->rgp_msgsys_p->phase1_cleanup == 0) )
         { /* all caught up */
            /* Let others and ourselves get packets indicating we are in
             * this stage. When we get that packet, we will update our
             * knownstage field. If our sending or receiving apparatus
             * failed meanwhile and we don't get our own packet, it
             * will cause regroup to be restarted. */
            rgp->sendstage = 1;
            send_status_pkts = 1;
            evaluatestageadvance();
         } /* all caught up */
         break;
      }

      case RGP_EVT_PHASE2_CLEANUP_DONE :
      {
         /* The following checks are needed in case we restarted
          * regroup and asked for phase2 cleanup multiple times.
          * We must make sure that all such requests have been
          * completed. */
         if ( (rgp->rgppkt.stage == RGP_PHASE2_CLEANUP) &&
              (rgp->rgp_msgsys_p->phase2_cleanup == 0) )
         { /* all caught up */
            /* Let others and ourselves get packets indicating we are
             * in this stage. */
            rgp->sendstage = 1;
            send_status_pkts = 1;
            evaluatestageadvance();
         } /* all caught up */
         break;
      }

      case RGP_EVT_LATEPOLLPACKET :
      { /* some node is late with IamAlives */
         RGP_LOCK; // to ensure that the packet receive does not initiate
                   // regroup asynchronously.

         /* Start a new regroup incident if not already active. */
         if (rgp->rgppkt.stage == RGP_STABILIZED)
         {
            rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
            rgp->rgppkt.reason = RGP_EVT_LATEPOLLPACKET;
            rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);
            rgp->rgppkt.causingnode = (uint8) causingnode;
            regroup_restart();
            send_status_pkts = 1;
         }
         else if (rgp->rgppkt.stage == RGP_COLDLOADED)
         {
            RGP_ERROR(RGP_RELOADFAILED);
         }
         RGP_UNLOCK;
         break;
      } /* some node is late with IamAlives */

      case MM_EVT_LEAVE:
         /* Graceful leave: flag shutdown, then banish ourselves. */
         rgp->OS_specific_control.ShuttingDown = TRUE;
         /* intentional fall through to RGP_EVT_BANISH_NODE */
      case RGP_EVT_BANISH_NODE :
      { /* assumes that the lock is held */
         rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
         rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);
         // Pack Ignore Screen in the regroup_restart will
         // fill reason and causingnode fields of the packet
         ClusterInsert(rgp->ignorescreen, INT_NODE(causingnode) );
         regroup_restart();
         send_status_pkts = 1;
         break;
      }
#if 0
      case MM_EVT_LEAVE: // this node needs to leave the cluster gracefully
      { // Initiate a Regroup Event amongst remaining members if any
         // Start a new regroup incident if not already active.
         if (rgp->rgppkt.stage == RGP_STABILIZED)
         {
            rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
            rgp->rgppkt.reason = MM_EVT_LEAVE;
            rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);
            rgp->rgppkt.causingnode = (uint8) EXT_NODE(rgp->mynode);
            regroup_restart();
            send_status_pkts = 1;
         }
         break;
      }
#endif

      case RGP_EVT_CLOCK_TICK :
      { /* called on regroup clock tick when regroup is active */

         /* While arbitration is running, only count arbitration ticks and
          * enforce the arbitration timeout. */
         if( (rgp->rgppkt.stage == RGP_PRUNING) && (rgp->arbitration_started) )
         {
            rgp->arbitration_ticks++;
            if (rgp->arbitration_ticks >= RGP_ARBITRATION_TIMEOUT)
            {
               //
               // Kill timed-out arbitrator
               //
               if(rgp->tiebreaker == rgp->mynode)
               {
                  //
                  // If this node was arbitrating, then die
                  //
                  if ( IsDebuggerPresent() ) { DebugBreak(); }
                  RGP_ERROR(RGP_ARBITRATION_STALLED);
               }
               else
               {
                  //
                  // Kill the arbitrator and initiate another regroup
                  //
                  RGP_TRACE( "RGP arbitration stalled ", rgp->rgppkt.stage, 0, 0, 0 );
                  rgp_event_handler( RGP_EVT_BANISH_NODE, EXT_NODE(rgp->tiebreaker) );
                  break;
               }
            }
            evaluatestageadvance();
            //
            // No need to send packets while we are waiting for
            // the arbitrator to win
            //
            // send_status_pkts = rgp->rgppkt.stage != RGP_PRUNING;
            //
            // [GN] Wrong. We do have to send status packets.
            // If we have partial connectivity, we need to
            // continue exchanging packets, so that the pruner,
            // can learn indirectly that all nodes got the pruning results.
            //
            send_status_pkts = 1;
            break;
         }
         else
         {
            rgp->rgpcounter++; /* increment the counter */
         }

         if ( (rgp->rgppkt.stage == RGP_ACTIVATED) && (rgp->sendstage == 0) )
         {
            /* To detect the potential failure of my timer pop mechanism
             * (such as by the corruption of the time list), I wait for
             * at least one regroup clock tick before I let myself and
             * others know I am in stage 1. */
            // [GorN Jan14/2000]
            // We don't send our connectivity information,
            // before we get the first clock tick.
            // However we collect this information in
            // rgp->internal_connectivity_matrix.
            // Let's put it in the outgoing packet
            // so that everybody will see what we think about them.
            MatrixOr(rgp->rgppkt.connectivity_matrix,
                     rgp->internal_connectivity_matrix);
            rgp->sendstage = 1; /* let everyone know we are in stage 1 */
         }
         else if ( (rgp->rgppkt.stage >= RGP_CLOSING) &&
                   (rgp->rgppkt.stage <= RGP_PHASE2_CLEANUP) )
         { /* check for possible abort and restart */
            if (rgp->rgpcounter >= RGP_MUST_RESTART)
            {
               /* Stalled out. Probably someone died after starting
                * or another node is still in stage 1 cautious mode */
               if ( ++(rgp->restartcount) > RGP_RESTART_MAX )
               {
                  // It is not a good idea to die, because somebody
                  // is stalling. Let's add stallees into ignore mask and restart
                  //
                  // RGP_ERROR(RGP_INTERNAL_ERROR); // [Fixed]
                  cluster_t tmp, *stage;
                  /* NOTE(review): no default arm; `stage` stays
                   * uninitialized if the stage were out of range. The
                   * enclosing else-if restricts us to these four stages,
                   * so this is currently unreachable — but fragile. */
                  switch (rgp->rgppkt.stage)
                  {
                     case RGP_CLOSING:         stage = &rgp->rgppkt.knownstage2; break;
                     case RGP_PRUNING:         stage = &rgp->rgppkt.knownstage3; break;
                     case RGP_PHASE1_CLEANUP:  stage = &rgp->rgppkt.knownstage4; break;
                     case RGP_PHASE2_CLEANUP:  stage = &rgp->rgppkt.knownstage5; break;
                  }
                  /* Stallees = configured cluster minus those known to have
                   * reached the current stage. */
                  ClusterDifference(tmp, rgp->rgpinfo.cluster, *stage);
                  //
                  // If we stalled during closing, due to tiebraker running
                  // the pruning algorithn going bunkers, we can have tmp = 0
                  // In this case, we need to ignore somebody to guarantee that
                  // the algorithm completes.
                  //
                  if ( ClusterEmpty(tmp) && rgp->tiebreaker_selected)
                  {
                     ClusterInsert(tmp, rgp->tiebreaker);
                  }
                  ClusterUnion(rgp->ignorescreen, rgp->ignorescreen, tmp);
               }

               /* If we are stalling in stage 3 and we have been pruned out,
                * it is possible that we are stalling because we have been
                * isolated from all other nodes. We must halt in this case. */
               if ( (rgp->rgppkt.stage == RGP_PRUNING) &&
                    !ClusterMember(rgp->rgppkt.pruning_result, rgp->mynode) )
                  RGP_ERROR(RGP_PRUNED_OUT);

               rgp->cautiousmode = 1;
               rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
               RGP_TRACE( "RGP stalled ", rgp->rgppkt.stage, 0, 0, 0 );
               regroup_restart();
            } /* Stalled out ... */
         } /* check for possible abort and restart */

         if ((rgp->rgppkt.stage == RGP_CLOSING) && rgp->tiebreaker_selected)
            rgp->pruning_ticks++;

         evaluatestageadvance();
         send_status_pkts = 1; /* send rgp packets regardless of progress */
         break;
      } /* called on regroup clock tick when regroup is active */

      case RGP_EVT_RECEIVED_PACKET :
      { /* received an rgp packet */

         /* If the sending node is excluded by the outer screen, then it is
          * not even part of the current (most recently known) configuration.
          * Therefore the packet should not be honored, and a poison message
          * should be sent to try to kill this renegade processor.
          * That is done in the calling routine that processes all incoming
          * regroup module packets (IamAlive, regroup and poison packets). */

         /* If the sending node was accepted by the outer screen but then
          * excluded by the inner screen, then the packet will be disregarded
          * but no poison message sent. This phenomenon may occur when this
          * node has entered stage 2 without having heard from (recognized)
          * the sending node and then a message arrives late from that
          * sending node. In this case the fate of the sending node, i.e.
          * whether it gets ruled out of the global configuration or not is
          * unknown at this point. If the sender can get itself recognized
          * by some node before that node enters stage 2, then it will be
          * saved. Otherwise it will be declared down and subsequently shot
          * with poison packets if it ever tries to assert itself. */

         /* Remember the arg to this routine is the packet pointer */
         rcvd_pkt_p = (rgp_pkt_t *)arg; /* address of pkt just received */
         if ( rgp->rgppkt.seqno != rcvd_pkt_p->seqno)
            RGP_TRACE( "RGP Event ", event, causingnode,
                       rgp->rgppkt.stage, rgp->rgpcounter ); /* TRACE */

         UnpackIgnoreScreen(rcvd_pkt_p, ignorescreen_rcvd);
         if ( !ClusterEmpty(ignorescreen_rcvd) )
         {
            RGP_TRACE( "RGP Incoming pkt", GetCluster(ignorescreen_rcvd),
                       rcvd_pkt_p->seqno, rgp->rgppkt.stage, causingnode);
         }

         if ( !ClusterMember(rgp->innerscreen, INT_NODE(causingnode)))
         {
            RGP_TRACE( "RGP Ignoring !inner", causingnode, rgp->rgppkt.stage,
                       GetCluster(rgp->innerscreen), GetCluster(ignorescreen_rcvd) );
            return;
         }

         RGP_LOCK; // To ensure that the timer thread does not initiate
                   // regroup asynchronously at this time.

         //////////////////////////// New Ignore Screen Stuff /////////////////////////////////

         /* Sender is on our ignore screen: drop the packet. */
         if (ClusterMember(rgp->ignorescreen, INT_NODE(causingnode) ))
         {
            RGP_UNLOCK;
            RGP_TRACE( "RGP Ignoring", causingnode, rgp->rgppkt.stage,
                       GetCluster(rgp->ignorescreen), GetCluster(ignorescreen_rcvd) );
            return;
         }

         if (rcvd_pkt_p->seqno < rgp->last_stable_seqno )
         {
            RGP_UNLOCK;
            RGP_TRACE( "RGP old packet", causingnode,
                       rcvd_pkt_p->seqno, rgp->last_stable_seqno, 0);
            // This is a late packet from the previous regroup incident
            // from the node that is currently in my outerscreen.
            // This node could not have sent it now, this is probably a packet
            // that stuck somewhere and was delieverd eons later.
            // Simply ignore it.
            return;
         }

         if ( ClusterMember(ignorescreen_rcvd, rgp->mynode ) )
         {
            //
            // Sender ignores me. We will do the same to him.
            //
            ClusterInsert(rgp->ignorescreen, INT_NODE(causingnode) );
            rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
            regroup_restart();
            send_status_pkts = 1;
            RGP_UNLOCK;
            break;
         }

         if ( ClusterCompare(ignorescreen_rcvd, rgp->ignorescreen) )
         {
            // We have the same ignore screen.
            // No work needs to be done
         }
         else if ( ClusterSubsetOf(rgp->ignorescreen, ignorescreen_rcvd) )
         {
            // Incoming packet has smaller ignore screen
            // Ignore this packet, but reply to its sender with
            // our current regroup packet to force to upgrade to
            // our view of the world.
            // do so only if we are properly initialized
            if (rgp->rgppkt.stage == RGP_COLDLOADED && !rgp->OS_specific_control.ShuttingDown)
            {
               RGP_ERROR(RGP_RELOADFAILED);
            }
            RGP_TRACE( "RGP smaller ignore mask ",
                       GetCluster(ignorescreen_rcvd),
                       GetCluster(rgp->ignorescreen),        /* TRACE */
                       rgp->rgppkt.stage, rcvd_pkt_p->stage ); /* TRACE */
            ClusterInsert(rgp->status_targets, INT_NODE(causingnode));
            rgp_broadcast(RGP_UNACK_REGROUP);
            RGP_UNLOCK;
            return;
         }
         else if ( ClusterSubsetOf(ignorescreen_rcvd, rgp->ignorescreen) )
         {
            RGP_TRACE( "RGP bigger ignore mask ",
                       GetCluster(ignorescreen_rcvd),
                       GetCluster(rgp->ignorescreen),        /* TRACE */
                       rgp->rgppkt.stage, causingnode );     /* TRACE */
            // Incoming packet has bigger ignore screen.
            // Upgrade to this information and process the packet
            rgp->rgppkt.seqno = rcvd_pkt_p->seqno;
            /* Somebody else activated regroup. So, let's just copy */
            /* the sender's reason code and reason nodes. */
            //
            // Ignore mask parts are in the reason and activatingnode fields
            //
            ClusterCopy(rgp->ignorescreen, ignorescreen_rcvd); // fix bug #328216
            rgp->rgppkt.reason = rcvd_pkt_p->reason;
            rgp->rgppkt.activatingnode = rcvd_pkt_p->activatingnode;
            rgp->rgppkt.causingnode = rcvd_pkt_p->causingnode;
            regroup_restart();
            send_status_pkts = 1;
         }
         else
         {
            RGP_TRACE( "RGP different ignore masks ",
                       GetCluster(ignorescreen_rcvd),
                       GetCluster(rgp->ignorescreen),        /* TRACE */
                       rgp->rgppkt.stage, causingnode );     /* TRACE */
            // Ignore masks are different and neither of them is
            // a subset of another.
            //
            // We need to merge information out of these masks
            // and restart the regroup.
            //
            // Packet that we just received will be ignored
            ClusterUnion(rgp->ignorescreen, rgp->ignorescreen, ignorescreen_rcvd);
            rgp->rgppkt.seqno = max(rgp->rgppkt.seqno, rcvd_pkt_p->seqno) + 1;
            regroup_restart();
            send_status_pkts = 1;
            RGP_UNLOCK;
            break;
         }

         //////////////////////////// End of new Ignore Screen Stuff /////////////////////////////////
         // Now ignorescreens of this node packet and incoming packet are the same //
         // proceed with regular regroup processing //

         /* Since the packet is acceptable, the regroup sequence number
          * must be compared to that of this node. If the incoming message
          * has a higher sequence number, then a new pass of the regroup
          * algorithm has started. This node must accept the new sequence
          * number, reinitialize its data, and start partcicipating in
          * the new pass. Also, the incoming message must be processed
          * since, once the algorithm reinitializes, the sequence numbers
          * now match.
          *
          * If the incoming packet has a matching sequence number, then it
          * should be accepted. The knowledge of the global state of the
          * algorithm it reflects must be merged with that already present
          * in this node. Then this node must evaluate whether further
          * state transitions are possible.
          *
          * Finally, if the incoming packet has a lower sequence number, then
          * it comes from a node unaware of the current level of the global
          * algorithm. The data in it should be ignored, but a packet should
          * be sent to it so that it will reinitialize its algorithm.
          *
          * The sequence number is a 32 bit algebraic value - hopefully it
          * will never wrap around. */

         if (rcvd_pkt_p->seqno < rgp->rgppkt.seqno)
         { /* sender below current level - ignore but let him know it*/
            RGP_TRACE( "RGP lower seqno ",
                       rgp->rgppkt.seqno, rcvd_pkt_p->seqno,   /* TRACE */
                       rgp->rgppkt.stage, rcvd_pkt_p->stage ); /* TRACE */
            ClusterInsert(rgp->status_targets, INT_NODE(causingnode));
            rgp_broadcast(RGP_UNACK_REGROUP);
            RGP_UNLOCK;
            return;
         }

         if (rcvd_pkt_p->seqno > rgp->rgppkt.seqno)
         { /* sender above current level - I must upgrade to it*/
            // The node that forces a restart responsible for keeping
            // track of restarts and making a decision who will die/be ignored
            // if ( ++(rgp->restartcount) > RGP_RESTART_MAX )
            //    RGP_ERROR(RGP_INTERNAL_ERROR);
            if ( (rgp->rgppkt.stage != RGP_STABILIZED) ||
                 ((rcvd_pkt_p->seqno - rgp->rgppkt.seqno) > 1) )
            {
               RGP_TRACE( "RGP higher seqno",
                          rgp->rgppkt.seqno, rcvd_pkt_p->seqno,  /* TRACE */
                          rgp->rgppkt.stage, rcvd_pkt_p->stage );/* TRACE */
               rgp->cautiousmode = 1;
            }
            rgp->rgppkt.seqno = rcvd_pkt_p->seqno;
            /* Somebody else activated regroup. So, let's just copy */
            /* the sender's reason code and reason nodes. */
            rgp->rgppkt.reason = rcvd_pkt_p->reason;
            rgp->rgppkt.activatingnode = rcvd_pkt_p->activatingnode;
            rgp->rgppkt.causingnode = rcvd_pkt_p->causingnode;
            regroup_restart();
            send_status_pkts = 1;
         } /* sender above current level - I must upgrade to it*/

         /* Now we are at the same level - even if we weren't at first.
          *
          * If the sender has already commited to a view of the world
          * that excludes me, I must halt in order to keep the system in
          * a consistent state.
          *
          * This is true even with the split brain avoidance algorithm.
          * The fact that stage1 = stage2 in the packet implies that the
          * sender has already run the split brain avoidance algorithm
          * and decided that he should survive. */
         if ( (rcvd_pkt_p->stage > RGP_ACTIVATED) &&
              ClusterCompare(rcvd_pkt_p->knownstage1, rcvd_pkt_p->knownstage2) &&
              !ClusterMember(rcvd_pkt_p->knownstage1, rgp->mynode) )
         {
            ClusterInsert(rgp->ignorescreen, INT_NODE(causingnode) );
            rgp->rgppkt.seqno ++;
            regroup_restart();
            send_status_pkts = 1;
            RGP_UNLOCK;
            // /* I must die for overall consistency. */
            // RGP_ERROR((uint16) (RGP_PARIAH + causingnode)); // [Fixed]
            break;
         }
         RGP_UNLOCK;

         /* If I have terminated the active part of the algorithm, I
          * am in stage 6 and am not routinely broadcasting my status
          * anymore. If I get a packet from someone else who has not
          * yet terminated, then I must send him the word. But if he
          * has terminated, I must not send any packet or else there
          * will be an infinite loop of packets bouncing back and forth. */
         if (rgp->rgppkt.stage == RGP_STABILIZED)
         { /* I have terminated so can't learn anything more. */
            if (!ClusterCompare(rcvd_pkt_p->knownstage5, rgp->rgppkt.knownstage5))
            { /* but sender has not so I must notify him */
               ClusterInsert(rgp->status_targets, INT_NODE(causingnode));
               rgp_broadcast(RGP_UNACK_REGROUP);
            }
            return;
         }

         /* At this point, the packet is from a legal node within the
          * current round of the algorithm and I have not terminated
          * at stage RGP_STABILIZED so I need to absorb whatever new
          * info is in this packet.
          *
          * The way to merge what this packet says with what I already
          * know is to just logically OR the known stage x fields
          * together. */
         {
            /* Pack trace words: (seqno|stage) and (status_targets|node). */
            int seqno = rcvd_pkt_p->seqno&0xffff;
            int stage = rcvd_pkt_p->stage&0xffff;
            int trgs = *(int*)rgp->status_targets & 0xffff;
            int node = INT_NODE(causingnode)&0xffff;
            RGP_TRACE( "RGP recv pkt ",
                       ((seqno << 16) | stage),
                       RGP_MERGE_TO_32( rcvd_pkt_p->knownstage1, rcvd_pkt_p->knownstage2 ),
                       RGP_MERGE_TO_32( rcvd_pkt_p->knownstage3, rcvd_pkt_p->knownstage4 ),
                       (trgs << 16) | node );
         }

         rgp_sanity_check(rcvd_pkt_p, "RGP Received packet");
         rgp_sanity_check(&(rgp->rgppkt), "RGP Internal packet");

         /* Merge the sender's view of global progress into ours. */
         ClusterUnion(rgp->rgppkt.quorumowner,
                      rcvd_pkt_p->quorumowner, rgp->rgppkt.quorumowner);
         ClusterUnion(rgp->rgppkt.knownstage1,
                      rcvd_pkt_p->knownstage1, rgp->rgppkt.knownstage1);
         ClusterUnion(rgp->rgppkt.knownstage2,
                      rcvd_pkt_p->knownstage2, rgp->rgppkt.knownstage2);
         ClusterUnion(rgp->rgppkt.knownstage3,
                      rcvd_pkt_p->knownstage3, rgp->rgppkt.knownstage3);
         ClusterUnion(rgp->rgppkt.knownstage4,
                      rcvd_pkt_p->knownstage4, rgp->rgppkt.knownstage4);
         ClusterUnion(rgp->rgppkt.knownstage5,
                      rcvd_pkt_p->knownstage5, rgp->rgppkt.knownstage5);
         ClusterUnion(rgp->rgppkt.pruning_result,
                      rcvd_pkt_p->pruning_result, rgp->rgppkt.pruning_result);

         /* But when I am in stage 2, it is possible that I can learn to
          * recognize some node I have not previously recognized by hearing
          * of it indirectly from some other node that I have recognized.
          * To handle this case, I always merge knownstage1 info into
          * the inner screen so that subsequent messages from the newly
          * recognized node will be accepted and processed. */
         if ((rgp->rgppkt.stage == RGP_CLOSING) && !(rgp->tiebreaker_selected))
            ClusterUnion(rgp->innerscreen,
                         rgp->rgppkt.knownstage1, rgp->innerscreen);

         /* In the first two stages of regroup, the inter-node connectivity
          * information is collected and propagated. When we get a regroup
          * packet, we turn ON the bit corresponding to the [our-node,
          * sender-node] entry in the connectivity matrix. We also OR in
          * the matrix sent by the sender node in the regroup packet.
          *
          * The matrix is not updated if we are in stage 1 and haven't
          * received the first clock tick. This is to prevent the
          * node pruning algorithm from considering us alive if our
          * timer mechanism is disrupted, but the IPC mechanism is OK. */

         /* [GorN 01/07/2000] If we are not collection connectivity information,
          * until we receive a first tick we can ran into problems if the node is
          * killed right after it send out its first timer driven packet
          * (which doesn't have any connectivity info yet). This can cause a
          * confusion. See bug 451792.
          *
          * What we will do is we will collect connectivity information on
          * the side even when rgp->sendstage is FALSE and move it into the regroup
          * packet if we ever get a clock tick */
         if (rgp->rgppkt.stage < RGP_PRUNING && !rgp->sendstage)
         {
            MatrixSet(rgp->internal_connectivity_matrix,
                      rgp->mynode, INT_NODE(causingnode));
            if (causingnode != EXT_NODE(rgp->mynode))
               MatrixOr(rgp->internal_connectivity_matrix,
                        rcvd_pkt_p->connectivity_matrix);
         }
         if ((rgp->rgppkt.stage < RGP_PRUNING) && rgp->sendstage)
         {
            MatrixSet(rgp->rgppkt.connectivity_matrix,
                      rgp->mynode, INT_NODE(causingnode));
            if (causingnode != EXT_NODE(rgp->mynode))
               MatrixOr(rgp->rgppkt.connectivity_matrix,
                        rcvd_pkt_p->connectivity_matrix);
         }

         /* Now, I can evaluate whether additional state transitions are
          * possible as a result of the info just received. */
         oldstage = rgp->rgppkt.stage;
         // QuorumCheck now runs in a separate thread
         // if (oldstage != RGP_CLOSING) // Cannot run Quorumcheck from here.
         evaluatestageadvance();

         /* To speed things up, let us broadcast our status if our
          * stage has changed and we are willing to let others and
          * ourselves see it. */
         if ( (oldstage != rgp->rgppkt.stage) && rgp->sendstage )
            send_status_pkts = 1; /* broadcast at once to speed things up */
         break;
      } /* received an rgp packet */

      //
      // We do not support power failure notifications in NT
      //
#if defined(NT)
      /* NOTE(review): this statement sits between case labels and is
       * therefore unreachable as written (the assert never executes);
       * it documents intent only — confirm before relying on it. */
      CL_ASSERT(event != RGP_EVT_POWERFAIL);
      //
      // Fall thru to default case
      //
#else // NT
      case RGP_EVT_POWERFAIL :
      {
         /* Our node got a power up interrupt or an indication of power
          * failure from another node. */

         /* Note that this code will unconditionally abort and restart
          * the algorithm even if it was active before the power failure.
          * The new incident must be in cautious mode. */
         rgp->cautiousmode = 1;
         rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
         rgp->rgppkt.reason = RGP_EVT_POWERFAIL;
         rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);
         rgp->rgppkt.causingnode = (uint8) causingnode;

         /* rgp->pfail_state is set to a non-zero value when a pfail event
          * is reported to regroup. It is decremented at every regroup clock
          * tick till it reaches zero. While this number is non-zero, missing
          * self IamAlives are ignored and do not cause the node to halt.
          * This gives the sending hardware some time to recover from power
          * failures before self IamAlives are checked. */
         if (causingnode == EXT_NODE(rgp->mynode))
            rgp->pfail_state = RGP_PFAIL_TICKS;

         /* Store the fact that causingnode experienced a PFAIL,
          * for reporting to the message system when regroup stabilizes. */
         ClusterInsert(rgp->rgppkt.hadpowerfail, INT_NODE(causingnode));

         regroup_restart();
         send_status_pkts = 1;
         break;
      } /* power failure */
#endif // NT

      default :
      {
         RGP_ERROR(RGP_INTERNAL_ERROR);
      }
   }

   if (send_status_pkts) /* significant change - send status at once */
   {
      ClusterUnion(rgp->status_targets,
                   rgp->outerscreen, rgp->status_targets);
      rgp_broadcast(RGP_UNACK_REGROUP);
   }
}
/************************************************************************
 * rgp_check_packet
 * =================
 *
 * Description:
 *
 *    Sanity-checks an incoming regroup packet: the knownstage masks,
 *    pruning_result, quorumowner and the connectivity matrix must all
 *    be consistent with the current cluster membership.
 *
 * Parameters:
 *
 *    rgp_pkt_t* pkt -
 *       packet to be checked
 *
 * Returns:
 *
 *    0          - packet looks good
 *    1,2,3...   - code identifying the first inconsistency found
 *
 ************************************************************************/
int rgp_check_packet(rgp_pkt_t* pkt)
{
   node_t node;

   /* The stage masks must form a chain of subsets:
    *
    *    knownstage5 <= knownstage4 <= knownstage3 <=
    *    knownstage2 <= knownstage1 <= rgp->rgpinfo.cluster
    *
    * ClusterSubsetOf(big, small) returns 1 iff small equals big or is
    * contained in big; check innermost-first so the return code names
    * the deepest violated stage.
    */
   if ( !ClusterSubsetOf(pkt->knownstage4, pkt->knownstage5) )
      return 5;
   if ( !ClusterSubsetOf(pkt->knownstage3, pkt->knownstage4) )
      return 4;
   if ( !ClusterSubsetOf(pkt->knownstage2, pkt->knownstage3) )
      return 3;
   if ( !ClusterSubsetOf(pkt->knownstage1, pkt->knownstage2) )
      return 2;
   if ( !ClusterSubsetOf(rgp->rgpinfo.cluster, pkt->knownstage1) )
      return 1;

   /* The pruning result may only name nodes that reached stage 2. */
   if ( !ClusterSubsetOf(pkt->knownstage2, pkt->pruning_result) )
      return 9;

   /* The quorum owner must come from the original cluster membership. */
   if ( !ClusterSubsetOf(rgp->rgpinfo.cluster, pkt->quorumowner) )
      return 8;

   /* Connectivity matrix: a cluster member's row may only mention
    * cluster members; a non-member's row must be empty.
    */
   for (node = 0; node < MAX_CLUSTER_SIZE; ++node)
   {
      if ( ClusterMember(rgp->rgpinfo.cluster, node) )
      {
         if ( !ClusterSubsetOf(rgp->rgpinfo.cluster,
                               pkt->connectivity_matrix[node]) )
            return 10;
      }
      else if ( !ClusterEmpty(pkt->connectivity_matrix[node]) )
      {
         return 11;
      }
   }

   return 0;
}
/************************************************************************
 * rgp_print_packet
 * =================
 *
 * Description:
 *
 *    Emits the significant fields of a regroup packet (sequence number,
 *    stage, activating/causing nodes, stage masks, pruning result and
 *    the full connectivity matrix) through the RGP_TRACE facility.
 *
 * Parameters:
 *
 *    rgp_pkt_t* pkt -
 *       packet to be printed
 *    char* label -
 *       label to be printed together with a packet
 *    int code -
 *       a number to be printed together with a packet
 *
 * Returns:
 *
 *    VOID
 *
 ************************************************************************/
void rgp_print_packet(rgp_pkt_t* pkt, char* label, int code)
{
   /* NOTE(review): the original declared seven locals (pktsubtype, stage,
    * reason, seqno, activatingnode, causingnode, quorumowner) that were
    * never referenced; they have been removed. */

   /* stage/activatingnode/causingnode are packed into one 32-bit word. */
   RGP_TRACE( label,
              pkt->seqno,                                         /* TRACE */
              code,
              (pkt->stage << 16) |
              (pkt->activatingnode << 8) |
              (pkt->causingnode),                                 /* TRACE */
              RGP_MERGE_TO_32( rgp->outerscreen,
                               rgp->innerscreen ) );

   RGP_TRACE( "RGP CHK masks ",
              RGP_MERGE_TO_32( rgp->rgpinfo.cluster,              /* TRACE */
                               pkt->quorumowner ),                /* TRACE */
              RGP_MERGE_TO_32( pkt->knownstage1,                  /* TRACE */
                               pkt->knownstage2 ),                /* TRACE */
              RGP_MERGE_TO_32( pkt->knownstage3,                  /* TRACE */
                               pkt->knownstage4 ),                /* TRACE */
              RGP_MERGE_TO_32( pkt->knownstage5,                  /* TRACE */
                               pkt->pruning_result ) );           /* TRACE */

   /* Connectivity matrix rows 0..7 ... */
   RGP_TRACE( "RGP CHK Con. matrix1",
              RGP_MERGE_TO_32( pkt->connectivity_matrix[0],       /*TRACE*/
                               pkt->connectivity_matrix[1] ),     /*TRACE*/
              RGP_MERGE_TO_32( pkt->connectivity_matrix[2],       /*TRACE*/
                               pkt->connectivity_matrix[3] ),     /*TRACE*/
              RGP_MERGE_TO_32( pkt->connectivity_matrix[4],       /*TRACE*/
                               pkt->connectivity_matrix[5] ),     /*TRACE*/
              RGP_MERGE_TO_32( pkt->connectivity_matrix[6],       /*TRACE*/
                               pkt->connectivity_matrix[7]));     /*TRACE*/

   /* ... and rows 8..15. */
   RGP_TRACE( "RGP CHK Con. matrix2",
              RGP_MERGE_TO_32( pkt->connectivity_matrix[8],       /*TRACE*/
                               pkt->connectivity_matrix[9] ),     /*TRACE*/
              RGP_MERGE_TO_32( pkt->connectivity_matrix[10],      /*TRACE*/
                               pkt->connectivity_matrix[11]),     /*TRACE*/
              RGP_MERGE_TO_32( pkt->connectivity_matrix[12],      /*TRACE*/
                               pkt->connectivity_matrix[13]),     /*TRACE*/
              RGP_MERGE_TO_32( pkt->connectivity_matrix[14],      /*TRACE*/
                               pkt->connectivity_matrix[15]));    /*TRACE*/
}
/************************************************************************
 * UnpackIgnoreScreen
 * =================
 *
 * Description:
 *
 *    Extracts ignorescreen out of regroup packet
 *
 * Parameters:
 *
 *    rgp_pkt_t* from -
 *       source packet
 *    cluster_t to -
 *       target node set
 *
 * Returns:
 *
 *    VOID
 *
 * Comments:
 *
 *    If the packet is received from NT4 node, unpacked ignorescreen
 *    will always be 0.
 *
 ************************************************************************/
void UnpackIgnoreScreen(rgp_pkt_t* from, cluster_t to)
{
#pragma warning( push )
#pragma warning( disable : 4244 )   /* narrowing conversions are intentional */
   if (from->reason >= RGP_EVT_IGNORE_MASK)
   {
      /* The screen is packed into the high byte of reason plus
       * the causingnode field (see PackIgnoreScreen). */
      to[0] = ((uint16)from->reason) >> 8;
      to[1] = (uint8)from->causingnode;
   }
   else
   {
      /* No ignorescreen encoded (e.g. packet from an NT4 node). */
      ClusterInit(to);
   }
#pragma warning( pop )
}
/************************************************************************
 * PackIgnoreScreen
 * =================
 *
 * Description:
 *
 *    Put an ignorescreen back into a regroup packet
 *    (inverse of UnpackIgnoreScreen).
 *
 * Parameters:
 *
 *    rgp_pkt_t* to -
 *       packet to be updated
 *    cluster_t from -
 *       source node set
 *
 * Returns:
 *
 *    VOID
 *
 ************************************************************************/
void PackIgnoreScreen(rgp_pkt_t* to, cluster_t from)
{
   if ( !ClusterEmpty(from) )
   {
      /* Fold the first screen byte into the high byte of reason and
       * carry the second byte in causingnode. */
      to->reason = (uint8)RGP_EVT_IGNORE_MASK | (from[0] << 8);
      to->causingnode = from[1];
   }
   else
   {
      /* Empty screen: strip any packed bits from reason. */
      to->reason &= 255;
      to->causingnode = 0;
   }
}
/*---------------------------------------------------------------------------*/
#ifdef __cplusplus
} #endif /* __cplusplus */
#if 0
History of changes to this file: ------------------------------------------------------------------------- 1995, December 13 F40:KSK0610 /*F40:KSK06102.2*/
This file is part of the portable Regroup Module used in the NonStop Kernel (NSK) and Loosely Coupled UNIX (LCU) operating systems. There are 10 files in the module - jrgp.h, jrgpos.h, wrgp.h, wrgpos.h, srgpif.c, srgpos.c, srgpsm.c, srgputl.c, srgpcli.c and srgpsvr.c. The last two are simulation files to test the Regroup Module on a UNIX workstation in user mode with processes simulating processor nodes and UDP datagrams used to send unacknowledged datagrams.
This file was first submitted for release into NSK on 12/13/95. ------------------------------------------------------------------------------ This change occurred on 19 Jan 1996 /*F40:MB06458.1*/ Changes for phase IV Sierra message system release. Includes: /*F40:MB06458.2*/ - Some cleanup of the code /*F40:MB06458.3*/ - Increment KCCB counters to count the number of setup messages and /*F40:MB06458.4*/ unsequenced messages sent. /*F40:MB06458.5*/ - Fixed some bugs /*F40:MB06458.6*/ - Disable interrupts before allocating broadcast sibs. /*F40:MB06458.7*/ - Change per-packet-timeout to 5ms /*F40:MB06458.8*/ - Make the regroup and powerfail broadcast use highest priority /*F40:MB06458.9*/ tnet services queue. /*F40:MB06458.10*/ - Call the millicode backdoor to get the processor status from SP /*F40:MB06458.11*/ - Fixed expand bug in msg_listen_ and msg_readctrl_ /*F40:MB06458.12*/ - Added enhancement to msngr_sendmsg_ so that clients do not need /*F40:MB06458.13*/ to be unstoppable before calling this routine. /*F40:MB06458.14*/ - Added new steps in the build file called /*F40:MB06458.15*/ MSGSYS_C - compiles all the message system C files /*F40:MB06458.16*/ MSDRIVER - compiles all the MSDriver files /*F40:MB06458.17*/ REGROUP - compiles all the regroup files /*F40:MB06458.18*/ - remove #pragma env libspace because we set it as a command line /*F40:MB06458.19*/ parameter. /*F40:MB06458.20*/ ----------------------------------------------------------------------- /*F40:MB06458.21*/
#endif /* 0 - change descriptions */
|