windows-server-2003/base/cluster/service/mm/srgpsm.c


								#ifdef __TANDEM

								#pragma columns 79

								#pragma page "srgpsm.c - T9050 - Regroup Module state machine routines"

								#endif


								/* @@@ START COPYRIGHT @@@

								**  Tandem Confidential:  Need to Know only

								**  Copyright (c) 1995, Tandem Computers Incorporated

								**  Protected as an unpublished work.

								**  All Rights Reserved.

								**

								**  The computer program listings, specifications, and documentation

								**  herein are the property of Tandem Computers Incorporated and shall

								**  not be reproduced, copied, disclosed, or used in whole or in part

								**  for any reason without the prior express written permission of

								**  Tandem Computers Incorporated.

								**

								** @@@ END COPYRIGHT @@@

								**/


								/*---------------------------------------------------------------------------

								 * This file (srgpsm.c) contains regroup state machine routines.

								 *---------------------------------------------------------------------------*/


								#ifdef __cplusplus

								   extern "C" {

								#endif /* __cplusplus */


								#include <wrgp.h>


								/*---------- arbitration algorithm ------------ */


								// Quorum arbitration timeout in seconds. In this part of the file it is
								// only consumed through RGP_ARBITRATION_TIMEOUT below.
								DWORD MmQuorumArbitrationTimeout   = CLUSTER_QUORUM_DEFAULT_ARBITRATION_TIMEOUT; // seconds

								// Minimum time, in seconds, that a successful arbitration is stretched to:
								// DiskArbitrationThread sleeps for the remainder when it wins faster
								// than this (see AVERAGE_ARBITRATION_TIME_IN_SECONDS).
								DWORD MmQuorumArbitrationEqualizer = 7;  // seconds

								// The arbitration timeout converted from seconds to regroup ticks:
								// seconds * 1000 ms / 300 ms per tick == seconds * 100 / 30.
								#define RGP_ARBITRATION_TIMEOUT             ((MmQuorumArbitrationTimeout * 100)/30) // tick == 300ms

								// Read-only alias for the equalizer value above.
								#define AVERAGE_ARBITRATION_TIME_IN_SECONDS (MmQuorumArbitrationEqualizer)

								// Forward declarations for routines that the arbitration code below
								// calls before their definitions appear.
								// NOTE(review): regroup_restart is declared here without `static` but is
								// defined later in this file as `_priv _resident static`; confirm the
								// build tolerates (or intends) that linkage mismatch.
								void enter_first_cleanup_stage();
								void regroup_restart();
								int ClusterEmpty(cluster_t c);

								// Thread routine that regroup_start_arbitrate hands to CreateThread;
								// declared here because its definition follows that function.
								DWORD
								DiskArbitrationThread(
								    IN LPVOID param
								    ) ;


								/*
								 * regroup_test_arbitrate_advance
								 *
								 * Tests whether the arbitration stage can be left.
								 *
								 * Returns:
								 *    1 if the pruning result has as many members as the original
								 *      cluster (nothing to arbitrate), or if at least one member of
								 *      the pruning result is already recorded in knownstage4;
								 *    0 otherwise.
								 */
								_priv _resident static int
								regroup_test_arbitrate_advance()
								{
								   cluster_t stage4_members;
								   int full_count   = ClusterNumMembers(rgp->rgpinfo.cluster);
								   int pruned_count = ClusterNumMembers(rgp->rgppkt.pruning_result);

								   if (full_count != pruned_count) {
								      /* Somebody from our pruned group reaching stage 4 means our
								       * group owns the quorum.
								       */
								      ClusterIntersection(stage4_members,
								                          rgp->rgppkt.knownstage4,
								                          rgp->rgppkt.pruning_result);

								      return ClusterNumMembers(stage4_members) != 0;
								   }

								   /* Nobody was pruned. */
								   return 1;
								}


								/*
								 * regroup_start_arbitrate
								 *
								 * Starts quorum arbitration for the current regroup incident, unless
								 * it is not needed or has already been started.
								 *
								 * Returns:
								 *    0 (FALSE) if no arbitration is needed (the cleanup stage has been
								 *      entered directly) or if the arbitration thread could not be
								 *      created;
								 *    1 (TRUE) if the caller should stay in the arbitration stage:
								 *      arbitration was already started, another node is the arbitrator,
								 *      or the local arbitration thread has just been launched.
								 */
								_priv _resident static int
								regroup_start_arbitrate()
								{
								   int       full_count   = ClusterNumMembers(rgp->rgpinfo.cluster);
								   int       pruned_count = ClusterNumMembers(rgp->rgppkt.pruning_result);
								   cluster_t owners;
								   int       owner_count;
								   node_t    chosen;
								   HANDLE    worker;
								   DWORD     worker_id;
								   ULONG     start_epoch;
								   int       already_started;

								   if (full_count == pruned_count) {
								      /* Nobody was pruned: go straight to the cleanup stage. */
								      enter_first_cleanup_stage();
								      return 0;
								   }

								   /* Capture the epoch and claim the "started" flag atomically. */
								   RGP_LOCK;

								   start_epoch     = rgp->OS_specific_control.EventEpoch;
								   already_started = (rgp->arbitration_started != 0);

								   if (!already_started) {
								      rgp->arbitration_ticks   = 0;
								      rgp->arbitration_started = 1;
								   }

								   RGP_UNLOCK;

								   if (already_started) {
								      return 1; /* stay in this stage for a while */
								   }

								   /* Quorum owners that survived pruning are the candidates. */
								   ClusterIntersection(owners,
								                       rgp->rgppkt.pruning_result,
								                       rgp->rgppkt.quorumowner);

								   owner_count = ClusterNumMembers(owners);

								   if (owner_count != 0) {
								      /* Pick among the quorum owners in our group. */
								      chosen = rgp_select_tiebreaker(owners);

								      if (owner_count > 1) {
								         RGP_TRACE( "RGP !!! More than one quorum owner",
								                    EXT_NODE(chosen),
								                    GetCluster( rgp->rgpinfo.cluster ),
								                    GetCluster( rgp->rgppkt.pruning_result ),
								                    GetCluster( rgp->rgppkt.knownstage2 ) );
								         /* The other quorum owners are deliberately NOT poisoned. */
								      }
								   } else {
								      /* No quorum owner in this group: pick among all survivors. */
								      chosen = rgp_select_tiebreaker(rgp->rgppkt.pruning_result);
								   }

								   rgp->tiebreaker = chosen;

								   RGP_TRACE( "RGP Arbitration Delegated to",
								              EXT_NODE(chosen),
								              GetCluster( rgp->rgpinfo.cluster ),
								              GetCluster( rgp->rgppkt.pruning_result ),
								              GetCluster( rgp->rgppkt.knownstage2 ) );

								   /* Bug #460991: regroup_restart on stage 4 or later resets
								    * ArbitratingNode; if all nodes are present after that restart,
								    * ApproxArbitrationWinner would not be set properly, so it is
								    * assigned here together with ArbitratingNode.
								    */
								   rgp->OS_specific_control.ApproxArbitrationWinner =
								   rgp->OS_specific_control.ArbitratingNode = (DWORD)EXT_NODE(chosen);

								   if (chosen != rgp->mynode) {
								      /* Somebody else arbitrates; we just wait. */
								      return 1;
								   }

								   /* We are the arbitrator: run the arbitration on its own thread and
								    * hand it the epoch so it can detect a regroup restart.
								    */
								   worker = CreateThread( NULL,                 /* security attributes */
								                          0,                    /* default stack size  */
								                          DiskArbitrationThread,
								                          ULongToPtr(start_epoch),
								                          0,                    /* runs immediately    */
								                          &worker_id );

								   if (worker != NULL) {
								      /* The thread keeps running; only our handle is released. */
								      CloseHandle(worker);
								      return TRUE;
								   }

								   /* Thread creation failed: force the others to regroup without us. */
								   RGP_LOCK;

								   rgp_event_handler( RGP_EVT_BANISH_NODE, EXT_NODE(rgp->mynode) );

								   RGP_UNLOCK;

								   /* Kill this node. */
								   RGP_ERROR(RGP_ARBITRATION_FAILED);

								   return FALSE;
								}


								/*
								 * DiskArbitrationThread
								 *
								 * Worker thread started by regroup_start_arbitrate on the arbitrating
								 * node. It calls the quorum callback and, depending on the outcome,
								 * either moves regroup into the first cleanup stage or banishes and
								 * halts this node.
								 *
								 * Parameters:
								 *    param - the EventEpoch value captured when the thread was
								 *            requested (packed with ULongToPtr). Whenever the current
								 *            epoch no longer matches, the thread exits without
								 *            touching the regroup state machine.
								 *
								 * Returns:
								 *    Always 0.
								 */
								DWORD
								DiskArbitrationThread(
								    IN LPVOID param
								    )
								{
								   cluster_t current_participants;
								   DWORD     status;
								   int       delay;
								   ULONG_PTR startingEpoch = (ULONG_PTR) param;
								   BOOL      EpochsEqual;
								   int       orig_numnodes;
								   int       current_numnodes;
								   LONGLONG  Time1, Time2;

								   //
								   // Snapshot the membership under the same lock as the epoch check, so
								   // that the counts used for the delay belong to the epoch that is
								   // being validated.
								   //
								   RGP_LOCK;

								   ClusterCopy(current_participants, rgp->rgppkt.pruning_result);
								   orig_numnodes = ClusterNumMembers(rgp->rgpinfo.cluster);
								   current_numnodes = ClusterNumMembers(current_participants);

								   EpochsEqual = ( startingEpoch == rgp->OS_specific_control.EventEpoch );

								   RGP_UNLOCK;

								   if(!EpochsEqual)
								      return 0;

								   //
								   // The further the group is below half (rounded up) of the original
								   // membership, the longer it waits before arbitrating: 6 seconds per
								   // missing node.
								   //
								   delay = (orig_numnodes+1)/2 - current_numnodes;

								   if(delay < 0) delay = 0;

								   Sleep(delay * 6000);

								   //
								   // Re-validate the epoch after sleeping and, if it still matches,
								   // mark an arbitration as being in progress.
								   //
								   RGP_LOCK;

								   EpochsEqual = ( startingEpoch == rgp->OS_specific_control.EventEpoch );
								   if (EpochsEqual) {
								      rgp->OS_specific_control.ArbitrationInProgress += 1;
								   }

								   RGP_UNLOCK;

								   if(!EpochsEqual)
								      return 0;

								   // Time the quorum callback; FILETIME values are in 100ns units.
								   GetSystemTimeAsFileTime((LPFILETIME)&Time1);
								   status = (*(rgp->OS_specific_control.QuorumCallback))();
								   GetSystemTimeAsFileTime((LPFILETIME)&Time2);

								   if (status != 0
								    && startingEpoch == rgp->OS_specific_control.EventEpoch)
								   {
								       // If we won the arbitration and we are in the same epoch (approx
								       // check, done without the lock) we need to figure out whether we
								       // need to slow down a little

								       Time2 -= Time1;

								       // Convert 100ns units to seconds

								       Time2 = Time2 / 10 / 1000 / 1000;
								       //
								       // [HACKHACK] GorN Oct/30/1999
								       //   We had a weird timejump in the middle of the arbitration
								       //   Arbitration was completed before it started, we slept for
								       //   too long and regroup timed us out. Let's guard against it
								       //   by ignoring negative elapsed times.
								       //
								       if ( (Time2 >= 0)
								         && (Time2 < AVERAGE_ARBITRATION_TIME_IN_SECONDS) )
								       {

								          //
								          // Don't need to be better than the average
								          // If we are so fast, let's slow down
								          //

								          Time2 = AVERAGE_ARBITRATION_TIME_IN_SECONDS - Time2;

								          RGP_TRACE( "RGP sleeping",
								                  (ULONG)Time2,  /* TRACE */
								                  0,      /* TRACE */
								                  0,      /* TRACE */
								                  0 );    /* TRACE */
								          Sleep( (ULONG)(Time2 * 1000) );
								       }
								   }

								   RGP_LOCK;

								   rgp->OS_specific_control.ArbitrationInProgress -= 1;

								   EpochsEqual = ( startingEpoch == rgp->OS_specific_control.EventEpoch );

								   if(!EpochsEqual) {
								      // Regroup restarted while we were arbitrating; the result is stale.
								      RGP_UNLOCK;
								      return 0;
								   }

								   if(status) {
								      //
								      // We own the quorum device
								      // Let's proceed to the next stage
								      //
								      enter_first_cleanup_stage();
								      RGP_UNLOCK;
								      //
								      // All the rest will see that we are in cleanup stage and
								      // will proceed to it too
								      //
								   } else {
								      //
								      // Force Others to regroup //
								      //
								      rgp_event_handler( RGP_EVT_BANISH_NODE, EXT_NODE(rgp->mynode) );
								      RGP_UNLOCK;

								      //
								      // Kill this node
								      //
								      RGP_ERROR(RGP_ARBITRATION_FAILED);
								   }

								   return 0;
								}


								/************************************************************************

								 * rgp_check_packet

								 * rgp_print_packet

								 * =================

								 *

								 * Description:

								 *

								 *    Forward declarations of functions used in rgp_sanity_check macro

								 *

								 ************************************************************************/

								/* Prints packet `pkt` together with `label` and the check code `code`. */
								void rgp_print_packet(rgp_pkt_t* pkt, char* label, int code);

								/* Checks packet `pkt`; rgp_sanity_check treats a nonzero result as
								 * "unreasonable packet" and passes it on as the code to print.
								 */
								int  rgp_check_packet(rgp_pkt_t* pkt);


								/************************************************************************

								 * rgp_sanity_check

								 * =================

								 *

								 * Description:

								 *

								 *   This macro prints RGP packet if it has unreasonable values in

								 *   powerfail, knownstages, pruning_result, and connectivity_matrix fields.

								 *

								 * Parameters:

								 *

								 *    rgp_pkt_t* pkt -

								 *       packet to be checked

								 *    char* label -

								 *       label that will be printed together with a packet

								 *

								 * Returns:

								 *

								 *    VOID

								 *

								 ************************************************************************/


								/*
								 * rgp_sanity_check(pkt, label)
								 *
								 * Runs rgp_check_packet on `pkt` and, when it returns a nonzero code,
								 * prints the packet with `label` and that code via rgp_print_packet.
								 *
								 * The packet argument is evaluated exactly once, so expressions with
								 * side effects are safe to pass. `label` is evaluated only when the
								 * packet is printed. The do { } while (0) wrapper makes the macro
								 * usable as a single statement.
								 */
								#define rgp_sanity_check(pkt_, label_)                          \
								do {                                                            \
								  rgp_pkt_t* rgp_sc_pkt_  = (pkt_);                             \
								  int        rgp_sc_code_ = rgp_check_packet(rgp_sc_pkt_);      \
								  if( rgp_sc_code_ ) {                                          \
								    rgp_print_packet(rgp_sc_pkt_, (label_), rgp_sc_code_);      \
								  }                                                             \
								} while ( 0 )


								/*---------------------------------------------------------------------------*/


								/************************************************************************

								 * split_brain_avoidance_algorithm

								 * ===============================

								 *

								 * Description:

								 *

								 *    This algorithm ensures that, after a regroup incident completes,

								 *    at most one group of nodes will survive regardless of connectivity

								 *    failures.

								 *

								 * Parameters:

								 *

								 *    None

								 *

								 * Returns:

								 *

								 *    void - no return value; The algorithm results in either this node

								 *    halting (with the RGP_AVOID_SPLIT_BRAIN halt code) or this group

								 *    being the only group that survives.

								 *

								 * Algorithm:

								 *

								 *    The algorithm is described in detail in the Sierra Tech Memo S.84,

								 *    "Modifications in Regroup Algorithm for Sierra".

								 *

								 *    The algorithm looks at the set of nodes currently visible from the

								 *    local cluster and compares it to the set of nodes alive before

								 *    the regroup incident started (outerscreen). The decision to survive

								 *    or halt depends on the number of nodes in the current group compared

								 *    to the number of nodes in the original group.

								 *

								 *    Case 1:

								 *       If the current group contains > half the original number, this

								 *       group survives.

								 *

								 *    Case 2:

								 *       If the current group contains < half the original number, this

								 *       node (and group) halts.

								 *

								 *    Case 3:

								 *       If the current group contains exactly half the original number AND

								 *       the current group has at least two members, then this group

								 *       survives if and only if it contains the tie-breaker node (selected

								 *       when the cluster is formed and after each regroup incident).

								 *

								 *    Case 4:

								 *       If the current group contains exactly half the original number AND

								 *       the current group has exactly one member, then we will call the

								 *       QuorumCallback procedure to check if the Quorum Disk is accessible

								 *       from this node. If the procedure returns a nonzero value (TRUE)

								 *       we survive; else we halt.

								 *

								 *

								 ************************************************************************/

								_priv _resident static void
								split_brain_avoidance_algorithm()
								{
								   int before_count;   /* members before the regroup incident */
								   int stage2_count;   /* members recorded in knownstage2     */
								   int doubled;        /* 2 * stage2_count                    */

								   RGP_TRACE( "RGP SpltBrainAlg",
								              EXT_NODE(rgp->tiebreaker),
								              GetCluster( rgp->rgpinfo.cluster ),
								              GetCluster( rgp->outerscreen ),
								              GetCluster( rgp->rgppkt.knownstage2 ) );

								   /* Sanity check: the current set of nodes must be a subset of the
								    * original set. (That my node is in the current set was already
								    * checked when stage 2 was entered.)
								    */
								   if (!ClusterSubsetOf(rgp->rgpinfo.cluster, rgp->rgppkt.knownstage2)) {
								      RGP_ERROR(RGP_INTERNAL_ERROR);
								   }

								   before_count = ClusterNumMembers(rgp->rgpinfo.cluster);
								   stage2_count = ClusterNumMembers(rgp->rgppkt.knownstage2);

								   /* Everybody is alive: a split brain is impossible. */
								   if (before_count == stage2_count) {
								      return;
								   }

								   /* Special 2-node case: access to the quorum disk decides. */
								   if (before_count == 2) {
								      if (!(*(rgp->OS_specific_control.QuorumCallback))()) {
								#if defined( NT )
								         ClusnetHalt( NmClusnetHandle );
								#endif
								         RGP_ERROR(RGP_AVOID_SPLIT_BRAIN);
								      }
								      return;
								   }

								   /* Multi (>2) node case. */
								   doubled = stage2_count << 1;

								   /* More than half of the nodes: we are the majority and survive;
								    * the other group(s) will kill themselves.
								    */
								   if (doubled > before_count) {
								      return;
								   }

								   /* Less than half: a larger group may be alive, so we halt.
								    * Exactly half: we survive only if we contain the tie-breaker.
								    */
								   if ( (doubled < before_count) ||
								        !ClusterMember(rgp->rgppkt.knownstage2, rgp->tiebreaker) ) {
								      RGP_ERROR(RGP_AVOID_SPLIT_BRAIN);
								   }
								}


								/************************************************************************

								 * regroup_restart

								 * ===============

								 *

								 * Description:

								 *

								 *    Starts a new regroup incident.

								 *

								 * Parameters:

								 *

								 *    None

								 *

								 * Returns:

								 *

								 *    void - no return value

								 *

								 * Algorithm:

								 *

								 *    Sets the regroup state to RGP_ACTIVATED, pauses all IO and

								 *    initializes the stage masks and connectivity matrix.

								 *

								 ************************************************************************/

								_priv _resident static void
								regroup_restart()
								{
								   /* Ignore screen as currently packed in the regroup packet; compared
								    * against rgp->ignorescreen further down to detect a change.
								    */
								   cluster_t old_ignorescreen;
								   UnpackIgnoreScreen(&rgp->rgppkt, old_ignorescreen);

								   RGP_TRACE( "RGP (re)starting",
								              rgp->rgppkt.seqno,                               /* TRACE */
								              rgp->rgppkt.reason,                              /* TRACE */
								              rgp->rgppkt.activatingnode,                      /* TRACE */
								              rgp->rgppkt.causingnode );                       /* TRACE */

								   RGP_TRACE( "RGP masks       ",
								              RGP_MERGE_TO_32( rgp->outerscreen,               /* TRACE */
								                               rgp->innerscreen ),             /* TRACE */
								              RGP_MERGE_TO_32( rgp->rgppkt.knownstage1,        /* TRACE */
								                               rgp->rgppkt.knownstage2 ),      /* TRACE */
								              RGP_MERGE_TO_32( rgp->rgppkt.knownstage3,        /* TRACE */
								                               rgp->rgppkt.knownstage4 ),      /* TRACE */
								              RGP_MERGE_TO_32( rgp->rgppkt.knownstage5,        /* TRACE */
								                               rgp->rgppkt.pruning_result ) ); /* TRACE */

								   /* We are about to start a new pass of the regroup algorithm.
								    * This does not necessarily mean we have finished the previous
								    * pass; i.e., in an abort situation we may be starting over.
								    * This may occur when some other node fails during the current
								    * pass through the algorithm leaving us hung up at one of the
								    * intermediate stages.
								    */

								   //
								   // GN. When we do MM_LEAVE, our state is COLDLOADED.
								   //  Bailing out of regroup_restart here would prevent us from
								   //  forming a regroup packet that would initiate a banishing regroup
								   //  incident. The COLDLOADED check is therefore done at the very end
								   //  of this function.
								   //

								   /* To avoid split brained nodes from corrupting data in storage
								    * devices, we request the transport subsystem to hold all IO requests
								    * in a queue and not transfer them over SNet. We will allow IO to
								    * be resumed when regroup can guarantee that there can no longer be
								    * split brains. This will be done when the final group is determined
								    * and regroup enters the RGP_PHASE1_CLEANUP stage.
								    */

								   rgp_hold_all_io();

								   /* The following is a bit of history from the NSK regroup algorithm from
								    * pre-Sierra systems based on the InterProcessor Bus (IPB). Some of
								    * the particulars mentioned here have changed, but the principle remains.
								    *
								    * Previously, we used to mark all the known stages as zero, except for
								    * stage1. We used to mark only ourselves as in stage1. So, even if our
								    * bus reception logic is screwed up, and we are not receiving packets
								    * from anybody including ourselves, we would mark ourselves as being in
								    * stage1. And after (what used to be) six ticks, we would proceed into
								    * stage2 and mark ourselves as being in stage2. This would cause stage1
								    * and stage2 to be equal, and our world would constitute just
								    * ourselves. Thus we would go through regroup eliminating everybody
								    * else. However, since we are not receiving packets from anybody else,
								    * we would miss our own iamalive packets, and we too will soon die of
								    * %4032. Thus the symptoms would constitute everybody else dying of
								    * (%4040 + some node number), and that node dying with a %4032 halt.
								    * See TPR S 88070112309628 for more details.
								    *
								    * To avoid this situation, we now do not mark ourselves as in a
								    * particular stage until we get our own regroup packets indicating we
								    * are in that stage. Thus, in regroup_restart, all the stages are
								    * cleared. Previously, regroupbroadcaststatus in sendqueuedmessages
								    * used to send directly from the regroup_control structures.
								    * regroupbroadcaststatus has been modified to construct the unsequenced
								    * packets on its stack. It would first copy the state from the
								    * regroup_control structure, and then would LOR in our node into a known
								    * stage, if requested to do so. When we receive that packet, we would
								    * merge that information into our state, and thus we would be
								    * guaranteed that our bus sending and reception logic is working, and
								    * that we can legitimately mark ourselves as being in that stage. This
								    * whole change avoids problems where bus sending logic works, but bus
								    * reception logic is screwed up for both buses in a node.
								    */

								   rgp->sendstage = 0; /* Don't let anyone know I am in stage 1 until
								                        * I have seen a regroup clock tick; this is to
								                        * cause this node to halt if it is not getting
								                        * clock ticks. I will halt when the other nodes
								                        * advance without me and send me a status packet
								                        * indicating this or send me a poison packet
								                        * after declaring me down.
								                        */

								   /* Reset the tick counter and every per-incident mask and matrix. */
								   rgp->rgpcounter = 0;
								   ClusterInit(rgp->rgppkt.knownstage1);
								   ClusterInit(rgp->rgppkt.knownstage2);
								   ClusterInit(rgp->rgppkt.knownstage3);
								   ClusterInit(rgp->rgppkt.knownstage4);
								   ClusterInit(rgp->rgppkt.knownstage5);
								   ClusterInit(rgp->rgppkt.pruning_result);

								   MatrixInit(rgp->rgppkt.connectivity_matrix);
								   MatrixInit(rgp->internal_connectivity_matrix);

								   /* Just for ease of debugging, to send in our poison packets, we keep
								    * the known nodes mask at the start of regroup. poison packets contain
								    * known nodes at the beginning of regroup and at the end of it.
								    */

								   ClusterCopy(rgp->initnodes, rgp->rgpinfo.cluster);
								   ClusterInit(rgp->endnodes);

								#if defined( NT )
								   //
								   // increment the event epoch so we can detect stale events
								   // from clusnet. DiskArbitrationThread also compares against this
								   // value to notice that regroup restarted underneath it.
								   //
								   ++rgp->OS_specific_control.EventEpoch;
								#endif

								   /* rgppkt.stage still holds the stage of the interrupted pass here;
								    * it is only reset at the end of this function.
								    *
								    * NOTE(review): knownstage1 and knownstage2 were both reset by
								    * ClusterInit above, so this ClusterCompare sees two freshly
								    * initialized masks. If ClusterCompare reports equality (as its use
								    * on the ignore screens below suggests), this term is always true
								    * and only the stage range gates the branch. Confirm whether the
								    * comparison was meant to use the pre-reset masks.
								    */
								   if ( (rgp->rgppkt.stage >= RGP_CLOSING) &&
								        (rgp->rgppkt.stage <= RGP_PHASE2_CLEANUP) &&
								        ClusterCompare(rgp->rgppkt.knownstage1,
								                       rgp->rgppkt.knownstage2) )
								   {
								       //
								       // If we were interrupted by this restart after we closed
								       // 1st stage regroup window, then no nodes can be added to group w/o joining.
								       //
								       // Thus we will add missing nodes (members of the cluster that
								       // are not in innerscreen) into our ignorescreen.
								       // This will force the regroup not to wait for them in stage1
								       cluster_t tmp;

								       ClusterDifference(tmp, rgp->rgpinfo.cluster, rgp->innerscreen);
								       ClusterUnion(rgp->ignorescreen, rgp->ignorescreen, tmp);
								   }

								   if ( ClusterMember(rgp->ignorescreen, rgp->mynode) ) {
								       // We shouldn't have gotten here, but since we are here
								       // let's shield ourselves from the outside world: ignore every
								       // cluster member except this node.
								       RGP_TRACE( "Self Isolation", 0, 0, 0, 0 );
								       ClusterCopy(rgp->ignorescreen, rgp->rgpinfo.cluster);
								       ClusterDelete(rgp->ignorescreen, rgp->mynode);
								   }

								   if ( !ClusterEmpty(rgp->ignorescreen) ) {
								       // if we are ignoring somebody we have
								       // to be cautious. I.e. we will stay longer in the
								       // first stage to give a chance to everybody to learn about
								       // our ignorescreen
								       rgp->cautiousmode = 1;
								   }

								   if ( !ClusterCompare(old_ignorescreen, rgp->ignorescreen) ) {
								       // Ignore screen is changed, reset restart counter //
								       RGP_TRACE( "Ignorescreen->", GetCluster(old_ignorescreen), GetCluster(rgp->ignorescreen), 0, 0 );
								       rgp->restartcount = 0;
								   }
								   // Publish the (possibly updated) ignore screen in the regroup packet.
								   PackIgnoreScreen(&rgp->rgppkt, rgp->ignorescreen);

								   // Allow regroup_start_arbitrate to start a fresh arbitration.
								   rgp->arbitration_started = 0;

								   rgp->OS_specific_control.ArbitratingNode = MM_INVALID_NODE;
								   if ( !rgp_is_perturbed() ) {
								       ResetEvent( rgp->OS_specific_control.Stabilized );
								   }

								   // Advertise this node as quorum owner only if QuorumOwner names it.
								   ClusterInit(rgp->rgppkt.quorumowner);
								   if( QuorumOwner == (DWORD)EXT_NODE(rgp->mynode) ) {
								      ClusterInsert(rgp->rgppkt.quorumowner, rgp->mynode);
								   }

								   if (rgp->rgppkt.stage == RGP_COLDLOADED)
								   {
								       if (!rgp->OS_specific_control.ShuttingDown) {
								           //
								           // Currently, RGP_RELOADFAILED calls ExitProcess
								           // During clean shutdown we would like to send the regroup packet
								           // out triggering a regroup. So we don't want to die.
								           //
								           // Since we are not resetting state to RGP_ACTIVATED, this
								           // node will not be able to participate in the regroup.
								           //
								           RGP_ERROR(RGP_RELOADFAILED);
								       }
								   } else {
								       rgp->rgppkt.stage = RGP_ACTIVATED;
								   }

								}


								/************************************************************************

								 * regroup_test_stage2_advance

								 * ===========================

								 *

								 * Description:

								 *

								 *    Checks to see if we can advance to regroup stage 2.

								 *

								 * Parameters:

								 *

								 *    None

								 *

								 * Returns:

								 *

								 *    int - 1 if stage 2 can be entered and 0 if not.

								 *

								 * Algorithm:

								 *

								 *    Stage 2 can be entered if one of the following conditions is true.

								 *

								 *    (a) all nodes are present and accounted for and at least one

								 *        regroup clock tick has occurred

								 *    (b) we are not in cautious mode, all but one node are present

								 *        and accounted for, AND a minimum number of ticks

								 *        (rgp_quickdecisionlegit) have elapsed.

								 *    (c) if RGP_MUST_ENTER_STAGE2 ticks have elapsed.

								 *

								 ************************************************************************/

								_priv _resident static int

								regroup_test_stage2_advance()

								{


								   cluster_t stragglers; /* set of nodes not yet checkd in */

								   int num_stragglers;   /* # of nodes not yet checkd in   */


								   /* Stage 2 must be entered after some interval regardless of any

								    * other conditions.

								    */

								   if (rgp->rgpcounter == 0)

								      return(0);

								   if (rgp->rgpcounter >= RGP_MUST_ENTER_STAGE2)

								   {

								       RGP_TRACE( "RGP S->2cautious",

								                  rgp->rgpcounter,                         /* TRACE */

								                  rgp->cautiousmode,                       /* TRACE */

								                  GetCluster( rgp->outerscreen ),          /* TRACE */

								                  GetCluster( rgp->rgppkt.knownstage1 ) ); /* TRACE */

								      return(1);

								   }


								   /* The number of ticks is between 1 and RGP_MUST_ENTER_STAGE2.

								    * We need to examine the stage1 mask to decide if we can

								    * advance.

								    *

								    * If every node in the old configuration has checked in, I can

								    * advance at once. This is either a false alarm or caused by

								    * power failure or connectivity failures.

								    */


								   /* Compute the set of nodes from the original configuration not yet

								    * recognized.

								    */

								   ClusterDifference(stragglers, rgp->outerscreen,

								                     rgp->rgppkt.knownstage1);


								   //

								   // We shouldn't wait for the nodes we are ignoring,

								   // since we cannot get a packet from them anyway

								   //

								   ClusterDifference(stragglers, stragglers,

								                     rgp->ignorescreen);


								   if ((num_stragglers = ClusterNumMembers(stragglers)) == 0)

								   {

								      RGP_TRACE( "RGP S->2 all in ",

								                 rgp->rgpcounter,                        /* TRACE */

								                 GetCluster( rgp->outerscreen ), 0, 0 ); /* TRACE */


								      return(1);   /* all present and accounted for */

								   }


								   /* If stragglers is non-empty, perhaps I can still advance to stage 2

								    * if I am not in cautious mode (no recent power fail and not

								    * aborting and rerunning the regroup algorithm) AND all nodes but

								    * one have checked in AND some minimum number of ticks have elapsed.

								    *

								    * The minimum number of ticks is selected to be 1 greater than

								    * the LATEPOLL initiation period (allowed consecutive missed IamAlive time)

								    * since that should guarantee that, if the

								    * cluster has broken off into multiple disconnected clusters,

								    * the other clusters would have detected the missing IamAlives,

								    * started regroup and paused IO, thus preventing the possibility

								    * of data corruption caused by a split brain situation.

								    */


								   if (!(rgp->cautiousmode) &&

								       (num_stragglers == 1) &&

								           (rgp->rgpcounter > rgp->rgpinfo.Min_Stage1_ticks))

								   {

								      RGP_TRACE( "RGP S->2 1 miss ",

								                 rgp->rgpcounter,                            /* TRACE */

								                 GetCluster( rgp->outerscreen ),             /* TRACE */

								                 GetCluster( rgp->rgppkt.knownstage1 ), 0 ); /* TRACE */

								      return(1);  /* advance - all but one checked in */

								   }


								   return(0); /* sorry cannot advance yet */


								}


								/************************************************************************

								 * regroup_stage3_advance

								 * ===========================

								 *

								 * Description:

								 *

								 *    This function is called after the split brain avoidance algorithm

								 *    is run and the tie-breaker is selected in stage 2. It checks if

								 *    we can proceed to stage 3 (RGP_PRUNING) and advances to stage 3

								 *    if possible.

								 *

								 * Parameters:

								 *

								 *    None

								 *

								 * Returns:

								 *

								 *    int - 1 if the regroup stage has been advanced to RGP_PRUNING;

								 *          0 if the stage cannot be advanced yet.

								 *

								 * Algorithm:

								 *

								 *    The algorithm depends on whether we are the tie-breaker or not.

								 *

								 *    On the tie-breaker node, we first check if there are any

								 *    disconnects in the cluster. If there aren't any, there is no need

								 *    for pruning. We can then set pruning_result to knownstage2,

								 *    advance to the RGP_PRUNING stage and return 1. If there are

								 *    disconnects, we must wait a certain number of ticks to collect

								 *    connectivity info from all nodes. If the number of ticks have not

								 *    passed, return 0. If the required number of ticks have elapsed,

								 *    we must call the pruning algorithm to get the list of potential

								 *    groups. After that, the select_cluster() routine is called to

								 *    pick one from the set of possible clusters. After this is done,

								 *    pruning_result is set to the selected cluster and we return 1.

								 *

								 *    On a non-tiebreaker node, nothing is done till a stage3 packet is

								 *    received from the tie-breaker node or another node which got a

								 *    stage 3 packet. If a stage 3 packet has not been received, we

								 *    simply return 0. If a stage 3 packet is received, RGP_PRUNING

								 *    stage is entered and we return 1.

								 *

								 ************************************************************************/

								_priv _resident int
								regroup_stage3_advance()
								{
								   int numgroups;
								   int groupnum;

								   /* ---- Follower path: this node is not the tie-breaker. ----
								    * Nothing is decided locally; stage 3 is entered only once
								    * knownstage3 shows that somebody has already entered it.
								    */
								   if (rgp->tiebreaker != rgp->mynode)
								   {
								      if (ClusterNumMembers(rgp->rgppkt.knownstage3) == 0)
								         return(0);   /* no stage 3 packet seen yet */

								      rgp->rgppkt.stage = RGP_PRUNING;

								      RGP_TRACE( "RGP Got S3 pkt  ",
								                 rgp->rgpcounter,                          /* TRACE */
								                 GetCluster( rgp->rgppkt.knownstage2 ),    /* TRACE */
								                 GetCluster( rgp->rgppkt.pruning_result ), /* TRACE */
								                 GetCluster( rgp->rgppkt.knownstage3 ) );  /* TRACE */

								      return(1);
								   }

								   /* ---- Tie-breaker path, case 1: fully connected. ----
								    * With no disconnects there is nothing to prune, so every
								    * node in knownstage2 survives.
								    */
								   if (connectivity_complete(rgp->rgppkt.connectivity_matrix))
								   {
								      rgp->rgppkt.stage = RGP_PRUNING;

								      ClusterCopy(rgp->rgppkt.pruning_result,
								                  rgp->rgppkt.knownstage2);

								      RGP_TRACE( "RGP S->3 NoPrune", rgp->rgpcounter, 0, 0, 0 );

								      return(1);
								   }

								   /* ---- Tie-breaker path, case 2: disconnects present. ----
								    * Connectivity info is deemed complete only after a fixed
								    * number of ticks; until then we cannot advance.
								    */
								   if (rgp->pruning_ticks < RGP_CONNECTIVITY_TICKS)
								      return(0);

								   /* Dump the connectivity matrix (two trace records) before pruning. */
								   RGP_TRACE( "RGP Con. matrix1",
								        RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[0],   /*TRACE*/
								                         rgp->rgppkt.connectivity_matrix[1] ), /*TRACE*/
								        RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[2],   /*TRACE*/
								                         rgp->rgppkt.connectivity_matrix[3] ), /*TRACE*/
								        RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[4],   /*TRACE*/
								                         rgp->rgppkt.connectivity_matrix[5] ), /*TRACE*/
								        RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[6],   /*TRACE*/
								                         rgp->rgppkt.connectivity_matrix[7])); /*TRACE*/
								   RGP_TRACE( "RGP Con. matrix2",
								        RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[8],   /*TRACE*/
								                         rgp->rgppkt.connectivity_matrix[9] ), /*TRACE*/
								        RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[10],  /*TRACE*/
								                         rgp->rgppkt.connectivity_matrix[11]), /*TRACE*/
								        RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[12],  /*TRACE*/
								                         rgp->rgppkt.connectivity_matrix[13]), /*TRACE*/
								        RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[14],  /*TRACE*/
								                         rgp->rgppkt.connectivity_matrix[15]));/*TRACE*/

								   /* Enumerate the candidate fully connected groups. */
								   numgroups = find_all_fully_connected_groups(
								                  rgp->rgppkt.connectivity_matrix,
								                  rgp->tiebreaker,
								                  rgp->potential_groups);

								   if ((void *)rgp->select_cluster == RGP_NULL_PTR)
								   {
								      /* No callback specified; use regroup's own routine. The key
								       * node is chosen among the stage 2 nodes that are also marked
								       * in quorumowner, or RGP_NULL_NODE if there is no such node.
								       */
								      node_t    keynode;
								      cluster_t quorum_members;

								      ClusterIntersection(
								          quorum_members,
								          rgp->rgppkt.knownstage2,
								          rgp->rgppkt.quorumowner
								          );

								      keynode = ClusterEmpty(quorum_members)
								                   ? RGP_NULL_NODE
								                   : rgp_select_tiebreaker(quorum_members);

								      RGP_TRACE( "RGP keynode ng  ", keynode, numgroups, 0, 0); /*TRACE*/

								      groupnum = rgp_select_cluster_ex(
								                     rgp->potential_groups, numgroups, keynode);
								   }
								   else
								   {
								      /* Call routine specified at rgp_start() time. */
								      groupnum = (*(rgp->select_cluster))(
								                     rgp->potential_groups, numgroups);
								   }

								   if (groupnum < 0)
								   {
								      /* No group can survive. Can't halt yet; the empty result
								       * still has to be communicated to everyone else.
								       */
								      ClusterInit(rgp->rgppkt.pruning_result);
								   }
								   else
								   {
								      ClusterCopy(rgp->rgppkt.pruning_result,
								                  rgp->potential_groups[groupnum]);
								   }

								   rgp->rgppkt.stage = RGP_PRUNING;

								   RGP_TRACE( "RGP S->3 Pruned ",
								              rgp->rgpcounter,                          /* TRACE */
								              GetCluster( rgp->rgppkt.knownstage2 ),    /* TRACE */
								              GetCluster( rgp->rgppkt.pruning_result ), /* TRACE */
								              numgroups );                              /* TRACE */

								   return(1);
								}


								/************************************************************************

								 * enter_first_cleanup_stage

								 * =========================

								 *

								 * Description:

								 *

								 *    This function performs the actions required when entering the

								 *    first of the message clean up stages.

								 *

								 * Parameters:

								 *

								 *    None

								 *

								 * Returns:

								 *

								 *    void - no return value

								 *

								 * Algorithm:

								 *

								 *    There are many actions to be performed after the final cluster

								 *    is selected. The actions are described in comments throughout

								 *    this routine.

								 *

								 ************************************************************************/

								_priv _resident void
								enter_first_cleanup_stage()
								{
								   cluster_t departed;   /* old-configuration nodes not in pruning_result */
								   node_t    n;

								   rgp->rgppkt.stage = RGP_PHASE1_CLEANUP;

								   RGP_TRACE( "RGP S->4        ", rgp->rgpcounter, 0, 0, 0 );

								   /* Outgoing packets must not advertise the phase 1 cleanup stage
								    * until the cleanup action itself has completed; that is
								    * signalled through rgp_event_handler by the
								    * RGP_EVT_PHASE1_CLEANUP_DONE event.
								    */
								   rgp->sendstage = 0;

								   /* The split brain danger is behind us, so IO may resume. A new
								    * split brain situation restarts regroup and pauses IO again.
								    */
								   rgp_resume_all_io();

								   /* Work out who is being dropped BEFORE rgpinfo.cluster is
								    * overwritten with the new configuration below.
								    */
								   ClusterDifference(departed, rgp->rgpinfo.cluster,
								                     rgp->rgppkt.pruning_result);

								   /* Install the new configuration into every mask. */
								   ClusterCopy(rgp->outerscreen, rgp->rgppkt.pruning_result);

								#if defined( NT )
								   ClusnetSetOuterscreen(
								       NmClusnetHandle,
								       (ULONG)*((PUSHORT)rgp->outerscreen)
								       );
								#endif

								   ClusterCopy(rgp->innerscreen,     rgp->rgppkt.pruning_result);
								   ClusterCopy(rgp->endnodes,        rgp->rgppkt.pruning_result);
								   ClusterCopy(rgp->rgpinfo.cluster, rgp->rgppkt.pruning_result);

								   /* Re-elect the tiebreaker: the previous one may have been pruned
								    * out. Note: tiebreaker_selected has already been set in S2.
								    * (F40 Bug FixID KCY0833)
								    */
								   rgp->tiebreaker =
								      rgp_select_tiebreaker(rgp->rgppkt.pruning_result);

								   /* Declare dead every node that was dropped, plus every node
								    * still marked RGP_NODE_COMING_UP (fix bug#265069).
								    */
								   for (n = 0; n < (node_t) rgp->num_nodes; n++)
								   {
								      if (!ClusterMember(departed, n)
								          && rgp->node_states[n].status != RGP_NODE_COMING_UP)
								         continue;

								      rgp->node_states[n].status    = RGP_NODE_DEAD;
								      rgp->node_states[n].pollstate = AWAITING_IAMALIVE;
								      rgp->node_states[n].lostHBs   = 0;

								#if !defined(NT)
								      (*(rgp->nodedown_callback))(EXT_NODE(n));
								#else

								      ClusnetSetNodeMembershipState(NmClusnetHandle,
								                                    EXT_NODE( n ),
								                                    ClusnetNodeStateDead);

								      //
								      // On NT the nodedown callback is deferred to the end of
								      // stage 5 so that the cleanup phases complete before the
								      // "upper" layers learn that a node went down. Only nodes
								      // present in CPUUPMASK are queued for that callback.
								      //
								      if ( ClusterMember(rgp->OS_specific_control.CPUUPMASK, n) )
								      {
								         ClusterInsert(
								             rgp->OS_specific_control.NeedsNodeDownCallback,
								             n
								             );
								      }

								#endif // !defined(NT) ... #else
								   }

								   /* Best-effort: queue regroup status packets to the dropped nodes
								    * so that they get taken out quickly if they are in fact still
								    * running.
								    */
								   ClusterUnion(rgp->status_targets, departed, rgp->status_targets);

								   //
								   // In NT, rgp->rgppkt.hadpowerfail is used to transmit quorum
								   // ownership information, so the power-fail pass is non-NT only.
								   //
								#if !defined(NT)

								   /* Tell the message system about each surviving node that went
								    * through a power on recovery, so it can clear error counters
								    * and not declare a link down because of errors which may have
								    * been caused by the power failure.
								    */
								   for (n = 0; n < (node_t) rgp->num_nodes; n++)
								   {
								      if (!ClusterMember(rgp->rgppkt.hadpowerfail, n))
								         continue;

								      if (ClusterMember(departed, n))
								         continue;

								      /* This survivor had a power failure. */
								      rgp_had_power_failure( EXT_NODE(n) );
								   }

								#endif // !defined(NT)

								   /* Tell the OS to start clean up operations for the failed nodes. */
								   rgp_start_phase1_cleanup();
								}


								/************************************************************************

								 * evaluatestageadvance

								 * ====================

								 *

								 * Description:

								 *

								 *    This function evaluates whether additional state transitions are

								 *    possible as a result of the info just received.

								 *

								 * Parameters:

								 *

								 *    None

								 *

								 * Returns:

								 *

								 *    void - no return value

								 *

								 * Algorithm:

								 *

								 *    To evaluate whether we can advance through the stages, a loop is

								 *    used with a case entry for each stage. If an entry decides not to

								 *    advance to the next stage, it must return from the function. If

								 *    it does advance, it should not return but remain in the loop

								 *    since it is possible to have cascaded stage transitions

								 *    especially in a two node system. Thus, the loop is exited when no

								 *    more stage transitions are possible.

								 *

								 ************************************************************************/

								_priv _resident static void
								evaluatestageadvance()
								{
								   cluster_t   temp_cluster;
								   node_t      node;

								   /* Each pass dispatches on the current stage. A case that advances
								    * the stage uses "break" so the new stage is evaluated at once
								    * (cascaded transitions); a case that cannot advance returns.
								    */
								   for (;;)  /* loop until someone exits by returning */
								   {
								      switch (rgp->rgppkt.stage)
								      {

								         case RGP_COLDLOADED :
								         {
								            /* Reported as an error unless we are shutting down. */
								            if (!rgp->OS_specific_control.ShuttingDown) {
								                RGP_ERROR(RGP_RELOADFAILED);
								            }
								            return;
								         }

								         case RGP_ACTIVATED :
								         { /* evaluate whether to go to stage RGP_CLOSING */

								            if (!regroup_test_stage2_advance())
								               return;

								            if (!ClusterMember(rgp->rgppkt.knownstage1, rgp->mynode))
								               RGP_ERROR(RGP_MISSED_POLL_TO_SELF);

								            rgp->rgppkt.stage = RGP_CLOSING;

								            rgp->rgpcounter = 0;
								            rgp->tiebreaker_selected = 0;

								            /* If we abort the regroup, and there's somebody that everybody
								             * banished on this regroup, the following line keeps him from
								             * joining up on the next regroup.
								             */
								            ClusterCopy(rgp->innerscreen, rgp->rgppkt.knownstage1);

								            break;

								         } /* evaluate whether to go to stage RGP_CLOSING */

								         case RGP_CLOSING :
								         { /* evaluate whether to go to stage RGP_PRUNING */

								            /* Once the tie-breaker has been chosen, the only thing
								             * left in this stage is the attempt to enter stage 3.
								             */
								            if (rgp->tiebreaker_selected)
								            {
								               if (regroup_stage3_advance())
								                  break;  /* try to advance further */
								               else
								                  return; /* cannot advance any more */
								            }

								            if (!ClusterCompare(rgp->rgppkt.knownstage1,
								                                rgp->rgppkt.knownstage2))
								               return;

								           //
								           // In NT, we no longer use the split-brain avoidance algorithm.
								           // We use a cluster-wide arbitration algorithm instead.
								           //
								           #if !defined(NT)
								            /* When the known stage 1 and known stage 2 sets are the
								             * same, we have the complete set of nodes that are
								             * connected to us. It is time to execute the split-
								             * brain avoidance algorithm. If we are a splinter group
								             * cut off from the main group, we will not survive this
								             * algorithm.
								             */

								           split_brain_avoidance_algorithm();

								           #endif // !defined(NT)

								            /* We are the lucky survivors of the split brain avoidance
								             * algorithm. Now, we must proceed to elect a new tie-breaker
								             * since the current tie-breaker may no longer be with us.
								             */

								            rgp->tiebreaker =
								               rgp_select_tiebreaker(rgp->rgppkt.knownstage2);

								            rgp->tiebreaker_selected = 1;

								            RGP_TRACE( "RGP S2 tiebr sel",
								                       rgp->rgpcounter,               /* TRACE */
								                       EXT_NODE(rgp->tiebreaker),     /* TRACE */
								                       0, 0 );                        /* TRACE */

								            rgp->pruning_ticks = 0;
								            break;

								         } /* evaluate whether to go to stage 3 */

								         case RGP_PRUNING :
								         { /* evaluate whether to go to RGP_PHASE1_CLEANUP stage */

								            /* Arbitration already under way: advance only when it
								             * reports that we may.
								             */
								            if (rgp->arbitration_started) {
								               if (regroup_test_arbitrate_advance()) {
								                  enter_first_cleanup_stage();
								                  break;
								               } else {
								                  return; // Stay in this stage //
								               }
								            }

								            if (rgp->has_unreachable_nodes)
								            {
								               RGP_TRACE( "RGP Unreach Node",
								                  GetCluster( rgp->rgppkt.pruning_result ),     /* TRACE */
								                  GetCluster( rgp->unreachable_nodes ), 0, 0 ); /* TRACE */

								               /* Must check if the unreachable nodes are in the
								                * selected final group. If so, we must restart
								                * regroup.
								                */
								               ClusterIntersection(temp_cluster, rgp->unreachable_nodes,
								                                   rgp->rgppkt.pruning_result);

								               /* Clear the unreachable node mask and flag after examining
								                * them. If we restart, we will start with a clean slate.
								                */
								               rgp->has_unreachable_nodes = 0;
								               ClusterInit(rgp->unreachable_nodes);

								               if (ClusterNumMembers(temp_cluster) != 0)
								               {
								                  /* We have a node unreachable event to a node
								                   * selected to survive. We must regenerate
								                   * the connectivity matrix and re-run the node
								                   * pruning algorithm. Start a new regroup incident.
								                   * All restarts are in cautious mode.
								                   */
								                  rgp->cautiousmode = 1;
								                  rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
								                  rgp->rgppkt.reason = RGP_EVT_NODE_UNREACHABLE;
								                  rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);

								                  /* For causingnode, pick the first unreachable node
								                   * in temp_cluster.
								                   */
								                  for (node = 0; node < (node_t) rgp->num_nodes; node++)
								                  {
								                     if (ClusterMember(temp_cluster, node))
								                     {
								                        rgp->rgppkt.causingnode = (uint8) EXT_NODE(node);
								                        break;
								                     }
								                  }
								                  regroup_restart();
								                  return;
								               }
								            }

								            if (!ClusterCompare(rgp->rgppkt.knownstage2,
								                                rgp->rgppkt.knownstage3))
								               return;

								            /* All nodes in the connected cluster have been notified
								             * of the pruning decision (entered stage 3). If we are
								             * selected to survive, we can now enter stage 4. If we are
								             * not in the selected group (pruning_result), we must halt.
								             * Wait for at least one node in PRUNING_RESULT to get into
								             * stage 4 before halting. This ensures that the algorithm
								             * does not stall in stage 3 with all pruned out nodes
								             * halting before ANY of the survivors finds that all nodes
								             * entered stage 3.
								             */

								            if (!ClusterMember(rgp->rgppkt.pruning_result, rgp->mynode))
								            {
								               /* Wait for at least one node in PRUNING_RESULT
								                * to get into stage 4 before halting. Since only
								                * nodes in PRUNING_RESULT get into stage 4, it is
								                * sufficient to check if knownstage4 has any members.
								                */
								               if (ClusterNumMembers(rgp->rgppkt.knownstage4) != 0)
								               {
								                  RGP_ERROR(RGP_PRUNED_OUT);
								               }

								               /* A pruned-out node never advances past this stage,
								                * whether or not the error above was raised.
								                */
								               return;
								            }

								            // proceed to second stage of pruning - arbitration
								            if( regroup_start_arbitrate() ) {
								               return; // stay in this stage
								            } else {
								               break;  // either proceed to the next, or restart
								            }

								         }  /* evaluate whether to go to RGP_PHASE1_CLEANUP stage */

								         case RGP_PHASE1_CLEANUP :
								         { /* evaluate whether to go to RGP_PHASE2_CLEANUP stage */

								            if (!ClusterCompare(rgp->rgppkt.pruning_result,
								                                rgp->rgppkt.knownstage4))
								               return;

								            rgp->rgppkt.stage = RGP_PHASE2_CLEANUP;

								            RGP_TRACE( "RGP S->5        ", rgp->rgpcounter, 0, 0, 0 );

								            /* The packets we send now will not indicate we are in the phase 2
								             * cleanup stage yet. We indicate we are in this stage only after
								             * we have completed the clean up action associated with the stage.
								             * This is done in rgp_event_handler, under the
								             * RGP_EVT_PHASE2_CLEANUP_DONE event.
								             */
								            rgp->sendstage = 0;

								            rgp_start_phase2_cleanup();

								            break;

								         }   /* evaluate whether to go to RGP_PHASE2_CLEANUP stage */

								         case RGP_PHASE2_CLEANUP :
								         { /* evaluate whether to go to RGP_STABILIZED stage */

								            if (!ClusterCompare(rgp->rgppkt.knownstage4,
								                                rgp->rgppkt.knownstage5))
								               return;

								            RGP_LOCK;

								            //
								            // [HACKHACK] This is not necessary anymore, since we
								            // are holding the lock in message.c when delivering
								            // regroup packet received event
								            //
								            if (RGP_PHASE2_CLEANUP != rgp->rgppkt.stage) {
								                RGP_TRACE( "RGP S->6 (race) ", rgp->rgpcounter, rgp->rgppkt.stage, 0, 0 );

								                // The stage changed under us. Release the lock
								                // taken just above before re-dispatching on the
								                // new stage; leaving this case without RGP_UNLOCK
								                // would leave that acquire unmatched.
								                RGP_UNLOCK;
								                break;
								            }

								            rgp->rgppkt.stage             = RGP_STABILIZED;

								            RGP_TRACE( "RGP S->6        ", rgp->rgpcounter, 0, 0, 0 );

								            rgp->rgpcounter        = 0;
								            rgp->restartcount      = 0;

								            /* Reset the regroup flags which have not yet been cleared. */
								            rgp->cautiousmode      = 0;

								            /* Clear the mask indicating nodes which own the quorum resrc. */
								            ClusterInit(rgp->rgppkt.quorumowner);

								            /* Copy the sequence number into the rgpinfo area. */
								            rgp->rgpinfo.seqnum = rgp->rgppkt.seqno;

								            SetEvent( rgp->OS_specific_control.Stabilized );
								            if (rgp->OS_specific_control.ArbitratingNode != MM_INVALID_NODE) {
								                // Somebody was arbitrating //
								                rgp->OS_specific_control.ApproxArbitrationWinner =
								                    rgp->OS_specific_control.ArbitratingNode;
								                if (rgp->OS_specific_control.ArbitratingNode == (DWORD)EXT_NODE(rgp->mynode)) {
								                    //
								                    // [HackHack] To close 422405
								                    // when 421828 is fixed, please uncomment the following line
								                    //
								                    // QuorumOwner = rgp->OS_specific_control.ArbitratingNode;
								                } else {
								                    // Another node arbitrated: drop any quorum
								                    // ownership recorded locally, logging when a
								                    // value is actually being cleared.
								                    if (QuorumOwner != MM_INVALID_NODE) {
								                        ClRtlLogPrint(LOG_UNUSUAL,
								                            "[MM] : clearing quorum owner var (winner is %1!u!), %.\n",
								                            rgp->OS_specific_control.ArbitratingNode
								                            );
								                    }
								                    QuorumOwner = MM_INVALID_NODE;
								                }
								            }

								            rgp_cleanup_complete();

								#if defined(NT)
								            //
								            // On NT we deferred doing the node down callback until all the
								            // cleanup phases have been complete.
								            //
								            ClusterCopy(
								                rgp->OS_specific_control.CPUUPMASK,
								                rgp->rgpinfo.cluster
								                );

								            (*(rgp->nodedown_callback))(
								                rgp->OS_specific_control.NeedsNodeDownCallback
								                );

								            //
								            // Clear the down node mask
								            //
								            ClusterInit(rgp->OS_specific_control.NeedsNodeDownCallback);

								            //
								            // finally, tell clusnet that regroup has finished
								            //
								            ClusnetRegroupFinished(NmClusnetHandle,
								                                   rgp->OS_specific_control.EventEpoch,
								                                   rgp->rgppkt.seqno);

								            rgp->last_stable_seqno = rgp->rgppkt.seqno;
								#endif

								            // Released outside the NT-only section so that the
								            // unconditional RGP_LOCK above is matched on every build.
								            RGP_UNLOCK;

								            return;

								         } /* evaluate whether to go to RGP_STABILIZED stage */

								         case RGP_STABILIZED :
								            return;            /* stabilized, so I am all done */

								         default :
								            RGP_ERROR(RGP_INTERNAL_ERROR);  /* unknown stage */

								            /* If RGP_ERROR comes back, do not spin in the enclosing
								             * for(;;) on a stage value we cannot handle.
								             */
								            return;

								      } /* switch (rgp->rgppkt.stage) */

								  } /* loop until someone exits by returning */
								}


								/************************************************************************

								 * rgp_event_handler

								 * =================

								 *

								 * Description:

								 *

								 *    The state machine and the heart of the regroup algorithm.

								 *

								 * Parameters:

								 *

								 *    int event -

								 *       which event happened

								 *

								 *    node_t causingnode -

								 *       node causing the event: node which sent a regroup status

								 *       packet or whose IamAlives are missed; if the causing node is

								 *       not relevant information, RGP_NULL_NODE can be passed and

								 *       is ignored. *This node ID is in external format.*

								 *

								 * Returns:

								 *

								 *    void - no return value

								 *

								 * Algorithm:

								 *

								 *    The state machine is the heart of the regroup algorithm.

								 *    It is organized as a switch statement with the regroup stage as

								 *    the case label and the regroup event as the switch variable.

								 *    Events could cause regroup to start a new incident, to advance

								 *    through stages or to update information without advancing to

								 *    another stage. This routine also arranges for regroup status

								 *    packets to be sent to all relevant nodes including our own

								 *    node.

								 *

								 ************************************************************************/

								_priv _resident void

								RGP_EVENT_HANDLER_EX(int event, node_t causingnode, void *arg)

								{


								   rgp_pkt_t    *rcvd_pkt_p;

								   cluster_t    ignorescreen_rcvd;

								   uint8        oldstage;

								   int          send_status_pkts = 0;


								    /* Note: arg is only used when event == RGP_EVT_RECEIVED_PACKET.  It is the ptr to the packet */


								   /* Trace unusual invocations of this routine. */

								   if  (event != RGP_EVT_RECEIVED_PACKET  &&  event != RGP_EVT_CLOCK_TICK)

									  RGP_TRACE( "RGP Event       ", event, causingnode, rgp->rgppkt.stage, rgp->rgpcounter );  /* TRACE */


								   switch (event)

								   {

								      case RGP_EVT_NODE_UNREACHABLE :

								      { /* All paths to a node are unreachable */


								         /* Ignore the event if the unreachable node has been eliminated

								          * from our outerscreen. The message system probably doesn't

								          * know it yet.

								          */

								         if (ClusterMember(rgp->outerscreen, INT_NODE(causingnode)))

								         {

								            /* Store this event and check after node pruning (when

								             * entering the RGP_PRUNING stage). If a regroup incident

								             * is in progress and we haven't entered the RGP_PRUNING

								             * stage yet, this will happen in the current incident.

								             * If not, it will happen in the next regroup incident

								             * which will surely start soon due to this disconnect.

								             *

								             * We do not start a regroup incident for this event. We will

								             * wait for IamAlives to be missed for starting a new regroup

								             * incident. This is due to the requirement that, in case

								             * of a total disconnect resulting in multiple groups, we must

								             * stay in stage 1 till we can guarantee that the other group(s)

								             * has started regroup and paused IO. We assume that the

								             * regroup incident started at the IamAlive check tick and

								             * use the periodic nature of the IamAlive sends and

								             * IamAlive checks to limit the stage1 pause to the period

								             * of IamAlive sends (+ 1 tick to drain IO). If we started

								             * a regroup incident due to the node unreachable event, we

								             * have to stay in stage1 longer.

								             */

								            rgp->has_unreachable_nodes = 1;

								            ClusterInsert(rgp->unreachable_nodes, INT_NODE(causingnode));


								            break;

								         }

								      } /* All paths to a node are unreachable */


								      case RGP_EVT_PHASE1_CLEANUP_DONE :

								      {

								         /* The following checks are needed in case we restarted

								          * regroup and asked for phase1 cleanup multiple times.

								          * We must make sure that all such requests have been

								          * completed.

								          */

								         if ( (rgp->rgppkt.stage == RGP_PHASE1_CLEANUP) &&

								              (rgp->rgp_msgsys_p->phase1_cleanup == 0) )

								         { /* all caught up */


								            /* Let others and ourselves get packets indicating we are in

								             * this stage. When we get that packet, we will update our

								             * knownstage field. If our sending or receiving apparatus

								             * failed meanwhile and we don't get our own packet, it

								             * will cause regroup to be restarted.

								             */

								            rgp->sendstage = 1;

								            send_status_pkts = 1;

								            evaluatestageadvance();

								         } /* all caught up */


								         break;

								      }


								      case RGP_EVT_PHASE2_CLEANUP_DONE :

								      {


								         /* The following checks are needed in case we restarted

								          * regroup and asked for phase2 cleanup multiple times.

								          * We must make sure that all such requests have been

								          * completed.

								          */

								         if ( (rgp->rgppkt.stage == RGP_PHASE2_CLEANUP) &&

								              (rgp->rgp_msgsys_p->phase2_cleanup == 0) )

								         { /* all caught up */

								            /* Let others and ourselves get packets indicating we are

								             * in this stage.

								             */

								            rgp->sendstage = 1;

								            send_status_pkts = 1;

								            evaluatestageadvance();

								         } /* all caught up */

								         break;

								      }


								      case RGP_EVT_LATEPOLLPACKET :

								      { /* some node is late with IamAlives */


								         RGP_LOCK; // to ensure that the packet receive does not initiate

								                           // regroup asynchronously.

								                 /* Start a new regroup incident if not already active. */

								         if (rgp->rgppkt.stage == RGP_STABILIZED)

								         {

								            rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;

								            rgp->rgppkt.reason = RGP_EVT_LATEPOLLPACKET;

								            rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);

								            rgp->rgppkt.causingnode = (uint8) causingnode;

								            regroup_restart();

								            send_status_pkts = 1;

								         } else if (rgp->rgppkt.stage == RGP_COLDLOADED)

								         {

								            RGP_ERROR(RGP_RELOADFAILED);

								         }

								         RGP_UNLOCK;

								         break;

								      } /* some node is late with IamAlives */


								      case MM_EVT_LEAVE:

								         rgp->OS_specific_control.ShuttingDown = TRUE;

								      case RGP_EVT_BANISH_NODE :

								      { /* assumes that the lock is held */


								         rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;

								         rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);

								         // Pack Ignore Screen in the regroup_restart will

								         // fill reason and causingnode fields of the packet

								         ClusterInsert(rgp->ignorescreen, INT_NODE(causingnode) );

								         regroup_restart();

								         send_status_pkts = 1;

								         break;

								      }

								#if 0

								      case MM_EVT_LEAVE: // this node needs to leave the cluster gracefully

								      {

								                // Initiate a Regroup Event amongst remaining members if any

								                // Start a new regroup incident if not already active.

								        if (rgp->rgppkt.stage == RGP_STABILIZED)

								        {

								           rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;

								           rgp->rgppkt.reason = MM_EVT_LEAVE;

								           rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);

								           rgp->rgppkt.causingnode = (uint8) EXT_NODE(rgp->mynode);

								           regroup_restart();

								           send_status_pkts = 1;

								        }

								        break;

								      }

								#endif


								      case RGP_EVT_CLOCK_TICK :

								      { /* called on regroup clock tick when regroup is active */


								         if( (rgp->rgppkt.stage == RGP_PRUNING) &&

								             (rgp->arbitration_started)

								           )

								         {

								            rgp->arbitration_ticks++;


								            if (rgp->arbitration_ticks >= RGP_ARBITRATION_TIMEOUT) {

								               //

								               // Kill timed-out arbitrator

								               //

								               if(rgp->tiebreaker == rgp->mynode) {

								                  //

								                  // If this node was arbitrating, then die

								                  //

								                  if ( IsDebuggerPresent() ) {

								                     DebugBreak();

								                  }


								                  RGP_ERROR(RGP_ARBITRATION_STALLED);

								               }

								               else {

								                  //

								                  // Kill the arbitrator and initiate another regroup

								                  //

								                  RGP_TRACE(

								                      "RGP arbitration stalled     ",

								                      rgp->rgppkt.stage, 0, 0, 0

								                      );


								                  rgp_event_handler(

								                      RGP_EVT_BANISH_NODE,

								                      EXT_NODE(rgp->tiebreaker)

								                      );


								                  break;

								               }

								            }


								            evaluatestageadvance();


								            //

								            // No need to send packets while we are waiting for

								            // the arbitrator to win

								            //

								            // send_status_pkts = rgp->rgppkt.stage != RGP_PRUNING;

								            //

								            // [GN] Wrong. We do have to send status packets.

								            // If we have partial connectivity, we need to

								            // continue exchanging packets, so that the pruner,

								            // can learn indirectly that all nodes got the pruning results.

								            //

								            send_status_pkts = 1;


								            break;

								         }

								         else {

								            rgp->rgpcounter++;  /* increment the counter */

								         }


								         if ( (rgp->rgppkt.stage == RGP_ACTIVATED) && (rgp->sendstage == 0) )

								         {

								            /* To detect the potential failure of my timer pop mechanism

								             * (such as by the corruption of the time list), I wait for

								             * at least one regroup clock tick before I let myself and

								             * others know I am in stage 1.

								             */

								            // [GorN Jan14/2000]

								            //   We don't send our connectivity information,

								            //   before we get the first clock tick.

								            //   However we collect this information in

								            //   rgp->internal_connectivity_matrix.

								            //      Let's put it in the outgoing packet

								            //   so that everybody will see what we think about them.


								            MatrixOr(rgp->rgppkt.connectivity_matrix,

								                     rgp->internal_connectivity_matrix);


								            rgp->sendstage = 1; /* let everyone know we are in stage 1 */

								         }

								         else if ( (rgp->rgppkt.stage >= RGP_CLOSING) &&

								              (rgp->rgppkt.stage <= RGP_PHASE2_CLEANUP) )

								         { /* check for possible abort and restart */


								            if (rgp->rgpcounter >= RGP_MUST_RESTART)

								            {

								              /* Stalled out. Probably someone died after starting

								               * or another node is still in stage 1 cautious mode

								               */


								               if ( ++(rgp->restartcount) > RGP_RESTART_MAX ) {

								                   // It is not a good idea to die, because somebody

								                   // is stalling. Let's add stallees into ignore mask and restart

								                   //

								                   // RGP_ERROR(RGP_INTERNAL_ERROR); // [Fixed]

								                   cluster_t tmp, *stage;


								                   switch (rgp->rgppkt.stage) {

								                   case RGP_CLOSING: stage = &rgp->rgppkt.knownstage2; break;

								                   case RGP_PRUNING: stage = &rgp->rgppkt.knownstage3; break;

								                   case RGP_PHASE1_CLEANUP: stage = &rgp->rgppkt.knownstage4; break;

								                   case RGP_PHASE2_CLEANUP: stage = &rgp->rgppkt.knownstage5; break;

								                   }

								                   ClusterDifference(tmp, rgp->rgpinfo.cluster, *stage);


								                   //

								                   // If we stalled during closing because the tiebreaker running

								                   // the pruning algorithm went bonkers, we can have tmp = 0.

								                   // In this case, we need to ignore somebody to guarantee that

								                   // the algorithm completes.

								                   //

								                   if ( ClusterEmpty(tmp) && rgp->tiebreaker_selected) {

								                       ClusterInsert(tmp, rgp->tiebreaker);

								                   }


								                   ClusterUnion(rgp->ignorescreen, rgp->ignorescreen, tmp);

								               }


								               /* If we are stalling in stage 3 and we have been pruned out,

								                * it is possible that we are stalling because we have been

								                * isolated from all other nodes. We must halt in this case.

								                */

								               if ( (rgp->rgppkt.stage == RGP_PRUNING) &&

								                    !ClusterMember(rgp->rgppkt.pruning_result, rgp->mynode) )

								                  RGP_ERROR(RGP_PRUNED_OUT);


								               rgp->cautiousmode = 1;

								               rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;


								               RGP_TRACE( "RGP stalled     ", rgp->rgppkt.stage, 0, 0, 0 );


								               regroup_restart();


								            } /* Stalled out ... */

								         } /* check for possible abort and restart */


								         if ((rgp->rgppkt.stage == RGP_CLOSING) && rgp->tiebreaker_selected)

								            rgp->pruning_ticks++;


								         evaluatestageadvance();


								         send_status_pkts = 1; /* send rgp packets regardless of progress */


								         break;


								      } /* called on regroup clock tick when regroup is active */


								      case RGP_EVT_RECEIVED_PACKET :

								      { /* received an rgp packet */


								         /* If the sending node is excluded by the outer screen, then it is

								          * not even part of the current (most recently known) configuration.

								          * Therefore the packet should not be honored, and a poison message

								          * should be sent to try to kill this renegade processor.

								          * That is done in the calling routine that processes all incoming

								          * regroup module packets (IamAlive, regroup and poison packets).

								          */


								         /* If the sending node was accepted by the outer screen but then

								          * excluded by the inner screen, then the packet will be disregarded

								          * but no poison message sent. This phenomenon may occur when this

								          * node has entered stage 2 without having heard from (recognized)

								          * the sending node and then a message arrives late from that

								          * sending node. In this case the fate of the sending node, i.e.

								          * whether it gets ruled out of the global configuration or not is

								          * unknown at this point. If the sender can get itself recognized

								          * by some node before that node enters stage 2, then it will be

								          * saved. Otherwise it will be declared down and subsequently shot

								          * with poison packets if it ever tries to assert itself.

								          */


									  /* Remember the arg to this routine is the packet pointer */

								         rcvd_pkt_p = (rgp_pkt_t *)arg; /* address of pkt just received */

									     if ( rgp->rgppkt.seqno != rcvd_pkt_p->seqno)

										     RGP_TRACE( "RGP Event       ", event, causingnode, rgp->rgppkt.stage, rgp->rgpcounter );  /* TRACE */


								         UnpackIgnoreScreen(rcvd_pkt_p, ignorescreen_rcvd);

								         if ( !ClusterEmpty(ignorescreen_rcvd) ) {

								             RGP_TRACE( "RGP Incoming pkt", GetCluster(ignorescreen_rcvd),

								                        rcvd_pkt_p->seqno, rgp->rgppkt.stage, causingnode);

								         }


								         if ( !ClusterMember(rgp->innerscreen, INT_NODE(causingnode))) {

								             RGP_TRACE( "RGP Ignoring !inner", causingnode, rgp->rgppkt.stage,

								                        GetCluster(rgp->innerscreen), GetCluster(ignorescreen_rcvd) );

								             return;

								         }


								         RGP_LOCK; // To ensure that the timer thread does not initiate

								                   // regroup asynchronously at this time.


								//////////////////////////// New Ignore Screen Stuff /////////////////////////////////


								         if (ClusterMember(rgp->ignorescreen, INT_NODE(causingnode) )) {

								             RGP_UNLOCK;

								             RGP_TRACE( "RGP Ignoring", causingnode, rgp->rgppkt.stage,

								                        GetCluster(rgp->ignorescreen), GetCluster(ignorescreen_rcvd) );

								             return;

								         }


								         if (rcvd_pkt_p->seqno < rgp->last_stable_seqno ) {

								             RGP_UNLOCK;

								             RGP_TRACE( "RGP old packet", causingnode, rcvd_pkt_p->seqno, rgp->last_stable_seqno, 0);

								             // This is a late packet from the previous regroup incident

								             // from the node that is currently in my outerscreen.

								             // This node could not have sent it now, this is probably a packet

								             // that got stuck somewhere and was delivered eons later.

								             // Simply ignore it.

								             return;

								         }


								         if ( ClusterMember(ignorescreen_rcvd, rgp->mynode ) ) {

								             //

								             // Sender ignores me. We will do the same to him.

								             //

								             ClusterInsert(rgp->ignorescreen, INT_NODE(causingnode) );

								             rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;

								             regroup_restart();

								             send_status_pkts = 1;

								             RGP_UNLOCK;

								             break;

								         }


								         if ( ClusterCompare(ignorescreen_rcvd, rgp->ignorescreen) ) {

								             // We have the same ignore screen.

								             // No work needs to be done

								         } else if ( ClusterSubsetOf(rgp->ignorescreen, ignorescreen_rcvd) ) {

								             // Incoming packet has smaller ignore screen

								             // Ignore this packet, but reply to its sender with

								             // our current regroup packet to force to upgrade to

								             // our view of the world.


								             // do so only if we are properly initialized

								             if (rgp->rgppkt.stage == RGP_COLDLOADED && !rgp->OS_specific_control.ShuttingDown) {

								                 RGP_ERROR(RGP_RELOADFAILED);

								             }


								             RGP_TRACE( "RGP smaller ignore mask ",

								                        GetCluster(ignorescreen_rcvd), GetCluster(rgp->ignorescreen),   /* TRACE */

								                        rgp->rgppkt.stage, rcvd_pkt_p->stage ); /* TRACE */


								             ClusterInsert(rgp->status_targets, INT_NODE(causingnode));

								             rgp_broadcast(RGP_UNACK_REGROUP);

								             RGP_UNLOCK;

								             return;

								         } else if ( ClusterSubsetOf(ignorescreen_rcvd, rgp->ignorescreen) ) {

								             RGP_TRACE( "RGP bigger ignore mask ",

								                        GetCluster(ignorescreen_rcvd), GetCluster(rgp->ignorescreen),   /* TRACE */

								                        rgp->rgppkt.stage, causingnode ); /* TRACE */

								             // Incoming packet has bigger ignore screen.

								             // Upgrade to this information and process the packet

								             rgp->rgppkt.seqno = rcvd_pkt_p->seqno;


								             /*  Somebody else activated regroup. So, let's just copy */

								             /*  the sender's reason code and reason nodes.           */


								             //

								             // Ignore mask parts are in the reason and activatingnode fields

								             //


								             ClusterCopy(rgp->ignorescreen, ignorescreen_rcvd); // fix bug #328216

								             rgp->rgppkt.reason = rcvd_pkt_p->reason;

								             rgp->rgppkt.activatingnode = rcvd_pkt_p->activatingnode;

								             rgp->rgppkt.causingnode = rcvd_pkt_p->causingnode;

								             regroup_restart();

								             send_status_pkts = 1;

								         } else {

								             RGP_TRACE( "RGP different ignore masks ",

								                        GetCluster(ignorescreen_rcvd), GetCluster(rgp->ignorescreen),   /* TRACE */

								                        rgp->rgppkt.stage, causingnode ); /* TRACE */

								             // Ignore masks are different and neither of them is

								             // a subset of another.

								             //

								             // We need to merge information out of these masks

								             // and restart the regroup.

								             //

								             // Packet that we just received will be ignored


								             ClusterUnion(rgp->ignorescreen, rgp->ignorescreen, ignorescreen_rcvd);

								             rgp->rgppkt.seqno = max(rgp->rgppkt.seqno, rcvd_pkt_p->seqno) + 1;

								             regroup_restart();

								             send_status_pkts = 1;

								             RGP_UNLOCK;

								             break;

								         }


								//////////////////////////// End of new Ignore Screen Stuff /////////////////////////////////


								         // Now ignorescreens of this node packet and incoming packet are the same //

								         // proceed with regular regroup processing //


								         /* Since the packet is acceptable, the regroup sequence number

								          * must be compared to that of this node. If the incoming message

								          * has a higher sequence number, then a new pass of the regroup

								          * algorithm has started. This node must accept the new sequence

								          * number, reinitialize its data, and start participating in

								          * the new pass. Also, the incoming message must be processed

								          * since, once the algorithm reinitializes, the sequence numbers

								          * now match.

								          *

								          * If the incoming packet has a matching sequence number, then it

								          * should be accepted. The knowledge of the global state of the

								          * algorithm it reflects must be merged with that already present

								          * in this node. Then this node must evaluate whether further

								          * state transitions are possible.

								          *

								          * Finally, if the incoming packet has a lower sequence number, then

								          * it comes from a node unaware of the current level of the global

								          * algorithm. The data in it should be ignored, but a packet should

								          * be sent to it so that it will reinitialize its algorithm.

								          *

								          * The sequence number is a 32 bit algebraic value - hopefully it

								          * will never wrap around.

								          */


								         if (rcvd_pkt_p->seqno < rgp->rgppkt.seqno)

								         { /* sender below current level - ignore but let him know it*/


								            RGP_TRACE( "RGP lower seqno ",

								                       rgp->rgppkt.seqno, rcvd_pkt_p->seqno,   /* TRACE */

								                       rgp->rgppkt.stage, rcvd_pkt_p->stage ); /* TRACE */


								            ClusterInsert(rgp->status_targets, INT_NODE(causingnode));

								            rgp_broadcast(RGP_UNACK_REGROUP);

								                        RGP_UNLOCK;

								            return;

								         }


								         if (rcvd_pkt_p->seqno > rgp->rgppkt.seqno)

								         { /* sender above current level - I must upgrade to it*/


								            // The node that forces a restart is responsible for keeping

								            // track of restarts and making a decision who will die/be ignored

								            // if ( ++(rgp->restartcount) > RGP_RESTART_MAX )

								            //   RGP_ERROR(RGP_INTERNAL_ERROR);


								            if ( (rgp->rgppkt.stage != RGP_STABILIZED) ||

								                 ((rcvd_pkt_p->seqno - rgp->rgppkt.seqno) > 1) )

								            {

								               RGP_TRACE( "RGP higher seqno",

								                          rgp->rgppkt.seqno, rcvd_pkt_p->seqno,  /* TRACE */

								                          rgp->rgppkt.stage, rcvd_pkt_p->stage );/* TRACE */

								               rgp->cautiousmode = 1;

								            }


								            rgp->rgppkt.seqno = rcvd_pkt_p->seqno;


								            /*  Somebody else activated regroup. So, let's just copy */

								            /*  the sender's reason code and reason nodes.           */


								            rgp->rgppkt.reason = rcvd_pkt_p->reason;

								            rgp->rgppkt.activatingnode = rcvd_pkt_p->activatingnode;

								            rgp->rgppkt.causingnode = rcvd_pkt_p->causingnode;

								            regroup_restart();

								            send_status_pkts = 1;


								         } /* sender above current level - I must upgrade to it*/


								         /* Now we are at the same level - even if we weren't at first.

								          *

								          * If the sender has already committed to a view of the world

								          * that excludes me, I must halt in order to keep the system in

								          * a consistent state.

								          *

								          * This is true even with the split brain avoidance algorithm.

								          * The fact that stage1 = stage2 in the packet implies that the

								          * sender has already run the split brain avoidance algorithm

								          * and decided that he should survive.

								          */


								         if ( (rcvd_pkt_p->stage > RGP_ACTIVATED) &&

								              ClusterCompare(rcvd_pkt_p->knownstage1,

								                             rcvd_pkt_p->knownstage2) &&

								              !ClusterMember(rcvd_pkt_p->knownstage1, rgp->mynode) )

								         {

								             ClusterInsert(rgp->ignorescreen, INT_NODE(causingnode) );

								             rgp->rgppkt.seqno ++;

								             regroup_restart();

								             send_status_pkts = 1;

								             RGP_UNLOCK;

								//             /* I must die for overall consistency. */

								//             RGP_ERROR((uint16) (RGP_PARIAH + causingnode)); // [Fixed]

								             break;

								         }

								         RGP_UNLOCK;


								         /* If I have terminated the active part of the algorithm, I

								          * am in stage 6 and am not routinely broadcasting my status

								          * anymore. If I get a packet from someone else who has not

								          * yet terminated, then I must send him the word. But if he

								          * has terminated, I must not send any packet or else there

								          * will be an infinite loop of packets bouncing back and forth.

								          */


								         if (rgp->rgppkt.stage == RGP_STABILIZED)

								         { /* I have terminated so can't learn anything more. */

								            if (!ClusterCompare(rcvd_pkt_p->knownstage5,

								                                rgp->rgppkt.knownstage5))

								            { /* but sender has not so I must notify him */

								               ClusterInsert(rgp->status_targets, INT_NODE(causingnode));

								               rgp_broadcast(RGP_UNACK_REGROUP);

								            }

								            return;

								         }


								         /* At this point, the packet is from a legal node within the

								          * current round of the algorithm and I have not terminated

								          * at stage RGP_STABILIZED so I need to absorb whatever new

								          * info is in this packet.

								          *

								          * The way to merge what this packet says with what I already

								          * know is to just logically OR the known stage x fields

								          * together.

								          */

								          {

								              int seqno = rcvd_pkt_p->seqno&0xffff;

								              int stage = rcvd_pkt_p->stage&0xffff;

								              int trgs = *(int*)rgp->status_targets & 0xffff;

								              int node = INT_NODE(causingnode)&0xffff;


								              RGP_TRACE( "RGP recv pkt ",

								                  ((seqno << 16) | stage),

								                  RGP_MERGE_TO_32(

								                      rcvd_pkt_p->knownstage1,

								                      rcvd_pkt_p->knownstage2

								                      ),

								                  RGP_MERGE_TO_32(

								                      rcvd_pkt_p->knownstage3,

								                      rcvd_pkt_p->knownstage4

								                      ),

								                  (trgs << 16) | node

								                  );

								         }


								         rgp_sanity_check(rcvd_pkt_p,  "RGP Received packet");

								         rgp_sanity_check(&(rgp->rgppkt), "RGP Internal packet");


								         ClusterUnion(rgp->rgppkt.quorumowner, rcvd_pkt_p->quorumowner,

								                      rgp->rgppkt.quorumowner);

								         ClusterUnion(rgp->rgppkt.knownstage1, rcvd_pkt_p->knownstage1,

								                      rgp->rgppkt.knownstage1);

								         ClusterUnion(rgp->rgppkt.knownstage2, rcvd_pkt_p->knownstage2,

								                      rgp->rgppkt.knownstage2);

								         ClusterUnion(rgp->rgppkt.knownstage3, rcvd_pkt_p->knownstage3,

								                      rgp->rgppkt.knownstage3);

								         ClusterUnion(rgp->rgppkt.knownstage4, rcvd_pkt_p->knownstage4,

								                      rgp->rgppkt.knownstage4);

								         ClusterUnion(rgp->rgppkt.knownstage5, rcvd_pkt_p->knownstage5,

								                      rgp->rgppkt.knownstage5);

								         ClusterUnion(rgp->rgppkt.pruning_result, rcvd_pkt_p->pruning_result,

								                      rgp->rgppkt.pruning_result);


								         /* But when I am in stage 2, it is possible that I can learn to

								          * recognize some node I have not previously recognized by hearing

								          * of it indirectly from some other node that I have recognized.

								          * To handle this case, I always merge knownstage1 info into

								          * the inner screen so that subsequent messages from the newly

								          * recognized node will be accepted and processed.

								          */

								         if  ((rgp->rgppkt.stage == RGP_CLOSING) &&

								              !(rgp->tiebreaker_selected))

								            ClusterUnion(rgp->innerscreen, rgp->rgppkt.knownstage1,

								                         rgp->innerscreen);


								         /* In the first two stages of regroup, the inter-node connectivity

								          * information is collected and propagated. When we get a regroup

								          * packet, we turn ON the bit corresponding to the [our-node,

								          * sender-node] entry in the connectivity matrix. We also OR in

								          * the matrix sent by the sender node in the regroup packet.

								          *

								          * The matrix is not updated if we are in stage 1 and haven't

								          * received the first clock tick. This is to prevent the

								          * node pruning algorithm from considering us alive if our

								          * timer mechanism is disrupted, but the IPC mechanism is OK.

								          */


								         /* [GorN 01/07/2000] If we do not collect connectivity information

								          * until we receive the first tick, we can run into problems if the node is

								          * killed right after it sends out its first timer-driven packet

								          * (which doesn't have any connectivity info yet). This can cause

								          * confusion. See bug 451792.

								          *

								          * What we will do is we will collect connectivity information on

								          * the side even when rgp->sendstage is FALSE and move it into the regroup

								          * packet if we ever get a clock tick

								          */


								         if (rgp->rgppkt.stage < RGP_PRUNING && !rgp->sendstage)

								         {

								            MatrixSet(rgp->internal_connectivity_matrix,

								                      rgp->mynode, INT_NODE(causingnode));

								            if (causingnode != EXT_NODE(rgp->mynode))

								               MatrixOr(rgp->internal_connectivity_matrix,

								                        rcvd_pkt_p->connectivity_matrix);

								         }


								         if ((rgp->rgppkt.stage < RGP_PRUNING) && rgp->sendstage)

								         {

								            MatrixSet(rgp->rgppkt.connectivity_matrix,

								                      rgp->mynode, INT_NODE(causingnode));

								            if (causingnode != EXT_NODE(rgp->mynode))

								               MatrixOr(rgp->rgppkt.connectivity_matrix,

								                        rcvd_pkt_p->connectivity_matrix);

								         }


								         /* Now, I can evaluate whether additional state transitions are

								          * possible as a result of the info just received.

								          */

								         oldstage = rgp->rgppkt.stage;


								//       QuorumCheck now runs in a separate thread

								//         if (oldstage != RGP_CLOSING) // Cannot run Quorumcheck from here.

								         evaluatestageadvance();


								         /* To speed things up, let us broadcast our status if our

								          * stage has changed and we are willing to let others and

								          * ourselves see it.

								          */


								         if ( (oldstage != rgp->rgppkt.stage) && rgp->sendstage )

								            send_status_pkts = 1; /* broadcast at once to speed things up */


								         break;

								      }   /* received an rgp packet */


								      //

								      // We do not support power failure notifications in NT

								      //

								      #if defined(NT)


								      CL_ASSERT(event != RGP_EVT_POWERFAIL);

								      //

								      // Fall thru to default case

								      //


								      #else // NT


								      case RGP_EVT_POWERFAIL :

								      { /* Our node got a power up interrupt or an indication of power

								         * failure from another node. */


								         /* Note that this code will unconditionally abort and restart

								          * the algorithm even if it was active before the power failure.

								          * The new incident must be in cautious mode.

								          */


								         rgp->cautiousmode = 1;

								         rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;

								         rgp->rgppkt.reason = RGP_EVT_POWERFAIL;

								         rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);

								         rgp->rgppkt.causingnode = (uint8) causingnode;


								         /* rgp->pfail_state is set to a non-zero value when a pfail event

								          * is reported to regroup. It is decremented at every regroup clock

								          * tick till it reaches zero. While this number is non-zero, missing

								          * self IamAlives are ignored and do not cause the node to halt.

								          * This gives the sending hardware some time to recover from power

								          * failures before self IamAlives are checked.

								          */

								         if (causingnode == EXT_NODE(rgp->mynode))

								            rgp->pfail_state = RGP_PFAIL_TICKS;


								         /* Store the fact that causingnode experienced a PFAIL,

								          * for reporting to the message system when regroup stabilizes.

								          */

								         ClusterInsert(rgp->rgppkt.hadpowerfail, INT_NODE(causingnode));


								         regroup_restart();

								         send_status_pkts = 1;

								         break;

								      } /* power failure */


								      #endif // NT


								      default :

								      {

								         RGP_ERROR(RGP_INTERNAL_ERROR);

								      }

								   }


								   if (send_status_pkts) /* significant change - send status at once */

								   {

								      ClusterUnion(rgp->status_targets,

								                   rgp->outerscreen, rgp->status_targets);

								      rgp_broadcast(RGP_UNACK_REGROUP);

								   }

								}


								/************************************************************************
								 * rgp_check_packet
								 * =================
								 *
								 * Description:
								 *
								 *    Sanity-checks the node sets carried by a regroup packet: the
								 *    knownstage1..5 masks, pruning_result, quorumowner and every row
								 *    of the connectivity matrix. The checks run in a fixed order and
								 *    the code of the first check that fails is returned.
								 *
								 * Parameters:
								 *
								 *    rgp_pkt_t* pkt -
								 *       packet to be checked
								 *
								 * Returns:
								 *
								 *    0  - packet looks good
								 *    5  - knownstage5 is not a subset of knownstage4
								 *    4  - knownstage4 is not a subset of knownstage3
								 *    3  - knownstage3 is not a subset of knownstage2
								 *    2  - knownstage2 is not a subset of knownstage1
								 *    1  - knownstage1 is not a subset of rgp->rgpinfo.cluster
								 *    9  - pruning_result is not a subset of knownstage2
								 *    8  - quorumowner is not a subset of rgp->rgpinfo.cluster
								 *    10 - matrix row of a cluster member is not a subset of
								 *         rgp->rgpinfo.cluster
								 *    11 - matrix row of a non-member is not empty
								 *
								 ************************************************************************/
								int rgp_check_packet(rgp_pkt_t* pkt)
								{
								   node_t       i;

								   /* ClusterSubsetOf(big, small) is true when small equals big or
								    * is a subset of it. Walk the containment chain from the
								    * innermost set outwards:
								    *
								    *   knownstage5 <= knownstage4 <= knownstage3 <=
								    *   knownstage2 <= knownstage1 <= rgp->rgpinfo.cluster
								    */
								   if (!ClusterSubsetOf(pkt->knownstage4, pkt->knownstage5))
								   {
								      return 5;
								   }

								   if (!ClusterSubsetOf(pkt->knownstage3, pkt->knownstage4))
								   {
								      return 4;
								   }

								   if (!ClusterSubsetOf(pkt->knownstage2, pkt->knownstage3))
								   {
								      return 3;
								   }

								   if (!ClusterSubsetOf(pkt->knownstage1, pkt->knownstage2))
								   {
								      return 2;
								   }

								   if (!ClusterSubsetOf(rgp->rgpinfo.cluster, pkt->knownstage1))
								   {
								      return 1;
								   }

								   /* The pruning result may only name nodes present in knownstage2. */
								   if (!ClusterSubsetOf(pkt->knownstage2, pkt->pruning_result))
								   {
								      return 9;
								   }

								   /* The quorum owner set may only name nodes of the original cluster. */
								   if (!ClusterSubsetOf(rgp->rgpinfo.cluster, pkt->quorumowner))
								   {
								      return 8;
								   }

								   /* Validate the connectivity matrix one row at a time. */
								   for (i = 0; i < MAX_CLUSTER_SIZE; i++)
								   {
								      if (ClusterMember(rgp->rgpinfo.cluster, i))
								      {
								         /* Row of a cluster member: it may only reference
								          * nodes that are in rgp->rgpinfo.cluster.
								          */
								         if (!ClusterSubsetOf(rgp->rgpinfo.cluster,
								                              pkt->connectivity_matrix[i]))
								         {
								            return 10;
								         }
								         continue;
								      }

								      /* Row of a node that is not a cluster member: must be empty. */
								      if (!ClusterEmpty(pkt->connectivity_matrix[i]))
								      {
								         return 11;
								      }
								   }

								   return 0;
								}


								/************************************************************************
								 * rgp_print_packet
								 * =================
								 *
								 * Description:
								 *
								 *    Dumps the fields of a regroup packet through RGP_TRACE as four
								 *    trace records:
								 *
								 *      1. <label>: pkt->seqno, code, the packet's stage /
								 *         activatingnode / causingnode folded into one word, and
								 *         this node's own outerscreen and innerscreen (taken from
								 *         the global rgp, not from the packet).
								 *      2. "RGP CHK masks": rgp->rgpinfo.cluster and the packet's
								 *         quorumowner, knownstage1..5 and pruning_result.
								 *      3. "RGP CHK Con. matrix1": connectivity matrix rows 0..7.
								 *      4. "RGP CHK Con. matrix2": connectivity matrix rows 8..15.
								 *
								 *    NOTE(review): rows 0..15 are referenced explicitly, so this
								 *    presumably relies on the matrix having exactly 16 rows -
								 *    confirm before changing the maximum cluster size.
								 *
								 * Parameters:
								 *
								 *    rgp_pkt_t* pkt -
								 *       packet to be printed
								 *    char* label -
								 *       label to be printed together with a packet
								 *    int code -
								 *       a number to be printed together with a packet
								 *
								 * Returns:
								 *
								 *    VOID
								 *
								 ************************************************************************/
								void rgp_print_packet(rgp_pkt_t* pkt, char* label, int code)
								{
								   /* Record 1: packet identity plus the local node's screens. */
								   RGP_TRACE( label,
								              pkt->seqno,                               /* TRACE */
								              code,
								              (pkt->stage << 16) |
								              (pkt->activatingnode  << 8) |
								              (pkt->causingnode),                       /* TRACE */
								              RGP_MERGE_TO_32( rgp->outerscreen,
								                               rgp->innerscreen )
								               );

								   /* Record 2: the node-set masks examined by rgp_check_packet. */
								   RGP_TRACE( "RGP CHK masks       ",
								              RGP_MERGE_TO_32( rgp->rgpinfo.cluster,    /* TRACE */
								                               pkt->quorumowner ),      /* TRACE */
								              RGP_MERGE_TO_32( pkt->knownstage1,        /* TRACE */
								                               pkt->knownstage2 ),      /* TRACE */
								              RGP_MERGE_TO_32( pkt->knownstage3,        /* TRACE */
								                               pkt->knownstage4 ),      /* TRACE */
								              RGP_MERGE_TO_32( pkt->knownstage5,        /* TRACE */
								                               pkt->pruning_result ) ); /* TRACE */

								   /* Record 3: connectivity matrix rows 0..7, two rows per word. */
								   RGP_TRACE( "RGP CHK Con. matrix1",
								        RGP_MERGE_TO_32( pkt->connectivity_matrix[0],   /*TRACE*/
								                         pkt->connectivity_matrix[1] ), /*TRACE*/
								        RGP_MERGE_TO_32( pkt->connectivity_matrix[2],   /*TRACE*/
								                         pkt->connectivity_matrix[3] ), /*TRACE*/
								        RGP_MERGE_TO_32( pkt->connectivity_matrix[4],   /*TRACE*/
								                         pkt->connectivity_matrix[5] ), /*TRACE*/
								        RGP_MERGE_TO_32( pkt->connectivity_matrix[6],   /*TRACE*/
								                         pkt->connectivity_matrix[7])); /*TRACE*/

								   /* Record 4: connectivity matrix rows 8..15, two rows per word. */
								   RGP_TRACE( "RGP CHK Con. matrix2",
								        RGP_MERGE_TO_32( pkt->connectivity_matrix[8],   /*TRACE*/
								                         pkt->connectivity_matrix[9] ), /*TRACE*/
								        RGP_MERGE_TO_32( pkt->connectivity_matrix[10],  /*TRACE*/
								                         pkt->connectivity_matrix[11]), /*TRACE*/
								        RGP_MERGE_TO_32( pkt->connectivity_matrix[12],  /*TRACE*/
								                         pkt->connectivity_matrix[13]), /*TRACE*/
								        RGP_MERGE_TO_32( pkt->connectivity_matrix[14],  /*TRACE*/
								                         pkt->connectivity_matrix[15]));/*TRACE*/
								}


								/************************************************************************
								 * UnpackIgnoreScreen
								 * ==================
								 *
								 * Description:
								 *
								 *    Extracts the ignorescreen out of a regroup packet. The screen
								 *    is carried in the upper byte of the packet's reason field and
								 *    in its causingnode field; it is only present when reason is
								 *    at least RGP_EVT_IGNORE_MASK.
								 *
								 * Parameters:
								 *
								 *    rgp_pkt_t* from -
								 *       source packet
								 *    cluster_t to -
								 *       target node set; its first two bytes are overwritten
								 *
								 * Returns:
								 *
								 *    VOID
								 *
								 * Comments:
								 *
								 *   If the packet is received from an NT4 node, the unpacked
								 *   ignorescreen will always be 0.
								 *
								 ************************************************************************/
								void UnpackIgnoreScreen(rgp_pkt_t* from, cluster_t to)
								{
								/* Warning 4244 (narrowing conversion) is silenced for the byte
								 * assignments below and restored afterwards.
								 */
								#pragma warning( push )
								#pragma warning( disable : 4244 )
								    if (from->reason >= RGP_EVT_IGNORE_MASK) {
								        /* Packet carries an ignorescreen: byte 0 comes from the
								         * upper byte of reason, byte 1 from causingnode.
								         */
								        to[0] = ((uint16)from->reason) >> 8;
								        to[1] = (uint8)from->causingnode;
								    } else {
								        /* No ignorescreen in this packet: hand back a freshly
								         * initialized node set.
								         */
								        ClusterInit(to);
								    }
								#pragma warning( pop )
								}


								/************************************************************************
								 * PackIgnoreScreen
								 * ================
								 *
								 * Description:
								 *
								 *    Puts an ignorescreen back into a regroup packet. A non-empty
								 *    screen is stored as: reason = RGP_EVT_IGNORE_MASK in the low
								 *    byte with from[0] in the upper byte, causingnode = from[1].
								 *    An empty screen clears the upper byte of reason (the low
								 *    byte is kept) and zeroes causingnode.
								 *
								 * Parameters:
								 *
								 *    rgp_pkt_t* to -
								 *       packet to be updated
								 *    cluster_t from -
								 *       source node set
								 *
								 * Returns:
								 *
								 *    VOID
								 *
								 ************************************************************************/
								void PackIgnoreScreen(rgp_pkt_t* to, cluster_t from)
								{
								    if ( !ClusterEmpty(from) ) {
								        /* Something to ignore: encode the screen in the packet. */
								        to->reason = (uint8)RGP_EVT_IGNORE_MASK | (from[0] << 8);
								        to->causingnode = from[1];
								        return;
								    }

								    /* Nothing to ignore: drop any previously packed screen bits. */
								    to->reason &= 0xFF;
								    to->causingnode = 0;
								}


								/*---------------------------------------------------------------------------*/


								#ifdef __cplusplus

								}

								#endif /* __cplusplus */


								#if 0


								History of changes to this file:

								-------------------------------------------------------------------------

								1995, December 13                                           F40:KSK0610          /*F40:KSK06102.2*/


								This file is part of the portable Regroup Module used in the NonStop

								Kernel (NSK) and Loosely Coupled UNIX (LCU) operating systems. There

								are 10 files in the module - jrgp.h, jrgpos.h, wrgp.h, wrgpos.h,

								srgpif.c, srgpos.c, srgpsm.c, srgputl.c, srgpcli.c and srgpsvr.c.

								The last two are simulation files to test the Regroup Module on a

								UNIX workstation in user mode with processes simulating processor nodes

								and UDP datagrams used to send unacknowledged datagrams.


								This file was first submitted for release into NSK on 12/13/95.

								------------------------------------------------------------------------------

								This change occurred on 19 Jan 1996                                              /*F40:MB06458.1*/

								Changes for phase IV Sierra message system release. Includes:                    /*F40:MB06458.2*/

								 - Some cleanup of the code                                                      /*F40:MB06458.3*/

								 - Increment KCCB counters to count the number of setup messages and             /*F40:MB06458.4*/

								   unsequenced messages sent.                                                    /*F40:MB06458.5*/

								 - Fixed some bugs                                                               /*F40:MB06458.6*/

								 - Disable interrupts before allocating broadcast sibs.                          /*F40:MB06458.7*/

								 - Change per-packet-timeout to 5ms                                              /*F40:MB06458.8*/

								 - Make the regroup and powerfail broadcast use highest priority                 /*F40:MB06458.9*/

								   tnet services queue.                                                          /*F40:MB06458.10*/

								 - Call the millicode backdoor to get the processor status from SP               /*F40:MB06458.11*/

								 - Fixed expand bug in msg_listen_ and msg_readctrl_                             /*F40:MB06458.12*/

								 - Added enhancement to msngr_sendmsg_ so that clients do not need               /*F40:MB06458.13*/

								   to be unstoppable before calling this routine.                                /*F40:MB06458.14*/

								 - Added new steps in the build file called                                      /*F40:MB06458.15*/

								   MSGSYS_C - compiles all the message system C files                            /*F40:MB06458.16*/

								   MSDRIVER - compiles all the MSDriver files                                    /*F40:MB06458.17*/

								   REGROUP  - compiles all the regroup files                                     /*F40:MB06458.18*/

								 - remove #pragma env libspace because we set it as a command line               /*F40:MB06458.19*/

								   parameter.                                                                    /*F40:MB06458.20*/

								-----------------------------------------------------------------------          /*F40:MB06458.21*/


								#endif    /* 0 - change descriptions */