|
|
#ifdef __TANDEM
#pragma columns 79
#pragma page "srgpsm.c - T9050 - Regroup Module state machine routines"
#endif
/* @@@ START COPYRIGHT @@@
** Tandem Confidential: Need to Know only ** Copyright (c) 1995, Tandem Computers Incorporated ** Protected as an unpublished work. ** All Rights Reserved. ** ** The computer program listings, specifications, and documentation ** herein are the property of Tandem Computers Incorporated and shall ** not be reproduced, copied, disclosed, or used in whole or in part ** for any reason without the prior express written permission of ** Tandem Computers Incorporated. ** ** @@@ END COPYRIGHT @@@ **/
/*---------------------------------------------------------------------------
* This file (srgpsm.c) contains regroup state machine routines. *---------------------------------------------------------------------------*/
#ifdef __cplusplus
extern "C" { #endif /* __cplusplus */
#include <wrgp.h>
/*---------- arbitration algorithm ------------ */
/* Maximum time allowed for quorum-device arbitration, in seconds.
 * Initialized to the cluster-wide default; may be overridden by
 * configuration at startup. */
DWORD MmQuorumArbitrationTimeout = CLUSTER_QUORUM_DEFAULT_ARBITRATION_TIMEOUT; // seconds
/* Target duration for an arbitration attempt; nodes that win faster than
 * this sleep out the remainder so all attempts take roughly equal time
 * (see DiskArbitrationThread below). */
DWORD MmQuorumArbitrationEqualizer = 7; // seconds
/* Arbitration timeout converted from seconds to regroup clock ticks:
 * seconds * 1000ms / 300ms-per-tick == seconds * 100 / 30. */
#define RGP_ARBITRATION_TIMEOUT ((MmQuorumArbitrationTimeout * 100)/30) // tick == 300ms
#define AVERAGE_ARBITRATION_TIME_IN_SECONDS (MmQuorumArbitrationEqualizer)
/* Forward declarations for routines defined later in this file.
 * NOTE(review): regroup_restart() is defined below with static storage
 * class while this declaration is non-static -- confirm the
 * _priv/_resident macros reconcile the linkage. */
void enter_first_cleanup_stage(); void regroup_restart(); int ClusterEmpty(cluster_t c);
/* Thread routine that runs the quorum-device arbitration (defined below). */
DWORD DiskArbitrationThread( IN LPVOID param ) ;
/************************************************************************
 * regroup_test_arbitrate_advance
 * ==============================
 *
 * Description:
 *
 *    Decides whether the arbitration stage can be left behind.
 *
 * Parameters:
 *
 *    None
 *
 * Returns:
 *
 *    int - nonzero when advancement is allowed: either no node was
 *          pruned from the original membership (so no arbitration was
 *          needed), or at least one member of the surviving group has
 *          already reached stage 4, which proves our group owns the
 *          quorum.
 *
 ************************************************************************/
_priv _resident static int regroup_test_arbitrate_advance()
{
   cluster_t stage4_survivors;
   int nodes_before = ClusterNumMembers(rgp->rgpinfo.cluster);
   int nodes_after  = ClusterNumMembers(rgp->rgppkt.pruning_result);

   if (nodes_before != nodes_after)
   {
      //
      // Some nodes were pruned. Advance only if somebody in the
      // surviving group has already entered stage4 (i.e. our group
      // owns the quorum).
      //
      ClusterIntersection( stage4_survivors,
                           rgp->rgppkt.knownstage4,
                           rgp->rgppkt.pruning_result );
      return ClusterNumMembers(stage4_survivors) != 0;
   }

   /* Full membership survived -- nothing to arbitrate. */
   return 1;
}
/************************************************************************
 * regroup_start_arbitrate
 * =======================
 *
 * Description:
 *
 *    Called when stage 4 is reached. If no node was pruned from the
 *    original membership, arbitration is unnecessary and we advance
 *    directly to the first cleanup stage. Otherwise a single node is
 *    delegated to arbitrate for the quorum device: a surviving quorum
 *    owner if one exists, else the lowest-id survivor. Only the
 *    delegated node spawns DiskArbitrationThread; all others just wait
 *    in this stage.
 *
 * Parameters:
 *
 *    None
 *
 * Returns:
 *
 *    int - 0 when no arbitration is needed (we already advanced to
 *          cleanup); nonzero to remain in this stage while arbitration
 *          proceeds. The FALSE on the thread-creation failure path is
 *          unreachable in practice because RGP_ERROR halts the node.
 *
 ************************************************************************/
_priv _resident static int regroup_start_arbitrate() { int orig_numnodes = ClusterNumMembers(rgp->rgpinfo.cluster); int current_numnodes = ClusterNumMembers(rgp->rgppkt.pruning_result);
if( orig_numnodes == current_numnodes ) { enter_first_cleanup_stage(); return 0; // No Arbitration needed. Proceed to clean up stage //
} else { cluster_t arbitrators; int n_arbitrators; node_t arbitrator; HANDLE thread; DWORD threadId; ULONG epoch;
/* Capture the current event epoch and claim the arbitration under the
 * regroup lock; a second call in the same incident is a no-op. */
RGP_LOCK;
epoch = rgp->OS_specific_control.EventEpoch;
if(rgp->arbitration_started) { RGP_UNLOCK; return 1; // stay in this stage for awhile
}
rgp->arbitration_ticks = 0; rgp->arbitration_started = 1;
RGP_UNLOCK;
/* Which surviving nodes currently claim quorum ownership? */
ClusterIntersection( arbitrators, rgp->rgppkt.pruning_result, rgp->rgppkt.quorumowner );
n_arbitrators = ClusterNumMembers(arbitrators);
if(n_arbitrators == 0) { //
// If there are no quorum owners in this group //
// Let's take the guy with the lowest id //
//
arbitrator = rgp_select_tiebreaker(rgp->rgppkt.pruning_result); } else { //
// Otherwise we will take the quorum owner guy
// with the lowest id
//
arbitrator = rgp_select_tiebreaker(arbitrators);
if(n_arbitrators > 1) { RGP_TRACE( "RGP !!! More than one quorum owner", EXT_NODE(arbitrator), /* TRACE */ GetCluster( rgp->rgpinfo.cluster ), /* TRACE */ GetCluster( rgp->rgppkt.pruning_result ),/* TRACE */ GetCluster( rgp->rgppkt.knownstage2 ) ); /* TRACE */ // Do we need to kill all other arbitrators?
// No.
// ClusterDelete(arbitrators, arbitrator);
// ClusterUnion(
// rgp->poison_targets,
// rgp->poison_targets,
// arbitrators
// );
// rgp_broadcast(RGP_UNACK_POISON);
} }
rgp->tiebreaker = arbitrator;
//
// Now we have an arbitrating node
// We will run a thread that will run arbitration algorithm
//
RGP_TRACE( "RGP Arbitration Delegated to", EXT_NODE(arbitrator), /* TRACE */ GetCluster( rgp->rgpinfo.cluster ), /* TRACE */ GetCluster( rgp->rgppkt.pruning_result ), /* TRACE */ GetCluster( rgp->rgppkt.knownstage2 ) ); /* TRACE */
// Fix Bug #460991
// regroup_restart on stage 4 or later will reset ArbitratingNode
// and if all the nodes are present after restart ApproxArbitrationWinner
// will be not set properly. Assign it here.
/* Non-arbitrator nodes record the delegate and simply wait in this
 * stage; only the arbitrator itself continues to spawn the thread. */
rgp->OS_specific_control.ApproxArbitrationWinner = rgp->OS_specific_control.ArbitratingNode = (DWORD)EXT_NODE(arbitrator); if(arbitrator != rgp->mynode) { return 1; }
/* The captured epoch is passed to the thread so it can detect that a
 * newer regroup incident has restarted and abandon its work. */
thread = CreateThread( NULL, // security attributes
0, // stack_size = default
DiskArbitrationThread, ULongToPtr(epoch), 0, // runs immediately
&threadId ); if(thread == NULL) { //
// Force Others to regroup //
//
RGP_LOCK;
rgp_event_handler( RGP_EVT_BANISH_NODE, EXT_NODE(rgp->mynode) );
RGP_UNLOCK;
//
// Kill this node
//
RGP_ERROR(RGP_ARBITRATION_FAILED);
return FALSE; }
CloseHandle(thread); } return TRUE; }
/************************************************************************
 * DiskArbitrationThread
 * =====================
 *
 * Description:
 *
 *    Worker thread that arbitrates for the quorum device on behalf of
 *    this node. 'param' carries the event epoch captured when the
 *    thread was created; the epoch is re-checked at every step so the
 *    thread abandons its work if a new regroup incident has restarted
 *    in the meantime. On success it advances the node to the first
 *    cleanup stage; on failure it banishes and halts this node.
 *
 * Parameters:
 *
 *    param - starting EventEpoch, smuggled through the LPVOID.
 *
 * Returns:
 *
 *    DWORD - always 0 (thread exit code).
 *
 ************************************************************************/
DWORD DiskArbitrationThread( IN LPVOID param ) { cluster_t current_participants; DWORD status; int participant_count; int delay; ULONG_PTR startingEpoch = (ULONG_PTR) param; BOOL EpochsEqual; int orig_numnodes; int current_numnodes; LONGLONG Time1, Time2; ClusterCopy(current_participants, rgp->rgppkt.pruning_result); orig_numnodes = ClusterNumMembers(rgp->rgpinfo.cluster); current_numnodes = ClusterNumMembers(current_participants);
RGP_LOCK;
EpochsEqual = ( startingEpoch == rgp->OS_specific_control.EventEpoch );
RGP_UNLOCK;
if(!EpochsEqual) return 0;
/* Handicap delay: the fewer survivors this group has relative to half
 * of the original membership, the longer it waits before arbitrating,
 * giving larger groups first crack at the quorum device. Groups with
 * at least half the nodes start immediately (delay clamps to 0). */
delay = (orig_numnodes+1)/2 - current_numnodes;
if(delay < 0) delay = 0;
Sleep(delay * 6000);
RGP_LOCK;
/* Re-check the epoch under the lock and, still inside it, mark
 * arbitration as in progress so a restart can account for us. */
EpochsEqual = ( startingEpoch == rgp->OS_specific_control.EventEpoch ); if (EpochsEqual) { rgp->OS_specific_control.ArbitrationInProgress += 1; }
RGP_UNLOCK;
if(!EpochsEqual) return 0;
/* Time the arbitration callback so fast winners can be slowed down. */
GetSystemTimeAsFileTime((LPFILETIME)&Time1); status = (*(rgp->OS_specific_control.QuorumCallback))(); GetSystemTimeAsFileTime((LPFILETIME)&Time2);
if (status != 0 && startingEpoch == rgp->OS_specific_control.EventEpoch) { // If we won the arbitration and we are in the same epoch (approx check)
// we need to figure out whether we need to slow down a little
Time2 -= Time1;
// Convert to seconds
/* FILETIME is in 100ns units: /10 -> us, /1000 -> ms, /1000 -> s. */
Time2 = Time2 / 10 / 1000 / 1000; //
// [HACKHACK] GorN Oct/30/1999
// We had a weird timejump in the middle of the arbitration
// Arbitration was completed before it started, we slept for
// too long and regroup timed us out. Let's guard against it.
//
if ( (Time2 >= 0) && (Time2 < AVERAGE_ARBITRATION_TIME_IN_SECONDS) ) { //
// Don't need to be better than the average
// If we are so fast, let's slow down
//
Time2 = AVERAGE_ARBITRATION_TIME_IN_SECONDS - Time2; RGP_TRACE( "RGP sleeping", (ULONG)Time2, /* TRACE */ 0, /* TRACE */ 0, /* TRACE */ 0 ); /* TRACE */ Sleep( (ULONG)(Time2 * 1000) ); } }
RGP_LOCK;
rgp->OS_specific_control.ArbitrationInProgress -= 1;
EpochsEqual = ( startingEpoch == rgp->OS_specific_control.EventEpoch );
if(!EpochsEqual) { RGP_UNLOCK; return 0; }
if(status) { //
// We own the quorum device
// Let's proceed to the next stage
//
enter_first_cleanup_stage(); RGP_UNLOCK; //
// All the rest will see that we are in cleanup stage and
// will proceed to it too
//
} else { //
// Force Others to regroup //
//
rgp_event_handler( RGP_EVT_BANISH_NODE, EXT_NODE(rgp->mynode) ); RGP_UNLOCK;
//
// Kill this node
//
RGP_ERROR(RGP_ARBITRATION_FAILED); }
return 0; }
/************************************************************************
* rgp_check_packet * rgp_print_packet * ================= * * Description: * * Forward declarations of functions used in rgp_sanity_check macro * ************************************************************************/ void rgp_print_packet(rgp_pkt_t* pkt, char* label, int code); int rgp_check_packet(rgp_pkt_t* pkt);
/************************************************************************
 * rgp_sanity_check
 * ================
 *
 * Description:
 *
 *    Validates a regroup packet and, when rgp_check_packet reports a
 *    problem (nonzero code), prints the packet together with the given
 *    label and the failure code. Used to catch unreasonable values in
 *    the powerfail, knownstages, pruning_result, and
 *    connectivity_matrix fields.
 *
 * Parameters:
 *
 *    __pkt -
 *       rgp_pkt_t* packet to be checked
 *    __label -
 *       char* label that will be printed together with the packet
 *
 * Returns:
 *
 *    VOID
 *
 ************************************************************************/
#define rgp_sanity_check(__pkt,__label)                                 \
   do {                                                                 \
      int __code = rgp_check_packet(__pkt);                             \
      if ( __code != 0 )                                                \
      {                                                                 \
         rgp_print_packet(__pkt, __label, __code);                      \
      }                                                                 \
   } while ( 0 )
/*---------------------------------------------------------------------------*/
/************************************************************************
* split_brain_avoidance_algorithm * =============================== * * Description: * * This algorithm ensures that, after a regroup incident completes, * at most one group of nodes will survive regardless of connectivity * failures. * * Parameters: * * None * * Returns: * * void - no return value; The algorithm results in either this node * halting (with the RGP_AVOID_SPLIT_BRAIN halt code) or this group * being the only group that survives. * * Algorithm: * * The algorithm is described in detail in the Sierra Tech Memo S.84, * "Modifications in Regroup Algorithm for Sierra". * * The algorithm looks at the set of nodes currently visible from the * local cluster and compares it to the set of nodes alive before * the regroup incident started (outerscreen). The decision to survive * or halt depends on the number of nodes in the current group compared * to the number of nodes in the original group. * * Case 1: * If the current group contains > half the original number, this * group survives. * * Case 2: * If the current group contains < half the original number, this * node (and group) halts. * * Case 3: * If the current group contains exactly half the original number AND * the current group has at least two members, then this group * survives if and only if it contains the tie-breaker node (selected * when the cluster is formed and after each regroup incident). * * Case 4: * If the current group contains exactly half the original number AND * the current group has exactly one member, then we will call the * QuromSelect procedure to check if the Quorum Disk is accessible * from this node. If the procedure returns value TRUE we survive; * else we halt. * * ************************************************************************/ _priv _resident static void split_brain_avoidance_algorithm() { int orig_numnodes, current_numnodes;
RGP_TRACE( "RGP SpltBrainAlg", EXT_NODE(rgp->tiebreaker), /* TRACE */ GetCluster( rgp->rgpinfo.cluster ), /* TRACE */ GetCluster( rgp->outerscreen ), /* TRACE */ GetCluster( rgp->rgppkt.knownstage2 ) ); /* TRACE */
/* Sanity checks:
* 1. The current set of nodes must be a subset of the original set * of nodes. * 2. My node must be in the current set. This was checked * when stage2 was entered. No need to check again. */ if (!ClusterSubsetOf(rgp->rgpinfo.cluster, rgp->rgppkt.knownstage2)) RGP_ERROR(RGP_INTERNAL_ERROR);
orig_numnodes = ClusterNumMembers(rgp->rgpinfo.cluster); current_numnodes = ClusterNumMembers(rgp->rgppkt.knownstage2);
if (orig_numnodes == current_numnodes) /* All nodes are alive. No split brain possibility. */ return;
else if (orig_numnodes == 2) /* Special 2-node case */ { if ((*(rgp->OS_specific_control.QuorumCallback))()) return; /* we have access to Quorum disk. We survive. */ else { #if defined( NT )
ClusnetHalt( NmClusnetHandle ); #endif
RGP_ERROR(RGP_AVOID_SPLIT_BRAIN); } } /* Special 2-node case */
else /* Multi (>2) node case */ { if ((current_numnodes << 1) > orig_numnodes) /* Our group has more than half the nodes => we are the majority.
* We can survive. Other group(s) will kill themselves. */ return; else if ((current_numnodes << 1) < orig_numnodes) /* Our group has less than half the nodes => there may be a
* larger group alive. We must halt and allow that group to * survive. */ RGP_ERROR(RGP_AVOID_SPLIT_BRAIN); else { /* Our group has exactly half the number of processors;
* We survive if we contain the tie-breaker node and halt otherwise. */ if (ClusterMember(rgp->rgppkt.knownstage2, rgp->tiebreaker)) return; else RGP_ERROR(RGP_AVOID_SPLIT_BRAIN); } } /* Multi (>2) node case */
}
/************************************************************************
 * regroup_restart * =============== * * Description: * * Starts a new regroup incident. * * Parameters: * * None * * Returns: * * void - no return value * * Algorithm: * * Sets the regroup state to RGP_ACTIVATED, pauses all IO and * initializes the stage masks and connectivity matrix. * ************************************************************************/ _priv _resident static void regroup_restart() { cluster_t old_ignorescreen; UnpackIgnoreScreen(&rgp->rgppkt, old_ignorescreen);
RGP_TRACE( "RGP (re)starting", rgp->rgppkt.seqno, /* TRACE */ rgp->rgppkt.reason, /* TRACE */ rgp->rgppkt.activatingnode, /* TRACE */ rgp->rgppkt.causingnode ); /* TRACE */
RGP_TRACE( "RGP masks ", RGP_MERGE_TO_32( rgp->outerscreen, /* TRACE */ rgp->innerscreen ), /* TRACE */ RGP_MERGE_TO_32( rgp->rgppkt.knownstage1, /* TRACE */ rgp->rgppkt.knownstage2 ), /* TRACE */ RGP_MERGE_TO_32( rgp->rgppkt.knownstage3, /* TRACE */ rgp->rgppkt.knownstage4 ), /* TRACE */ RGP_MERGE_TO_32( rgp->rgppkt.knownstage5, /* TRACE */ rgp->rgppkt.pruning_result ) ); /* TRACE */
/* We are about to start a new pass of the regroup algorithm.
* This does not necessarily mean we have finished the previous * pass; i.e., in an abort situation we may be starting over. * This may occur when some other node fails during the current * pass through the algorithm leaving us hung up at one of the * intermediate stages. */
//
// GN. When we do MM_LEAVE. Our state is COLDLOADED.
// Bailing out of regroup_restart here would prevent us from
// forming a regroup packet that would initate a banishing regroup incident
//
/* To avoid split brained nodes from corrupting data in storage
* devices, we request the transport subsystem to hold all IO requests * in a queue and not transfer them over SNet. We will allow IO to * be resumed when regroup can guarantee that there can no longer be * split brains. This will be done when the final group is determined * and regroup enters the RGP_PHASE1_CLEANUP stage. */
rgp_hold_all_io();
/* The following is a bit of history from the NSK regroup algorithm from
* pre-Sierra systems based on the InterProcessor Bus (IPB). Some of * the particulars mentioned here have changed, but the principle remains. * * Previously, we used to mark all the known stages as zero, except for * stage1. We used to mark only ourselves as in stage1. So, even if our * bus reception logic is screwed up, and we are not receiving packets * from anybody including ourselves, we would mark ourselves as being in * stage1. And after (what used to be) six ticks, we would proceed into * stage2 and mark ourselves as being in stage2. This would cause stage1 * and stage2 to be equal, and our world would constitute just * ourselves. Thus we would go through regroup eliminating everybody * else. However, since we are not receiving packets from anybody else, * we would miss our own iamalive packets, and we too will soon die of * %4032. Thus the symptoms would constitute everybody else dying of * (%4040 + some node number), and that node dying with a %4032 halt. * See TPR S 88070112309628 for more details. * * To avoid this situation, we now do not mark ourselves as in a * particular stage until we get our own regroup packets indicating we * are in that stage. Thus, in regroup_restart, all the stages are * cleared. Previously, regroupbroadcaststatus in sendqueuedmessages * used to send directly from the regroup_control structures. * regroupbroadcaststatus has been modified to construct the unsequenced * packets on its stack. It would first copy the state from the * regroup_control structure, and then would LOR in our node into a known * stage, if requested to do so. When we receive that packet, we would * merge that information into our state, and thus we would be * guaranteed that our bus sending and reception logic is working, and * that we can legitimately mark ourselves as being in that stage. This * whole change avoids problems where bus sending logic works, but bus * reception logic is screwed up for both buses in a node. */
rgp->sendstage = 0; /* Don't let anyone know I am in stage 1 until
* I have seen a regroup clock tick; this is to * cause this node to halt if it is not getting * clock ticks. I will halt when the other nodes * advance without me and send me a status packet * indicating this or send me a poison packet * after declaring me down. */
/* Clear the tick counter and all per-stage check-in masks for the new
 * pass; they refill as nodes' packets arrive. */
rgp->rgpcounter = 0; ClusterInit(rgp->rgppkt.knownstage1); ClusterInit(rgp->rgppkt.knownstage2); ClusterInit(rgp->rgppkt.knownstage3); ClusterInit(rgp->rgppkt.knownstage4); ClusterInit(rgp->rgppkt.knownstage5); ClusterInit(rgp->rgppkt.pruning_result);
MatrixInit(rgp->rgppkt.connectivity_matrix); MatrixInit(rgp->internal_connectivity_matrix); /* Just for ease of debugging, to send in our poison packets, we keep
* the known nodes mask at the start of regroup. poison packets contain * known nodes at the beginning of regroup and at the end of it. */
ClusterCopy(rgp->initnodes, rgp->rgpinfo.cluster); ClusterInit(rgp->endnodes);
#if defined( NT )
//
// increment the event epoch so we can detect stale events
// from clusnet
//
++rgp->OS_specific_control.EventEpoch; #endif
if ( (rgp->rgppkt.stage >= RGP_CLOSING) && (rgp->rgppkt.stage <= RGP_PHASE2_CLEANUP) && ClusterCompare(rgp->rgppkt.knownstage1, rgp->rgppkt.knownstage2) ) { //
// If we were interrupted by this restart after we closed
// 1st stage regroup window, then no nodes can be added to group w/o joining.
//
// Thus we will add missing nodes into our ignorescreen.
// This will force the regroup not to wait for them in stage1
cluster_t tmp;
ClusterDifference(tmp, rgp->rgpinfo.cluster, rgp->innerscreen); ClusterUnion(rgp->ignorescreen, rgp->ignorescreen, tmp); }
if ( ClusterMember(rgp->ignorescreen, rgp->mynode) ) { // We shouldn't have get here, but since we are here
// Let's shield us from the outside world
RGP_TRACE( "Self Isolation", 0, 0, 0, 0 ); ClusterCopy(rgp->ignorescreen, rgp->rgpinfo.cluster); ClusterDelete(rgp->ignorescreen, rgp->mynode); }
if ( !ClusterEmpty(rgp->ignorescreen) ) { // if we are ignoring somebody we have
// to be cautious. I.e. we will stay longer in the
// first stage to give a chance to everybody to learn about
// our ignorescreen
rgp->cautiousmode = 1; } if ( !ClusterCompare(old_ignorescreen, rgp->ignorescreen) ) { // Ignore screen is changed, reset restart counter //
RGP_TRACE( "Ignorescreen->", GetCluster(old_ignorescreen), GetCluster(rgp->ignorescreen), 0, 0 ); rgp->restartcount = 0; } PackIgnoreScreen(&rgp->rgppkt, rgp->ignorescreen);
/* Reset per-incident arbitration state so stage 4 can arbitrate afresh. */
rgp->arbitration_started = 0;
rgp->OS_specific_control.ArbitratingNode = MM_INVALID_NODE; if ( !rgp_is_perturbed() ) { ResetEvent( rgp->OS_specific_control.Stabilized ); }
/* Rebuild the quorum-owner mask: only our own ownership is known
 * locally; other owners are learned from their packets. */
ClusterInit(rgp->rgppkt.quorumowner); if( QuorumOwner == (DWORD)EXT_NODE(rgp->mynode) ) { ClusterInsert(rgp->rgppkt.quorumowner, rgp->mynode); }
if (rgp->rgppkt.stage == RGP_COLDLOADED) { if (!rgp->OS_specific_control.ShuttingDown) { //
// Currently, RGP_RELOADFAILED calls ExitProcess
// During clean shutdown we would like to send the regroup packet
// out triggering a regroup. So we don't want to die.
//
// Since we are not resetting state to RGP_ACTIVATED, this
// node will not be able to participate in the regroup.
//
RGP_ERROR(RGP_RELOADFAILED); } } else { rgp->rgppkt.stage = RGP_ACTIVATED; }
}
/************************************************************************
 * regroup_test_stage2_advance
 * ===========================
 *
 * Description:
 *
 *    Determines whether this node may advance from stage 1 to
 *    regroup stage 2.
 *
 * Parameters:
 *
 *    None
 *
 * Returns:
 *
 *    int - 1 if stage 2 can be entered and 0 if not.
 *
 * Algorithm:
 *
 *    Stage 2 can be entered when any of the following holds:
 *
 *    (a) at least one regroup clock tick has occurred and every node
 *        we are waiting for has checked in;
 *    (b) we are not in cautious mode, all nodes but one have checked
 *        in, and more than the minimum number of stage-1 ticks
 *        have elapsed;
 *    (c) RGP_MUST_ENTER_STAGE2 ticks have elapsed, regardless of who
 *        has checked in.
 *
 ************************************************************************/
_priv _resident static int regroup_test_stage2_advance()
{
   cluster_t missing;      /* nodes that have not checked in yet */
   int       num_missing;  /* how many such nodes                */

   /* Never advance before the first regroup clock tick. */
   if (rgp->rgpcounter == 0)
   {
      return(0);
   }

   /* Unconditional advance once the hard tick limit is reached. */
   if (rgp->rgpcounter >= RGP_MUST_ENTER_STAGE2)
   {
      RGP_TRACE( "RGP S->2cautious", rgp->rgpcounter,          /* TRACE */
                 rgp->cautiousmode,                            /* TRACE */
                 GetCluster( rgp->outerscreen ),               /* TRACE */
                 GetCluster( rgp->rgppkt.knownstage1 ) );      /* TRACE */
      return(1);
   }

   /* Between 1 and RGP_MUST_ENTER_STAGE2 ticks: examine the stage-1
    * mask. Compute the nodes of the old configuration not yet
    * recognized, then drop any node on the ignore screen -- we cannot
    * receive packets from those, so there is no point waiting. */
   ClusterDifference(missing, rgp->outerscreen, rgp->rgppkt.knownstage1);
   ClusterDifference(missing, missing, rgp->ignorescreen);
   num_missing = ClusterNumMembers(missing);

   if (num_missing == 0)
   {
      /* All present and accounted for -- a false alarm, power failure,
       * or connectivity failure; advance at once. */
      RGP_TRACE( "RGP S->2 all in ", rgp->rgpcounter,          /* TRACE */
                 GetCluster( rgp->outerscreen ), 0, 0 );       /* TRACE */
      return(1);
   }

   /* A single straggler may be abandoned early, but only outside
    * cautious mode (no recent power failure, not an aborted rerun)
    * and only after more than Min_Stage1_ticks. That minimum is one
    * tick beyond the allowed consecutive missed-IamAlive period, so
    * any disconnected cluster will have detected the missing
    * IamAlives, started its own regroup, and paused IO -- preventing
    * split-brain data corruption. */
   if (!(rgp->cautiousmode) &&
       (num_missing == 1) &&
       (rgp->rgpcounter > rgp->rgpinfo.Min_Stage1_ticks))
   {
      RGP_TRACE( "RGP S->2 1 miss ", rgp->rgpcounter,             /* TRACE */
                 GetCluster( rgp->outerscreen ),                  /* TRACE */
                 GetCluster( rgp->rgppkt.knownstage1 ), 0 );      /* TRACE */
      return(1);
   }

   return(0);  /* sorry, cannot advance yet */
}
/************************************************************************
 * regroup_stage3_advance * =========================== * * Description: * * This function is called after the split brain avoidance algorithm * is run and the tie-breaker is selected in stage 2. It checks if * we can proceed to stage 3 (RGP_PRUNING) and advances to stage 3 * if possible. * * Parameters: * * None * * Returns: * * int - 1 if the regroup stage has been advanced to RGP_PRUNING; * 0 if the stage cannot be advanced yet. * * Algorithm: * * The algorithm depends on whether we are the tie-breaker or not. * * On the tie-breaker node, we first check if there are any * disconnects in the cluster. If there aren't any, there is no need * for pruning. We can then set pruning_result to knownstage2, * advance to the RGP_PRUNING stage and return 1. If there are * disconnects, we must wait a certain number of ticks to collect * connectivity info from all nodes. If the number of ticks have not * passed, return 0. If the required number of ticks have elapsed, * we must call the pruning algorithm to get the list of potential * groups. After that, the select_cluster() routine is called to * pick one from the set of possible clusters. After this is done, * pruning_result is set to the selected cluster and we return 1. * * On a non-tiebreaker node, nothing is done till a stage3 packet is * received from the tie-breaker node or another node which got a * stage 3 packet. If a stage 3 packet has not been received, we * simply return 0. If a stage 3 packet is received, RGP_PRUNING * stage is entered and we return 1. * ************************************************************************/ _priv _resident int regroup_stage3_advance() { int stage_advanced = 0, numgroups, groupnum;
if (rgp->tiebreaker == rgp->mynode) { if (connectivity_complete(rgp->rgppkt.connectivity_matrix)) {
/* No disconnects. All nodes in knownstage2 survive. */ rgp->rgppkt.stage = RGP_PRUNING;
ClusterCopy(rgp->rgppkt.pruning_result, rgp->rgppkt.knownstage2); stage_advanced = 1;
RGP_TRACE( "RGP S->3 NoPrune", rgp->rgpcounter, 0, 0, 0 ); }
/* There are disconnects; must wait for connectivity
* information to be complete. The info is deemed * complete after a fixed number of ticks have * elapsed. */
else if (rgp->pruning_ticks >= RGP_CONNECTIVITY_TICKS) { /* connectivity info collection complete; enter stage 3 */
RGP_TRACE( "RGP Con. matrix1", RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[0], /*TRACE*/ rgp->rgppkt.connectivity_matrix[1] ), /*TRACE*/ RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[2], /*TRACE*/ rgp->rgppkt.connectivity_matrix[3] ), /*TRACE*/ RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[4], /*TRACE*/ rgp->rgppkt.connectivity_matrix[5] ), /*TRACE*/ RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[6], /*TRACE*/ rgp->rgppkt.connectivity_matrix[7])); /*TRACE*/ RGP_TRACE( "RGP Con. matrix2", RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[8], /*TRACE*/ rgp->rgppkt.connectivity_matrix[9] ), /*TRACE*/ RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[10], /*TRACE*/ rgp->rgppkt.connectivity_matrix[11]), /*TRACE*/ RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[12], /*TRACE*/ rgp->rgppkt.connectivity_matrix[13]), /*TRACE*/ RGP_MERGE_TO_32( rgp->rgppkt.connectivity_matrix[14], /*TRACE*/ rgp->rgppkt.connectivity_matrix[15]));/*TRACE*/
/* Enumerate every maximal fully-connected candidate group. */
numgroups = find_all_fully_connected_groups( rgp->rgppkt.connectivity_matrix, rgp->tiebreaker, rgp->potential_groups);
/* Pick the surviving group: either via the select_cluster callback
 * registered at rgp_start() time, or -- when none was given -- via
 * regroup's own rgp_select_cluster_ex, keyed on the lowest-id
 * surviving quorum owner (RGP_NULL_NODE if no owner survived). */
if ((void *)rgp->select_cluster == RGP_NULL_PTR) { node_t keynode; cluster_t temp; ClusterIntersection( temp, rgp->rgppkt.knownstage2, rgp->rgppkt.quorumowner ); if ( ClusterEmpty(temp) ) { keynode = RGP_NULL_NODE; } else { keynode = rgp_select_tiebreaker(temp); } RGP_TRACE( "RGP keynode ng ", keynode, numgroups, 0, 0); /*TRACE*/ /* No callback specified; use regroup's own routine. */ groupnum = rgp_select_cluster_ex( rgp->potential_groups, numgroups, keynode); } else { /* Call routine specified at rgp_start() time. */ groupnum = (*(rgp->select_cluster))( rgp->potential_groups, numgroups); }
if (groupnum >= 0) ClusterCopy(rgp->rgppkt.pruning_result, rgp->potential_groups[groupnum]); else /* No group can survive. Can't halt yet.
* Need to tell everyone else. */ ClusterInit(rgp->rgppkt.pruning_result);
rgp->rgppkt.stage = RGP_PRUNING;
stage_advanced = 1;
RGP_TRACE( "RGP S->3 Pruned ", rgp->rgpcounter, /* TRACE */ GetCluster( rgp->rgppkt.knownstage2 ), /* TRACE */ GetCluster( rgp->rgppkt.pruning_result ), /* TRACE */ numgroups ); /* TRACE */
} /* connectivity info collection complete; enter stage 3 */
} /* tie-breaker node */
else
{ /* not tie-breaker node */
if (ClusterNumMembers(rgp->rgppkt.knownstage3) != 0) { /* We got a stage 3 packet from someone. Enter stage 3. */ rgp->rgppkt.stage = RGP_PRUNING;
stage_advanced = 1;
RGP_TRACE( "RGP Got S3 pkt ", rgp->rgpcounter, /* TRACE */ GetCluster( rgp->rgppkt.knownstage2 ), /* TRACE */ GetCluster( rgp->rgppkt.pruning_result ), /* TRACE */ GetCluster( rgp->rgppkt.knownstage3 ) ); /* TRACE */ }
} /* not tie-breaker node */
return(stage_advanced); }
/************************************************************************
 * enter_first_cleanup_stage * ========================= * * Description: * * This function performs the actions required when entering the * first of the message clean up stages. * * Parameters: * * None * * Returns: * * void - no return value * * Algorithm: * * There are many actions to be performed after the final cluster * is selected. The actions are described in comments throughout * this routine. * ************************************************************************/ _priv _resident void enter_first_cleanup_stage() { cluster_t banishees; node_t failer;
rgp->rgppkt.stage = RGP_PHASE1_CLEANUP;
RGP_TRACE( "RGP S->4 ", rgp->rgpcounter, 0, 0, 0 );
/* The packets we send now will not indicate we are in the phase 1
* cleanup stage yet. We indicate we are in this stage only after * we have completed the clean up action associated with the stage. * This is done in rgp_event_handler, under the * RGP_EVT_PHASE1_CLEANUP_DONE event. */ rgp->sendstage = 0;
/* Now, we can resume IO since we have passed the split brain danger.
* New split brain situations will result in regroup restarting and * pausing IO again. */
rgp_resume_all_io();
/* Compute in banishees the set of nodes being lost from the old
* configuration. */
ClusterDifference(banishees, rgp->rgpinfo.cluster, rgp->rgppkt.pruning_result);
/* Install the new configuration into the masks. */
ClusterCopy(rgp->outerscreen, rgp->rgppkt.pruning_result);
#if defined( NT )
/* Push the new outerscreen down to clusnet so it filters traffic from
 * banished nodes at the driver level. */
ClusnetSetOuterscreen( NmClusnetHandle, (ULONG)*((PUSHORT)rgp->outerscreen) ); #endif
ClusterCopy(rgp->innerscreen, rgp->rgppkt.pruning_result); ClusterCopy(rgp->endnodes, rgp->rgppkt.pruning_result); ClusterCopy(rgp->rgpinfo.cluster, rgp->rgppkt.pruning_result);
/* Select a new tiebreaker because the previous one may have been */ /* pruned out. Note: tiebreaker_selected has already been set in S2. */ rgp->tiebreaker = rgp_select_tiebreaker(rgp->rgppkt.pruning_result); /* F40 Bug FixID KCY0833 */
/* Mark the state of the banishees as dead and invoke the
* node down callback routine. */ for (failer = 0; failer < (node_t) rgp->num_nodes; failer++) if (ClusterMember(banishees, failer) || rgp->node_states[failer].status == RGP_NODE_COMING_UP // fix bug#265069
) { rgp->node_states[failer].status = RGP_NODE_DEAD; rgp->node_states[failer].pollstate = AWAITING_IAMALIVE; rgp->node_states[failer].lostHBs = 0;
#if !defined(NT)
(*(rgp->nodedown_callback))(EXT_NODE(failer)); #else
ClusnetSetNodeMembershipState(NmClusnetHandle, EXT_NODE( failer ), ClusnetNodeStateDead);
//
// On NT we do the nodedown callback at the end of stage 5.
// This allows the cleanup phases to complete before we let
// the "upper" layers know that a node went down.
//
if ( ClusterMember(rgp->OS_specific_control.CPUUPMASK,failer) ) ClusterInsert( rgp->OS_specific_control.NeedsNodeDownCallback, failer );
#endif // !defined(NT)
}
/* If some nodes have been lost from the configuration, then I will
* queue regroup status packets to them. This is a best efforts * attempt to ensure that they get quickly taken out if they * do in fact continue to run. */
ClusterUnion(rgp->status_targets, banishees, rgp->status_targets);
//
// In NT, we are using rgp->rgppkt.hadpowerfail to transmit
// quorum ownership information
//
#if !defined(NT)
/* I should inform the message system of any node that experienced a
* power on recovery. The message system can use this to clear error * counters so that a link will not be declared down due to errors * which may have been caused by the power failure. */
for (failer = 0; failer < (node_t) rgp->num_nodes; failer++) if ((ClusterMember(rgp->rgppkt.hadpowerfail, failer)) && !(ClusterMember(banishees, failer))) /* This survivor had a power failure. */ rgp_had_power_failure( EXT_NODE(failer) );
#endif // NT
/* Tell the OS to start clean up operations for the failed nodes. */ rgp_start_phase1_cleanup(); }
/************************************************************************
 * evaluatestageadvance
 * ====================
 *
 * Description:
 *
 *    This function evaluates whether additional state transitions are
 *    possible as a result of the info just received.
 *
 * Parameters:
 *
 *    None
 *
 * Returns:
 *
 *    void - no return value
 *
 * Algorithm:
 *
 *    To evaluate whether we can advance through the stages, a loop is
 *    used with a case entry for each stage. If an entry decides not to
 *    advance to the next stage, it must return from the function. If
 *    it does advance, it should not return but remain in the loop
 *    since it is possible to have cascaded stage transitions
 *    especially in a two node system. Thus, the loop is exited when no
 *    more stage transitions are possible.
 *
 ************************************************************************/
_priv _resident static void evaluatestageadvance()
{
   cluster_t temp_cluster;  /* scratch mask for intersection tests */
   node_t node;             /* loop index when scanning for a causing node */
   node_t i;                /* NOTE(review): not referenced in this routine */

   for (;;) /* loop until someone exits by returning */
   {
      switch (rgp->rgppkt.stage)
      {
         /* Never regrouped since load: a regroup incident at this point
          * is fatal unless we are deliberately shutting down. */
         case RGP_COLDLOADED :
         {
            if (!rgp->OS_specific_control.ShuttingDown)
            {
               RGP_ERROR(RGP_RELOADFAILED);
            }
            return;
         }

         case RGP_ACTIVATED :
         { /* evaluate whether to go to stage RGP_CLOSING */
            if (!regroup_test_stage2_advance())
               return;

            /* If we never saw our own stage-1 packet, our send or receive
             * apparatus is broken; halt rather than proceed. */
            if (!ClusterMember(rgp->rgppkt.knownstage1, rgp->mynode))
               RGP_ERROR(RGP_MISSED_POLL_TO_SELF);

            rgp->rgppkt.stage = RGP_CLOSING;
            rgp->rgpcounter = 0;
            rgp->tiebreaker_selected = 0;

            /* If we abort the regroup, and there's somebody that everybody
             * banished on this regroup, the following line keeps him from
             * joining up on the next regroup. */
            ClusterCopy(rgp->innerscreen, rgp->rgppkt.knownstage1);
            break;
         } /* evaluate whether to go to stage RGP_CLOSING */

         case RGP_CLOSING :
         { /* evaluate whether to go to stage RGP_PRUNING */
            /* Once a tiebreaker is chosen, only the stage-3 advance test
             * decides whether we can move on. */
            if (rgp->tiebreaker_selected)
            {
               if (regroup_stage3_advance())
                  break;   /* try to advance further */
               else
                  return;  /* cannot advance any more */
            }

            if (!ClusterCompare(rgp->rgppkt.knownstage1, rgp->rgppkt.knownstage2))
               return;

            //
            // In NT, we no longer use the split-brain avoidance algorithm.
            // We use a cluster-wide arbitration algorithm instead.
            //
#if !defined(NT)
            /* When the known stage 1 and known stage 2 sets are the
             * same, we have the complete set of nodes that are
             * connected to us. It is time to execute the split-
             * brain avoidance algorithm. If we are a splinter group
             * cut off from the main group, we will not survive this
             * algorithm. */
            split_brain_avoidance_algorithm();
#endif // NT

            /* We are the lucky survivors of the split brain avoidance
             * algorithm. Now, we must proceed to elect a new tie-breaker
             * since the current tie-breaker may no longer be with us. */
            rgp->tiebreaker = rgp_select_tiebreaker(rgp->rgppkt.knownstage2);
            rgp->tiebreaker_selected = 1;

            RGP_TRACE( "RGP S2 tiebr sel", rgp->rgpcounter,       /* TRACE */
                       EXT_NODE(rgp->tiebreaker),                 /* TRACE */
                       0, 0 );                                    /* TRACE */
            rgp->pruning_ticks = 0;
            break;
         } /* evaluate whether to go to stage 3 */

         case RGP_PRUNING :
         { /* evaluate whether to go to RGP_PHASE1_CLEANUP stage */
            /* Arbitration already launched: either finish it and enter
             * the first cleanup stage, or keep waiting. */
            if (rgp->arbitration_started)
            {
               if (regroup_test_arbitrate_advance())
               {
                  enter_first_cleanup_stage();
                  break;
               }
               else
               {
                  return; // Stay in this stage //
               }
            }

            if (rgp->has_unreachable_nodes)
            {
               RGP_TRACE( "RGP Unreach Node",
                          GetCluster( rgp->rgppkt.pruning_result ),  /* TRACE */
                          GetCluster( rgp->unreachable_nodes ),
                          0, 0 );                                    /* TRACE */

               /* Must check if the unreachable nodes are in the
                * selected final group. If so, we must restart regroup. */
               ClusterIntersection(temp_cluster,
                                   rgp->unreachable_nodes,
                                   rgp->rgppkt.pruning_result);

               /* Clear the unreachable node mask and flag after examining
                * them. If we restart, we will start with a clean slate. */
               rgp->has_unreachable_nodes = 0;
               ClusterInit(rgp->unreachable_nodes);

               if (ClusterNumMembers(temp_cluster) != 0)
               {
                  /* We have a node unreachable event to a node selected
                   * to survive. We must regenerate the connectivity
                   * matrix and re-run the node pruning algorithm.
                   * Start a new regroup incident.
                   * All restarts are in cautious mode. */
                  rgp->cautiousmode = 1;
                  rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
                  rgp->rgppkt.reason = RGP_EVT_NODE_UNREACHABLE;
                  rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);

                  /* For causingnode, pick the first unreachable node
                   * in temp_cluster. */
                  for (node = 0; node < (node_t) rgp->num_nodes; node++)
                  {
                     if (ClusterMember(temp_cluster, node))
                     {
                        rgp->rgppkt.causingnode = (uint8) EXT_NODE(node);
                        break;
                     }
                  }
                  regroup_restart();
                  return;
               }
            }

            if (!ClusterCompare(rgp->rgppkt.knownstage2, rgp->rgppkt.knownstage3))
               return;

            /* All nodes in the connected cluster have been notified
             * of the pruning decision (entered stage 3). If we are
             * selected to survive, we can now enter stage 4. If we are
             * not in the selected group (pruning_result), we must halt.
             * Wait for at least one node in PRUNING_RESULT to get into
             * stage 4 before halting. This ensures that the algorithm
             * does not stall in stage 3 with all pruned out nodes
             * halting before ANY of the survivors finds that all nodes
             * entered stage 3. */
            if (!ClusterMember(rgp->rgppkt.pruning_result, rgp->mynode))
            {
               /* Wait for at least one node in PRUNING_RESULT
                * to get into stage 4 before halting. Since only
                * nodes in PRUNING_RESULT get into stage 4, it is
                * sufficient to check if knownstage4 has any members. */
               if (ClusterNumMembers(rgp->rgppkt.knownstage4) != 0)
                  RGP_ERROR(RGP_PRUNED_OUT);
               return;
            }

            // proceed to second stage of pruning - arbitration
            if( regroup_start_arbitrate() )
            {
               return;  // stay in this stage
            }
            else
            {
               break;   // either proceed to the next, or restart
            }
            /* NOTE(review): the break below is unreachable; both branches
             * above leave the case. Kept for fidelity. */
            break;
         } /* evaluate whether to go to RGP_PHASE1_CLEANUP stage */

         case RGP_PHASE1_CLEANUP :
         { /* evaluate whether to go to RGP_PHASE2_CLEANUP stage */
            if (!ClusterCompare(rgp->rgppkt.pruning_result, rgp->rgppkt.knownstage4))
               return;

            rgp->rgppkt.stage = RGP_PHASE2_CLEANUP;
            RGP_TRACE( "RGP S->5 ", rgp->rgpcounter, 0, 0, 0 );

            /* The packets we send now will not indicate we are in the phase 2
             * cleanup stage yet. We indicate we are in this stage only after
             * we have completed the clean up action associated with the stage.
             * This is done in rgp_event_handler, under the
             * RGP_EVT_PHASE2_CLEANUP_DONE event. */
            rgp->sendstage = 0;
            rgp_start_phase2_cleanup();
            break;
         } /* evaluate whether to go to RGP_PHASE2_CLEANUP stage */

         case RGP_PHASE2_CLEANUP :
         { /* evaluate whether to go to RGP_STABILIZED stage */
            if (!ClusterCompare(rgp->rgppkt.knownstage4, rgp->rgppkt.knownstage5))
               return;

            RGP_LOCK;
            //
            // [HACKHACK] This is not necessary anymore, since we
            // are holding the lock in message.c when delivering
            // regroup packet received event
            //
            /* NOTE(review): if this race branch is ever taken, break exits
             * the switch with RGP_LOCK still held and the loop re-enters
             * the switch; the HACKHACK note above says the race can no
             * longer occur — confirm before removing. */
            if (RGP_PHASE2_CLEANUP != rgp->rgppkt.stage)
            {
               RGP_TRACE( "RGP S->6 (race) ", rgp->rgpcounter, rgp->rgppkt.stage, 0, 0 );
               break;
            }

            rgp->rgppkt.stage = RGP_STABILIZED;
            RGP_TRACE( "RGP S->6 ", rgp->rgpcounter, 0, 0, 0 );

            rgp->rgpcounter = 0;
            rgp->restartcount = 0;

            /* Reset the regroup flags which have not yet been cleared. */
            rgp->cautiousmode = 0;

            /* Clear the mask indicating nodes which own the quorum resrc. */
            ClusterInit(rgp->rgppkt.quorumowner);

            /* Copy the sequence number into the rgpinfo area. */
            rgp->rgpinfo.seqnum = rgp->rgppkt.seqno;

            SetEvent( rgp->OS_specific_control.Stabilized );

            if (rgp->OS_specific_control.ArbitratingNode != MM_INVALID_NODE)
            {  // Somebody was arbitrating //
               rgp->OS_specific_control.ApproxArbitrationWinner =
                  rgp->OS_specific_control.ArbitratingNode;
               if (rgp->OS_specific_control.ArbitratingNode ==
                   (DWORD)EXT_NODE(rgp->mynode))
               {
                  //
                  // [HackHack] To close 422405
                  // when 421828 is fixed, please uncomment the following line
                  //
                  // QuorumOwner = rgp->OS_specific_control.ArbitratingNode;
               }
               else
               {
                  if (QuorumOwner != MM_INVALID_NODE)
                  {
                     ClRtlLogPrint(LOG_UNUSUAL,
                        "[MM] : clearing quorum owner var (winner is %1!u!), %.\n",
                        rgp->OS_specific_control.ArbitratingNode );
                  }
                  QuorumOwner = MM_INVALID_NODE;
               }
            }

            rgp_cleanup_complete();

#if defined(NT)
            //
            // On NT we deferred doing the node down callback until all the
            // cleanup phases have been complete.
            //
            ClusterCopy( rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster );
            (*(rgp->nodedown_callback))( rgp->OS_specific_control.NeedsNodeDownCallback );
            //
            // Clear the down node mask
            //
            ClusterInit(rgp->OS_specific_control.NeedsNodeDownCallback);
            //
            // finally, tell clusnet that regroup has finished
            //
            ClusnetRegroupFinished(NmClusnetHandle,
                                   rgp->OS_specific_control.EventEpoch,
                                   rgp->rgppkt.seqno);
            rgp->last_stable_seqno = rgp->rgppkt.seqno;
            RGP_UNLOCK;
#endif
            return;
         } /* evaluate whether to go to RGP_STABILIZED stage */

         case RGP_STABILIZED :
            return; /* stabilized, so I am all done */

         default :
            RGP_ERROR(RGP_INTERNAL_ERROR); /* unknown stage */

      } /* switch (rgp->rgppkt.stage) */
   } /* loop until someone exits by returning */
}
/************************************************************************
 * rgp_event_handler
 * =================
 *
 * Description:
 *
 *    The state machine and the heart of the regroup algorithm.
 *
 * Parameters:
 *
 *    int event -
 *       which event happened
 *
 *    node_t causingnode -
 *       node causing the event: node which sent a regroup status
 *       packet or whose IamAlives are missed; if the causing node is
 *       not relevant information, RGP_NULL_NODE can be passed and
 *       is ignored. *This node ID is in external format.*
 *
 * Returns:
 *
 *    void - no return value
 *
 * Algorithm:
 *
 *    The state machine is the heart of the regroup algorithm.
 *    It is organized as a switch statement with the regroup stage as
 *    the case label and the regroup event as the switch variable.
 *    Events could cause regroup to start a new incident, to advance
 *    through stages or to update information without advancing to
 *    another stage. This routine also arranges for regroup status
 *    packets to be sent to all relevant nodes including our own
 *    node.
 *
 ************************************************************************/
_priv _resident void RGP_EVENT_HANDLER_EX(int event, node_t causingnode, void *arg)
{
   rgp_pkt_t *rcvd_pkt_p;         /* incoming packet (RECEIVED_PACKET only) */
   cluster_t ignorescreen_rcvd;   /* sender's ignore screen, unpacked */
   uint8 oldstage;                /* stage before evaluatestageadvance() */
   int send_status_pkts = 0;      /* set when a broadcast is warranted */

   /* Note: arg is only used when event == RGP_EVENT_RECEIVED_PACKET.
      It is the ptr to the packet */

   /* Trace unusual invocations of this routine. */
   if (event != RGP_EVT_RECEIVED_PACKET && event != RGP_EVT_CLOCK_TICK)
      RGP_TRACE( "RGP Event       ", event, causingnode,
                 rgp->rgppkt.stage, rgp->rgpcounter );       /* TRACE */

   switch (event)
   {
      case RGP_EVT_NODE_UNREACHABLE :
      { /* All paths to a node are unreachable */

         /* Ignore the event if the unreachable node has been eliminated
          * from our outerscreen. The message system probably doesn't
          * know it yet. */
         if (ClusterMember(rgp->outerscreen, INT_NODE(causingnode)))
         {
            /* Store this event and check after node pruning (when
             * entering the RGP_PRUNING stage). If a regroup incident
             * is in progress and we haven't entered the RGP_PRUNING
             * stage yet, this will happen in the current incident.
             * If not, it will happen in the next regroup incident
             * which will surely start soon due to this disconnect.
             *
             * We do not start a regroup incident for this event. We will
             * wait for IamAlives to be missed for starting a new regroup
             * incident. This is due to the requirement that, in case
             * of a total disconnect resulting in multiple groups, we must
             * stay in stage 1 till we can guarantee that the other group(s)
             * has started regroup and paused IO. We assume that the
             * regroup incident started at the IamAlive check tick and
             * use the periodic nature of the IamAlive sends and
             * IamAlive checks to limit the stage1 pause to the period
             * of IamAlive sends (+ 1 tick to drain IO). If we started
             * a regroup incident due to the node unreachable event, we
             * have to stay in stage1 longer. */
            rgp->has_unreachable_nodes = 1;
            ClusterInsert(rgp->unreachable_nodes, INT_NODE(causingnode));
            break;
         }
         /* NOTE(review): when the node is NOT in the outerscreen, control
          * falls through into RGP_EVT_PHASE1_CLEANUP_DONE below (the break
          * above sits inside the if). That next case is guarded by a stage
          * check, so this is likely benign, but confirm it is intended. */
      } /* All paths to a node are unreachable */

      case RGP_EVT_PHASE1_CLEANUP_DONE :
      {
         /* The following checks are needed in case we restarted
          * regroup and asked for phase1 cleanup multiple times.
          * We must make sure that all such requests have been
          * completed. */
         if ( (rgp->rgppkt.stage == RGP_PHASE1_CLEANUP) &&
              (rgp->rgp_msgsys_p->phase1_cleanup == 0) )
         { /* all caught up */
            /* Let others and ourselves get packets indicating we are in
             * this stage. When we get that packet, we will update our
             * knownstage field. If our sending or receiving apparatus
             * failed meanwhile and we don't get our own packet, it
             * will cause regroup to be restarted. */
            rgp->sendstage = 1;
            send_status_pkts = 1;
            evaluatestageadvance();
         } /* all caught up */
         break;
      }

      case RGP_EVT_PHASE2_CLEANUP_DONE :
      {
         /* The following checks are needed in case we restarted
          * regroup and asked for phase2 cleanup multiple times.
          * We must make sure that all such requests have been
          * completed. */
         if ( (rgp->rgppkt.stage == RGP_PHASE2_CLEANUP) &&
              (rgp->rgp_msgsys_p->phase2_cleanup == 0) )
         { /* all caught up */
            /* Let others and ourselves get packets indicating we are
             * in this stage. */
            rgp->sendstage = 1;
            send_status_pkts = 1;
            evaluatestageadvance();
         } /* all caught up */
         break;
      }

      case RGP_EVT_LATEPOLLPACKET :
      { /* some node is late with IamAlives */
         RGP_LOCK; // to ensure that the packet receive does not initiate
                   // regroup asynchronously.

         /* Start a new regroup incident if not already active. */
         if (rgp->rgppkt.stage == RGP_STABILIZED)
         {
            rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
            rgp->rgppkt.reason = RGP_EVT_LATEPOLLPACKET;
            rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);
            rgp->rgppkt.causingnode = (uint8) causingnode;
            regroup_restart();
            send_status_pkts = 1;
         }
         else if (rgp->rgppkt.stage == RGP_COLDLOADED)
         {
            RGP_ERROR(RGP_RELOADFAILED);
         }
         RGP_UNLOCK;
         break;
      } /* some node is late with IamAlives */

      case MM_EVT_LEAVE:
         /* Graceful leave: flag shutdown, then banish ourselves. */
         rgp->OS_specific_control.ShuttingDown = TRUE;
         /* intentional fall through to RGP_EVT_BANISH_NODE */
      case RGP_EVT_BANISH_NODE :
      { /* assumes that the lock is held */
         rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
         rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);
         // Pack Ignore Screen in the regroup_restart will
         // fill reason and causingnode fields of the packet
         ClusterInsert(rgp->ignorescreen, INT_NODE(causingnode) );
         regroup_restart();
         send_status_pkts = 1;
         break;
      }
#if 0
      case MM_EVT_LEAVE: // this node needs to leave the cluster gracefully
      { // Initiate a Regroup Event amongst remaining members if any
         // Start a new regroup incident if not already active.
         if (rgp->rgppkt.stage == RGP_STABILIZED)
         {
            rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
            rgp->rgppkt.reason = MM_EVT_LEAVE;
            rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);
            rgp->rgppkt.causingnode = (uint8) EXT_NODE(rgp->mynode);
            regroup_restart();
            send_status_pkts = 1;
         }
         break;
      }
#endif

      case RGP_EVT_CLOCK_TICK :
      { /* called on regroup clock tick when regroup is active */

         /* While arbitration is running, only count arbitration ticks and
          * enforce the arbitration timeout. */
         if( (rgp->rgppkt.stage == RGP_PRUNING) && (rgp->arbitration_started) )
         {
            rgp->arbitration_ticks++;
            if (rgp->arbitration_ticks >= RGP_ARBITRATION_TIMEOUT)
            {
               //
               // Kill timed-out arbitrator
               //
               if(rgp->tiebreaker == rgp->mynode)
               {
                  //
                  // If this node was arbitrating, then die
                  //
                  if ( IsDebuggerPresent() ) { DebugBreak(); }
                  RGP_ERROR(RGP_ARBITRATION_STALLED);
               }
               else
               {
                  //
                  // Kill the arbitrator and initiate another regroup
                  //
                  RGP_TRACE( "RGP arbitration stalled ", rgp->rgppkt.stage, 0, 0, 0 );
                  rgp_event_handler( RGP_EVT_BANISH_NODE, EXT_NODE(rgp->tiebreaker) );
                  break;
               }
            }
            evaluatestageadvance();
            //
            // No need to send packets while we are waiting for
            // the arbitrator to win
            //
            // send_status_pkts = rgp->rgppkt.stage != RGP_PRUNING;
            //
            // [GN] Wrong. We do have to send status packets.
            // If we have partial connectivity, we need to
            // continue exchanging packets, so that the pruner,
            // can learn indirectly that all nodes got the pruning results.
            //
            send_status_pkts = 1;
            break;
         }
         else
         {
            rgp->rgpcounter++; /* increment the counter */
         }

         if ( (rgp->rgppkt.stage == RGP_ACTIVATED) && (rgp->sendstage == 0) )
         {
            /* To detect the potential failure of my timer pop mechanism
             * (such as by the corruption of the time list), I wait for
             * at least one regroup clock tick before I let myself and
             * others know I am in stage 1. */
            // [GorN Jan14/2000]
            // We don't send our connectivity information,
            // before we get the first clock tick.
            // However we collect this information in
            // rgp->internal_connectivity_matrix.
            // Let's put it in the outgoing packet
            // so that everybody will see what we think about them.
            MatrixOr(rgp->rgppkt.connectivity_matrix,
                     rgp->internal_connectivity_matrix);
            rgp->sendstage = 1; /* let everyone know we are in stage 1 */
         }
         else if ( (rgp->rgppkt.stage >= RGP_CLOSING) &&
                   (rgp->rgppkt.stage <= RGP_PHASE2_CLEANUP) )
         { /* check for possible abort and restart */
            if (rgp->rgpcounter >= RGP_MUST_RESTART)
            {
               /* Stalled out. Probably someone died after starting
                * or another node is still in stage 1 cautious mode */
               if ( ++(rgp->restartcount) > RGP_RESTART_MAX )
               {
                  // It is not a good idea to die, because somebody
                  // is stalling. Let's add stallees into ignore mask and restart
                  //
                  // RGP_ERROR(RGP_INTERNAL_ERROR); // [Fixed]
                  cluster_t tmp, *stage;
                  /* NOTE(review): no default arm; `stage` stays
                   * uninitialized if the stage were out of range. The
                   * enclosing else-if restricts us to these four stages,
                   * so this is currently unreachable — but fragile. */
                  switch (rgp->rgppkt.stage)
                  {
                     case RGP_CLOSING:         stage = &rgp->rgppkt.knownstage2; break;
                     case RGP_PRUNING:         stage = &rgp->rgppkt.knownstage3; break;
                     case RGP_PHASE1_CLEANUP:  stage = &rgp->rgppkt.knownstage4; break;
                     case RGP_PHASE2_CLEANUP:  stage = &rgp->rgppkt.knownstage5; break;
                  }
                  /* Stallees = configured cluster minus those known to have
                   * reached the current stage. */
                  ClusterDifference(tmp, rgp->rgpinfo.cluster, *stage);
                  //
                  // If we stalled during closing, due to tiebraker running
                  // the pruning algorithn going bunkers, we can have tmp = 0
                  // In this case, we need to ignore somebody to guarantee that
                  // the algorithm completes.
                  //
                  if ( ClusterEmpty(tmp) && rgp->tiebreaker_selected)
                  {
                     ClusterInsert(tmp, rgp->tiebreaker);
                  }
                  ClusterUnion(rgp->ignorescreen, rgp->ignorescreen, tmp);
               }

               /* If we are stalling in stage 3 and we have been pruned out,
                * it is possible that we are stalling because we have been
                * isolated from all other nodes. We must halt in this case. */
               if ( (rgp->rgppkt.stage == RGP_PRUNING) &&
                    !ClusterMember(rgp->rgppkt.pruning_result, rgp->mynode) )
                  RGP_ERROR(RGP_PRUNED_OUT);

               rgp->cautiousmode = 1;
               rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
               RGP_TRACE( "RGP stalled ", rgp->rgppkt.stage, 0, 0, 0 );
               regroup_restart();
            } /* Stalled out ... */
         } /* check for possible abort and restart */

         if ((rgp->rgppkt.stage == RGP_CLOSING) && rgp->tiebreaker_selected)
            rgp->pruning_ticks++;

         evaluatestageadvance();
         send_status_pkts = 1; /* send rgp packets regardless of progress */
         break;
      } /* called on regroup clock tick when regroup is active */

      case RGP_EVT_RECEIVED_PACKET :
      { /* received an rgp packet */

         /* If the sending node is excluded by the outer screen, then it is
          * not even part of the current (most recently known) configuration.
          * Therefore the packet should not be honored, and a poison message
          * should be sent to try to kill this renegade processor.
          * That is done in the calling routine that processes all incoming
          * regroup module packets (IamAlive, regroup and poison packets). */

         /* If the sending node was accepted by the outer screen but then
          * excluded by the inner screen, then the packet will be disregarded
          * but no poison message sent. This phenomenon may occur when this
          * node has entered stage 2 without having heard from (recognized)
          * the sending node and then a message arrives late from that
          * sending node. In this case the fate of the sending node, i.e.
          * whether it gets ruled out of the global configuration or not is
          * unknown at this point. If the sender can get itself recognized
          * by some node before that node enters stage 2, then it will be
          * saved. Otherwise it will be declared down and subsequently shot
          * with poison packets if it ever tries to assert itself. */

         /* Remember the arg to this routine is the packet pointer */
         rcvd_pkt_p = (rgp_pkt_t *)arg; /* address of pkt just received */
         if ( rgp->rgppkt.seqno != rcvd_pkt_p->seqno)
            RGP_TRACE( "RGP Event ", event, causingnode,
                       rgp->rgppkt.stage, rgp->rgpcounter ); /* TRACE */

         UnpackIgnoreScreen(rcvd_pkt_p, ignorescreen_rcvd);
         if ( !ClusterEmpty(ignorescreen_rcvd) )
         {
            RGP_TRACE( "RGP Incoming pkt", GetCluster(ignorescreen_rcvd),
                       rcvd_pkt_p->seqno, rgp->rgppkt.stage, causingnode);
         }

         if ( !ClusterMember(rgp->innerscreen, INT_NODE(causingnode)))
         {
            RGP_TRACE( "RGP Ignoring !inner", causingnode, rgp->rgppkt.stage,
                       GetCluster(rgp->innerscreen), GetCluster(ignorescreen_rcvd) );
            return;
         }

         RGP_LOCK; // To ensure that the timer thread does not initiate
                   // regroup asynchronously at this time.

         //////////////////////////// New Ignore Screen Stuff /////////////////////////////////

         /* Sender is on our ignore screen: drop the packet. */
         if (ClusterMember(rgp->ignorescreen, INT_NODE(causingnode) ))
         {
            RGP_UNLOCK;
            RGP_TRACE( "RGP Ignoring", causingnode, rgp->rgppkt.stage,
                       GetCluster(rgp->ignorescreen), GetCluster(ignorescreen_rcvd) );
            return;
         }

         if (rcvd_pkt_p->seqno < rgp->last_stable_seqno )
         {
            RGP_UNLOCK;
            RGP_TRACE( "RGP old packet", causingnode,
                       rcvd_pkt_p->seqno, rgp->last_stable_seqno, 0);
            // This is a late packet from the previous regroup incident
            // from the node that is currently in my outerscreen.
            // This node could not have sent it now, this is probably a packet
            // that stuck somewhere and was delieverd eons later.
            // Simply ignore it.
            return;
         }

         if ( ClusterMember(ignorescreen_rcvd, rgp->mynode ) )
         {
            //
            // Sender ignores me. We will do the same to him.
            //
            ClusterInsert(rgp->ignorescreen, INT_NODE(causingnode) );
            rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
            regroup_restart();
            send_status_pkts = 1;
            RGP_UNLOCK;
            break;
         }

         if ( ClusterCompare(ignorescreen_rcvd, rgp->ignorescreen) )
         {
            // We have the same ignore screen.
            // No work needs to be done
         }
         else if ( ClusterSubsetOf(rgp->ignorescreen, ignorescreen_rcvd) )
         {
            // Incoming packet has smaller ignore screen
            // Ignore this packet, but reply to its sender with
            // our current regroup packet to force to upgrade to
            // our view of the world.
            // do so only if we are properly initialized
            if (rgp->rgppkt.stage == RGP_COLDLOADED && !rgp->OS_specific_control.ShuttingDown)
            {
               RGP_ERROR(RGP_RELOADFAILED);
            }
            RGP_TRACE( "RGP smaller ignore mask ",
                       GetCluster(ignorescreen_rcvd),
                       GetCluster(rgp->ignorescreen),        /* TRACE */
                       rgp->rgppkt.stage, rcvd_pkt_p->stage ); /* TRACE */
            ClusterInsert(rgp->status_targets, INT_NODE(causingnode));
            rgp_broadcast(RGP_UNACK_REGROUP);
            RGP_UNLOCK;
            return;
         }
         else if ( ClusterSubsetOf(ignorescreen_rcvd, rgp->ignorescreen) )
         {
            RGP_TRACE( "RGP bigger ignore mask ",
                       GetCluster(ignorescreen_rcvd),
                       GetCluster(rgp->ignorescreen),        /* TRACE */
                       rgp->rgppkt.stage, causingnode );     /* TRACE */
            // Incoming packet has bigger ignore screen.
            // Upgrade to this information and process the packet
            rgp->rgppkt.seqno = rcvd_pkt_p->seqno;
            /* Somebody else activated regroup. So, let's just copy */
            /* the sender's reason code and reason nodes. */
            //
            // Ignore mask parts are in the reason and activatingnode fields
            //
            ClusterCopy(rgp->ignorescreen, ignorescreen_rcvd); // fix bug #328216
            rgp->rgppkt.reason = rcvd_pkt_p->reason;
            rgp->rgppkt.activatingnode = rcvd_pkt_p->activatingnode;
            rgp->rgppkt.causingnode = rcvd_pkt_p->causingnode;
            regroup_restart();
            send_status_pkts = 1;
         }
         else
         {
            RGP_TRACE( "RGP different ignore masks ",
                       GetCluster(ignorescreen_rcvd),
                       GetCluster(rgp->ignorescreen),        /* TRACE */
                       rgp->rgppkt.stage, causingnode );     /* TRACE */
            // Ignore masks are different and neither of them is
            // a subset of another.
            //
            // We need to merge information out of these masks
            // and restart the regroup.
            //
            // Packet that we just received will be ignored
            ClusterUnion(rgp->ignorescreen, rgp->ignorescreen, ignorescreen_rcvd);
            rgp->rgppkt.seqno = max(rgp->rgppkt.seqno, rcvd_pkt_p->seqno) + 1;
            regroup_restart();
            send_status_pkts = 1;
            RGP_UNLOCK;
            break;
         }

         //////////////////////////// End of new Ignore Screen Stuff /////////////////////////////////
         // Now ignorescreens of this node packet and incoming packet are the same //
         // proceed with regular regroup processing //

         /* Since the packet is acceptable, the regroup sequence number
          * must be compared to that of this node. If the incoming message
          * has a higher sequence number, then a new pass of the regroup
          * algorithm has started. This node must accept the new sequence
          * number, reinitialize its data, and start partcicipating in
          * the new pass. Also, the incoming message must be processed
          * since, once the algorithm reinitializes, the sequence numbers
          * now match.
          *
          * If the incoming packet has a matching sequence number, then it
          * should be accepted. The knowledge of the global state of the
          * algorithm it reflects must be merged with that already present
          * in this node. Then this node must evaluate whether further
          * state transitions are possible.
          *
          * Finally, if the incoming packet has a lower sequence number, then
          * it comes from a node unaware of the current level of the global
          * algorithm. The data in it should be ignored, but a packet should
          * be sent to it so that it will reinitialize its algorithm.
          *
          * The sequence number is a 32 bit algebraic value - hopefully it
          * will never wrap around. */

         if (rcvd_pkt_p->seqno < rgp->rgppkt.seqno)
         { /* sender below current level - ignore but let him know it*/
            RGP_TRACE( "RGP lower seqno ",
                       rgp->rgppkt.seqno, rcvd_pkt_p->seqno,   /* TRACE */
                       rgp->rgppkt.stage, rcvd_pkt_p->stage ); /* TRACE */
            ClusterInsert(rgp->status_targets, INT_NODE(causingnode));
            rgp_broadcast(RGP_UNACK_REGROUP);
            RGP_UNLOCK;
            return;
         }

         if (rcvd_pkt_p->seqno > rgp->rgppkt.seqno)
         { /* sender above current level - I must upgrade to it*/
            // The node that forces a restart responsible for keeping
            // track of restarts and making a decision who will die/be ignored
            // if ( ++(rgp->restartcount) > RGP_RESTART_MAX )
            //    RGP_ERROR(RGP_INTERNAL_ERROR);
            if ( (rgp->rgppkt.stage != RGP_STABILIZED) ||
                 ((rcvd_pkt_p->seqno - rgp->rgppkt.seqno) > 1) )
            {
               RGP_TRACE( "RGP higher seqno",
                          rgp->rgppkt.seqno, rcvd_pkt_p->seqno,  /* TRACE */
                          rgp->rgppkt.stage, rcvd_pkt_p->stage );/* TRACE */
               rgp->cautiousmode = 1;
            }
            rgp->rgppkt.seqno = rcvd_pkt_p->seqno;
            /* Somebody else activated regroup. So, let's just copy */
            /* the sender's reason code and reason nodes. */
            rgp->rgppkt.reason = rcvd_pkt_p->reason;
            rgp->rgppkt.activatingnode = rcvd_pkt_p->activatingnode;
            rgp->rgppkt.causingnode = rcvd_pkt_p->causingnode;
            regroup_restart();
            send_status_pkts = 1;
         } /* sender above current level - I must upgrade to it*/

         /* Now we are at the same level - even if we weren't at first.
          *
          * If the sender has already commited to a view of the world
          * that excludes me, I must halt in order to keep the system in
          * a consistent state.
          *
          * This is true even with the split brain avoidance algorithm.
          * The fact that stage1 = stage2 in the packet implies that the
          * sender has already run the split brain avoidance algorithm
          * and decided that he should survive. */
         if ( (rcvd_pkt_p->stage > RGP_ACTIVATED) &&
              ClusterCompare(rcvd_pkt_p->knownstage1, rcvd_pkt_p->knownstage2) &&
              !ClusterMember(rcvd_pkt_p->knownstage1, rgp->mynode) )
         {
            ClusterInsert(rgp->ignorescreen, INT_NODE(causingnode) );
            rgp->rgppkt.seqno ++;
            regroup_restart();
            send_status_pkts = 1;
            RGP_UNLOCK;
            // /* I must die for overall consistency. */
            // RGP_ERROR((uint16) (RGP_PARIAH + causingnode)); // [Fixed]
            break;
         }
         RGP_UNLOCK;

         /* If I have terminated the active part of the algorithm, I
          * am in stage 6 and am not routinely broadcasting my status
          * anymore. If I get a packet from someone else who has not
          * yet terminated, then I must send him the word. But if he
          * has terminated, I must not send any packet or else there
          * will be an infinite loop of packets bouncing back and forth. */
         if (rgp->rgppkt.stage == RGP_STABILIZED)
         { /* I have terminated so can't learn anything more. */
            if (!ClusterCompare(rcvd_pkt_p->knownstage5, rgp->rgppkt.knownstage5))
            { /* but sender has not so I must notify him */
               ClusterInsert(rgp->status_targets, INT_NODE(causingnode));
               rgp_broadcast(RGP_UNACK_REGROUP);
            }
            return;
         }

         /* At this point, the packet is from a legal node within the
          * current round of the algorithm and I have not terminated
          * at stage RGP_STABILIZED so I need to absorb whatever new
          * info is in this packet.
          *
          * The way to merge what this packet says with what I already
          * know is to just logically OR the known stage x fields
          * together. */
         {
            /* Pack trace words: (seqno|stage) and (status_targets|node). */
            int seqno = rcvd_pkt_p->seqno&0xffff;
            int stage = rcvd_pkt_p->stage&0xffff;
            int trgs = *(int*)rgp->status_targets & 0xffff;
            int node = INT_NODE(causingnode)&0xffff;
            RGP_TRACE( "RGP recv pkt ",
                       ((seqno << 16) | stage),
                       RGP_MERGE_TO_32( rcvd_pkt_p->knownstage1, rcvd_pkt_p->knownstage2 ),
                       RGP_MERGE_TO_32( rcvd_pkt_p->knownstage3, rcvd_pkt_p->knownstage4 ),
                       (trgs << 16) | node );
         }

         rgp_sanity_check(rcvd_pkt_p, "RGP Received packet");
         rgp_sanity_check(&(rgp->rgppkt), "RGP Internal packet");

         /* Merge the sender's view of global progress into ours. */
         ClusterUnion(rgp->rgppkt.quorumowner,
                      rcvd_pkt_p->quorumowner, rgp->rgppkt.quorumowner);
         ClusterUnion(rgp->rgppkt.knownstage1,
                      rcvd_pkt_p->knownstage1, rgp->rgppkt.knownstage1);
         ClusterUnion(rgp->rgppkt.knownstage2,
                      rcvd_pkt_p->knownstage2, rgp->rgppkt.knownstage2);
         ClusterUnion(rgp->rgppkt.knownstage3,
                      rcvd_pkt_p->knownstage3, rgp->rgppkt.knownstage3);
         ClusterUnion(rgp->rgppkt.knownstage4,
                      rcvd_pkt_p->knownstage4, rgp->rgppkt.knownstage4);
         ClusterUnion(rgp->rgppkt.knownstage5,
                      rcvd_pkt_p->knownstage5, rgp->rgppkt.knownstage5);
         ClusterUnion(rgp->rgppkt.pruning_result,
                      rcvd_pkt_p->pruning_result, rgp->rgppkt.pruning_result);

         /* But when I am in stage 2, it is possible that I can learn to
          * recognize some node I have not previously recognized by hearing
          * of it indirectly from some other node that I have recognized.
          * To handle this case, I always merge knownstage1 info into
          * the inner screen so that subsequent messages from the newly
          * recognized node will be accepted and processed. */
         if ((rgp->rgppkt.stage == RGP_CLOSING) && !(rgp->tiebreaker_selected))
            ClusterUnion(rgp->innerscreen,
                         rgp->rgppkt.knownstage1, rgp->innerscreen);

         /* In the first two stages of regroup, the inter-node connectivity
          * information is collected and propagated. When we get a regroup
          * packet, we turn ON the bit corresponding to the [our-node,
          * sender-node] entry in the connectivity matrix. We also OR in
          * the matrix sent by the sender node in the regroup packet.
          *
          * The matrix is not updated if we are in stage 1 and haven't
          * received the first clock tick. This is to prevent the
          * node pruning algorithm from considering us alive if our
          * timer mechanism is disrupted, but the IPC mechanism is OK. */

         /* [GorN 01/07/2000] If we are not collection connectivity information,
          * until we receive a first tick we can ran into problems if the node is
          * killed right after it send out its first timer driven packet
          * (which doesn't have any connectivity info yet). This can cause a
          * confusion. See bug 451792.
          *
          * What we will do is we will collect connectivity information on
          * the side even when rgp->sendstage is FALSE and move it into the regroup
          * packet if we ever get a clock tick */
         if (rgp->rgppkt.stage < RGP_PRUNING && !rgp->sendstage)
         {
            MatrixSet(rgp->internal_connectivity_matrix,
                      rgp->mynode, INT_NODE(causingnode));
            if (causingnode != EXT_NODE(rgp->mynode))
               MatrixOr(rgp->internal_connectivity_matrix,
                        rcvd_pkt_p->connectivity_matrix);
         }
         if ((rgp->rgppkt.stage < RGP_PRUNING) && rgp->sendstage)
         {
            MatrixSet(rgp->rgppkt.connectivity_matrix,
                      rgp->mynode, INT_NODE(causingnode));
            if (causingnode != EXT_NODE(rgp->mynode))
               MatrixOr(rgp->rgppkt.connectivity_matrix,
                        rcvd_pkt_p->connectivity_matrix);
         }

         /* Now, I can evaluate whether additional state transitions are
          * possible as a result of the info just received. */
         oldstage = rgp->rgppkt.stage;
         // QuorumCheck now runs in a separate thread
         // if (oldstage != RGP_CLOSING) // Cannot run Quorumcheck from here.
         evaluatestageadvance();

         /* To speed things up, let us broadcast our status if our
          * stage has changed and we are willing to let others and
          * ourselves see it. */
         if ( (oldstage != rgp->rgppkt.stage) && rgp->sendstage )
            send_status_pkts = 1; /* broadcast at once to speed things up */
         break;
      } /* received an rgp packet */

      //
      // We do not support power failure notifications in NT
      //
#if defined(NT)
      /* NOTE(review): this statement sits between case labels and is
       * therefore unreachable as written (the assert never executes);
       * it documents intent only — confirm before relying on it. */
      CL_ASSERT(event != RGP_EVT_POWERFAIL);
      //
      // Fall thru to default case
      //
#else // NT
      case RGP_EVT_POWERFAIL :
      {
         /* Our node got a power up interrupt or an indication of power
          * failure from another node. */

         /* Note that this code will unconditionally abort and restart
          * the algorithm even if it was active before the power failure.
          * The new incident must be in cautious mode. */
         rgp->cautiousmode = 1;
         rgp->rgppkt.seqno = rgp->rgppkt.seqno + 1;
         rgp->rgppkt.reason = RGP_EVT_POWERFAIL;
         rgp->rgppkt.activatingnode = (uint8) EXT_NODE(rgp->mynode);
         rgp->rgppkt.causingnode = (uint8) causingnode;

         /* rgp->pfail_state is set to a non-zero value when a pfail event
          * is reported to regroup. It is decremented at every regroup clock
          * tick till it reaches zero. While this number is non-zero, missing
          * self IamAlives are ignored and do not cause the node to halt.
          * This gives the sending hardware some time to recover from power
          * failures before self IamAlives are checked. */
         if (causingnode == EXT_NODE(rgp->mynode))
            rgp->pfail_state = RGP_PFAIL_TICKS;

         /* Store the fact that causingnode experienced a PFAIL,
          * for reporting to the message system when regroup stabilizes. */
         ClusterInsert(rgp->rgppkt.hadpowerfail, INT_NODE(causingnode));

         regroup_restart();
         send_status_pkts = 1;
         break;
      } /* power failure */
#endif // NT

      default :
      {
         RGP_ERROR(RGP_INTERNAL_ERROR);
      }
   }

   if (send_status_pkts) /* significant change - send status at once */
   {
      ClusterUnion(rgp->status_targets,
                   rgp->outerscreen, rgp->status_targets);
      rgp_broadcast(RGP_UNACK_REGROUP);
   }
}
/************************************************************************
 * rgp_check_packet
 * =================
 *
 * Description:
 *
 *    Sanity-checks an incoming regroup packet: the knownstage masks,
 *    pruning_result, quorumowner and the connectivity matrix must all
 *    be consistent with the current cluster membership.
 *
 * Parameters:
 *
 *    rgp_pkt_t* pkt -
 *       packet to be checked
 *
 * Returns:
 *
 *    0          - packet looks good
 *    1,2,3...   - code identifying the first inconsistency found
 *
 ************************************************************************/
int rgp_check_packet(rgp_pkt_t* pkt)
{
   node_t node;

   /* The stage masks must form a chain of subsets:
    *
    *    knownstage5 <= knownstage4 <= knownstage3 <=
    *    knownstage2 <= knownstage1 <= rgp->rgpinfo.cluster
    *
    * ClusterSubsetOf(big, small) returns 1 iff small equals big or is
    * contained in big; check innermost-first so the return code names
    * the deepest violated stage.
    */
   if ( !ClusterSubsetOf(pkt->knownstage4, pkt->knownstage5) )
      return 5;
   if ( !ClusterSubsetOf(pkt->knownstage3, pkt->knownstage4) )
      return 4;
   if ( !ClusterSubsetOf(pkt->knownstage2, pkt->knownstage3) )
      return 3;
   if ( !ClusterSubsetOf(pkt->knownstage1, pkt->knownstage2) )
      return 2;
   if ( !ClusterSubsetOf(rgp->rgpinfo.cluster, pkt->knownstage1) )
      return 1;

   /* The pruning result may only name nodes that reached stage 2. */
   if ( !ClusterSubsetOf(pkt->knownstage2, pkt->pruning_result) )
      return 9;

   /* The quorum owner must come from the original cluster membership. */
   if ( !ClusterSubsetOf(rgp->rgpinfo.cluster, pkt->quorumowner) )
      return 8;

   /* Connectivity matrix: a cluster member's row may only mention
    * cluster members; a non-member's row must be empty.
    */
   for (node = 0; node < MAX_CLUSTER_SIZE; ++node)
   {
      if ( ClusterMember(rgp->rgpinfo.cluster, node) )
      {
         if ( !ClusterSubsetOf(rgp->rgpinfo.cluster,
                               pkt->connectivity_matrix[node]) )
            return 10;
      }
      else if ( !ClusterEmpty(pkt->connectivity_matrix[node]) )
      {
         return 11;
      }
   }

   return 0;
}
/************************************************************************
 * rgp_print_packet
 * =================
 *
 * Description:
 *
 *    Emits the significant fields of a regroup packet (sequence number,
 *    stage, activating/causing nodes, stage masks, pruning result and
 *    the full connectivity matrix) through the RGP_TRACE facility.
 *
 * Parameters:
 *
 *    rgp_pkt_t* pkt -
 *       packet to be printed
 *    char* label -
 *       label to be printed together with a packet
 *    int code -
 *       a number to be printed together with a packet
 *
 * Returns:
 *
 *    VOID
 *
 ************************************************************************/
void rgp_print_packet(rgp_pkt_t* pkt, char* label, int code)
{
   /* NOTE(review): the original declared seven locals (pktsubtype, stage,
    * reason, seqno, activatingnode, causingnode, quorumowner) that were
    * never referenced; they have been removed. */

   /* stage/activatingnode/causingnode are packed into one 32-bit word. */
   RGP_TRACE( label,
              pkt->seqno,                                         /* TRACE */
              code,
              (pkt->stage << 16) |
              (pkt->activatingnode << 8) |
              (pkt->causingnode),                                 /* TRACE */
              RGP_MERGE_TO_32( rgp->outerscreen,
                               rgp->innerscreen ) );

   RGP_TRACE( "RGP CHK masks ",
              RGP_MERGE_TO_32( rgp->rgpinfo.cluster,              /* TRACE */
                               pkt->quorumowner ),                /* TRACE */
              RGP_MERGE_TO_32( pkt->knownstage1,                  /* TRACE */
                               pkt->knownstage2 ),                /* TRACE */
              RGP_MERGE_TO_32( pkt->knownstage3,                  /* TRACE */
                               pkt->knownstage4 ),                /* TRACE */
              RGP_MERGE_TO_32( pkt->knownstage5,                  /* TRACE */
                               pkt->pruning_result ) );           /* TRACE */

   /* Connectivity matrix rows 0..7 ... */
   RGP_TRACE( "RGP CHK Con. matrix1",
              RGP_MERGE_TO_32( pkt->connectivity_matrix[0],       /*TRACE*/
                               pkt->connectivity_matrix[1] ),     /*TRACE*/
              RGP_MERGE_TO_32( pkt->connectivity_matrix[2],       /*TRACE*/
                               pkt->connectivity_matrix[3] ),     /*TRACE*/
              RGP_MERGE_TO_32( pkt->connectivity_matrix[4],       /*TRACE*/
                               pkt->connectivity_matrix[5] ),     /*TRACE*/
              RGP_MERGE_TO_32( pkt->connectivity_matrix[6],       /*TRACE*/
                               pkt->connectivity_matrix[7]));     /*TRACE*/

   /* ... and rows 8..15. */
   RGP_TRACE( "RGP CHK Con. matrix2",
              RGP_MERGE_TO_32( pkt->connectivity_matrix[8],       /*TRACE*/
                               pkt->connectivity_matrix[9] ),     /*TRACE*/
              RGP_MERGE_TO_32( pkt->connectivity_matrix[10],      /*TRACE*/
                               pkt->connectivity_matrix[11]),     /*TRACE*/
              RGP_MERGE_TO_32( pkt->connectivity_matrix[12],      /*TRACE*/
                               pkt->connectivity_matrix[13]),     /*TRACE*/
              RGP_MERGE_TO_32( pkt->connectivity_matrix[14],      /*TRACE*/
                               pkt->connectivity_matrix[15]));    /*TRACE*/
}
/************************************************************************
 * UnpackIgnoreScreen
 * =================
 *
 * Description:
 *
 *    Extracts ignorescreen out of regroup packet
 *
 * Parameters:
 *
 *    rgp_pkt_t* from -
 *       source packet
 *    cluster_t to -
 *       target node set
 *
 * Returns:
 *
 *    VOID
 *
 * Comments:
 *
 *    If the packet is received from NT4 node, unpacked ignorescreen
 *    will always be 0.
 *
 ************************************************************************/
void UnpackIgnoreScreen(rgp_pkt_t* from, cluster_t to)
{
#pragma warning( push )
#pragma warning( disable : 4244 )   /* narrowing conversions are intentional */
   if (from->reason >= RGP_EVT_IGNORE_MASK)
   {
      /* The screen is packed into the high byte of reason plus
       * the causingnode field (see PackIgnoreScreen). */
      to[0] = ((uint16)from->reason) >> 8;
      to[1] = (uint8)from->causingnode;
   }
   else
   {
      /* No ignorescreen encoded (e.g. packet from an NT4 node). */
      ClusterInit(to);
   }
#pragma warning( pop )
}
/************************************************************************
 * PackIgnoreScreen
 * =================
 *
 * Description:
 *
 *    Put an ignorescreen back into a regroup packet
 *    (inverse of UnpackIgnoreScreen).
 *
 * Parameters:
 *
 *    rgp_pkt_t* to -
 *       packet to be updated
 *    cluster_t from -
 *       source node set
 *
 * Returns:
 *
 *    VOID
 *
 ************************************************************************/
void PackIgnoreScreen(rgp_pkt_t* to, cluster_t from)
{
   if ( !ClusterEmpty(from) )
   {
      /* Fold the first screen byte into the high byte of reason and
       * carry the second byte in causingnode. */
      to->reason = (uint8)RGP_EVT_IGNORE_MASK | (from[0] << 8);
      to->causingnode = from[1];
   }
   else
   {
      /* Empty screen: strip any packed bits from reason. */
      to->reason &= 255;
      to->causingnode = 0;
   }
}
/*---------------------------------------------------------------------------*/
#ifdef __cplusplus
} #endif /* __cplusplus */
#if 0
History of changes to this file: ------------------------------------------------------------------------- 1995, December 13 F40:KSK0610 /*F40:KSK06102.2*/
This file is part of the portable Regroup Module used in the NonStop Kernel (NSK) and Loosely Coupled UNIX (LCU) operating systems. There are 10 files in the module - jrgp.h, jrgpos.h, wrgp.h, wrgpos.h, srgpif.c, srgpos.c, srgpsm.c, srgputl.c, srgpcli.c and srgpsvr.c. The last two are simulation files to test the Regroup Module on a UNIX workstation in user mode with processes simulating processor nodes and UDP datagrams used to send unacknowledged datagrams.
This file was first submitted for release into NSK on 12/13/95. ------------------------------------------------------------------------------ This change occurred on 19 Jan 1996 /*F40:MB06458.1*/ Changes for phase IV Sierra message system release. Includes: /*F40:MB06458.2*/ - Some cleanup of the code /*F40:MB06458.3*/ - Increment KCCB counters to count the number of setup messages and /*F40:MB06458.4*/ unsequenced messages sent. /*F40:MB06458.5*/ - Fixed some bugs /*F40:MB06458.6*/ - Disable interrupts before allocating broadcast sibs. /*F40:MB06458.7*/ - Change per-packet-timeout to 5ms /*F40:MB06458.8*/ - Make the regroup and powerfail broadcast use highest priority /*F40:MB06458.9*/ tnet services queue. /*F40:MB06458.10*/ - Call the millicode backdoor to get the processor status from SP /*F40:MB06458.11*/ - Fixed expand bug in msg_listen_ and msg_readctrl_ /*F40:MB06458.12*/ - Added enhancement to msngr_sendmsg_ so that clients do not need /*F40:MB06458.13*/ to be unstoppable before calling this routine. /*F40:MB06458.14*/ - Added new steps in the build file called /*F40:MB06458.15*/ MSGSYS_C - compiles all the message system C files /*F40:MB06458.16*/ MSDRIVER - compiles all the MSDriver files /*F40:MB06458.17*/ REGROUP - compiles all the regroup files /*F40:MB06458.18*/ - remove #pragma env libspace because we set it as a command line /*F40:MB06458.19*/ parameter. /*F40:MB06458.20*/ ----------------------------------------------------------------------- /*F40:MB06458.21*/
#endif /* 0 - change descriptions */
|