#ifndef _WRGP_H_ #define _WRGP_H_ #ifdef __TANDEM #pragma columns 79 #pragma page "wrgp.h - T9050 - internal declarations for Regroup Module" #endif /* @@@ START COPYRIGHT @@@ ** Tandem Confidential: Need to Know only ** Copyright (c) 1995, Tandem Computers Incorporated ** Protected as an unpublished work. ** All Rights Reserved. ** ** The computer program listings, specifications, and documentation ** herein are the property of Tandem Computers Incorporated and shall ** not be reproduced, copied, disclosed, or used in whole or in part ** for any reason without the prior express written permission of ** Tandem Computers Incorporated. ** ** @@@ END COPYRIGHT @@@ **/ /*--------------------------------------------------------------------------- * This file (wrgp.h) contains the cluster_t data type and types used for the * node pruning algorithm and declares the routines exported by the Cluster * data type and the node pruning algorithm. *---------------------------------------------------------------------------*/ #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ #include #include #include #define RGP_VERSION 1 /* version # of data structures */ #define RGP_INITSEQNUM 0 /* starting seq# # of regroup */ #define RGPPKTLEN sizeof(rgp_pkt_t) /* byte length of regroup pkts */ #define IAMALIVEPKTLEN sizeof(iamalive_pkt_t)/* byte length of IamAlive pkts */ #define POISONPKTLEN sizeof(poison_pkt_t) /* byte length of poison pkts */ /*-------------------------------------------------------*/ /* The following are the stages of the regroup algorithm */ /*-------------------------------------------------------*/ #define RGP_COLDLOADED 0 #define RGP_ACTIVATED 1 #define RGP_CLOSING 2 #define RGP_PRUNING 3 #define RGP_PHASE1_CLEANUP 4 #define RGP_PHASE2_CLEANUP 5 #define RGP_STABILIZED 6 /*--------------------------------------------------------------------*/ /* Macros to transform node numbers used by the OS to node numbers */ /* used by the Regroup module and vice versa. 
Regroup's internal node */ /* numbers start at 0 while the OS starts node numbers at */ /* LOWEST_NODENUM. */ /*--------------------------------------------------------------------*/ #define EXT_NODE(int_node) ((node_t)(int_node + LOWEST_NODENUM)) #define INT_NODE(ext_node) ((node_t)(ext_node - LOWEST_NODENUM)) /*----------------------------------------*/ /* Defines for the node pruning algorithm */ /*----------------------------------------*/ /* The data type "cluster_t" is a bit array of size equal to the maximum * number of nodes in the cluster. The bit array is implemented as an * array of uint8s. * * Given a node#, its bit position in the bit array is computed by first * locating the byte in the array (node# / BYTEL) and then the bit in * the byte. Bits in the byte are numbered 0..7 (from left to right). * Thus, node 0 is placed in byte 0, bit 0, which is the left-most bit * in the bit array. */ #define BYTE(cluster, node) ( (cluster)[(node) / BYTEL] ) /* byte# in array */ #define BIT(node) ( (node) % BYTEL ) /* bit# in byte */ /* The connectivity matrix is an array of elements of type cluster_t. * cluster_t is equivalent to a bit array with one bit per node. Thus the * matrix is equivalent to a two-dimensional bit array, with each * dimension being MAX_CLUSTER_SIZE large. A bit value of 1 for matrix[i][j] * represents a unidirectional connection between nodes i and j (a * regroup packet received on node i from node j). */ typedef cluster_t connectivity_matrix_t[MAX_CLUSTER_SIZE]; #define connected(i,j) (ClusterMember(c[(int)i],j) && \ ClusterMember(c[(int)j],i)) /* bidirectional */ /* Should a node that cannot receive its own regroup packets be considered * dead? Not necessarily. It may be able to send packets to others and * be considered alive by everyone. There is no real need for the ability * to send to yourself on the network. Software bugs could result in * such a situation. 
Therefore, the correct way to check if a node is * alive would be to check if there is a non-zero bit in either the row * or column corresponding to the node; that is, if the node has * received regroup packets from or sent regroup packets to any node, * it may be considered alive. But for simplicity, we will assume in * the following macro that a node that does not receive its own * regroup packets will be considered dead. */ #define node_considered_alive(i) ClusterMember(c[(int)i],i) /* The upper bound on the number of potential fully-connected groups is * the lower of 2**N and 2**D where N is the number of live nodes and * D is the number of disconnects. If this number exceeds MAX_GROUPS, * do not attempt to exhaustively generate all possible groups; * just return an arbitrary fully-connected group which includes a * node selected by the cluster manager. */ #define MAX_GROUPS 256 /* if more than these, pick arbitrary group */ #define LOG2_MAX_GROUPS 8 /* log (base 2) of MAX_GROUPS */ #define too_many_groups(nodes, disconnects) \ ((nodes > LOG2_MAX_GROUPS) && (disconnects > LOG2_MAX_GROUPS)) /* The disconnect array is an array of (i,j) pairs which represent a * break in connectivity between nodes i and j. */ typedef node_t disconnect_array [LOG2_MAX_GROUPS * (LOG2_MAX_GROUPS-1)/2] [2]; /*---------------------------------------------------------------------------*/ /* Following are templates for three kinds of unacknowledged datagrams sent */ /* by the regroup module (regroup pkts, IamAlive pkts and poison pkts). */ /*---------------------------------------------------------------------------*/ // // We already hand packed all on the wire structures. 
// packon will instruct the compiler not to mess with field alignment (kind of)
//
// NOTE(review): the header name of the #include below is missing in this copy
// of the file; the comment above suggests it is the "packon" header - confirm
// and restore.
//
#include

/************************************************************************
 * rgp_pkt_t (regroup status packet)
 * ---------------------------------
 * This structure is used to send the current state of the regroup state
 * machine to other nodes.
 *
 *      ___________________________________________________________
 * wd0 | pktsubtype  |    stage      |    reason    | Low8 ignscr  |
 *     |_____________|_______________|______________|______________|
 * wd1 |                           seqno                           |
 *     |_____________________________|_____________________________|
 * wd2 |  activa-    | causingnode   |        quorumowner          |
 *     |  tingnode   | Hi8 ignscr    |     (was hadpowerfail)      |
 *     |_____________|_______________|_____________________________|
 * wd3 |         knownstage1         |         knownstage2         |
 *     |_____________________________|_____________________________|
 * wd4 |         knownstage3         |         knownstage4         |
 *     |_____________________________|_____________________________|
 * wd5 |         knownstage5         |       pruning_result        |
 *     |_____________________________|_____________________________|
 * wd6 :                                                           :
 *     |                    connectivity_matrix                    |
 *     :                                                           :
 * wd13|___________________________________________________________|
 *
 *
 * pktsubtype           - packet subtype = RGP_UNACK_REGROUP
 * stage                - current stage (state) of the regroup algorithm
 * reason               - reason for the activation of regroup
 * seqno                - sequence number of current regroup incident
 * activatingnode       - node that calls for a regroup incident
 * causingnode          - node whose poll packet was missed or which
 *                        had a power failure or otherwise caused
 *                        a regroup incident being called for
 * quorumowner          - mask of nodes that think they own the quorum resrc
 * knownstage1          - mask of nodes known to have entered stage 1
 * knownstage2          - mask of nodes known to have entered stage 2
 * knownstage3          - mask of nodes known to have entered stage 3
 * knownstage4          - mask of nodes known to have entered stage 4
 * knownstage5          - mask of nodes known to have entered stage 5
 * pruning_result       - result of node pruning by tie-breaker node
 * connectivity_matrix  - current connectivity info for entire cluster
 *
 * "Low8 ignscr" / "Hi8 ignscr" in the picture mark where the two halves
 * of the ignore screen travel inside existing fields; the struct has no
 * dedicated member for it (see PackIgnoreScreen / UnpackIgnoreScreen
 * declared further down in this file).
 *
 * The member order below is the on-the-wire layout; do not reorder.
 */

#ifdef __TANDEM
#pragma fieldalign shared8 rgp_pkt
#endif /* __TANDEM */

typedef struct rgp_pkt
{
   uint8      pktsubtype;
   uint8      stage;
   uint16     reason;
   uint32     seqno;
   uint8      activatingnode;
   uint8      causingnode;
   cluster_t  quorumowner;
   cluster_t  knownstage1;
   cluster_t  knownstage2;
   cluster_t  knownstage3;
   cluster_t  knownstage4;
   cluster_t  knownstage5;
   cluster_t  pruning_result;
   connectivity_matrix_t connectivity_matrix;
} rgp_pkt_t;


/************************************************************************
 * iamalive_pkt_t
 * --------------
 * This structure is used by a node to indicate to another node that it
 * is alive and well.
 *
 *      ___________________________________________________________
 * wd0 | pktsubtype  |                   filler                    |
 *     |_____________|_____________________________________________|
 * wd1 :                                                           :
 *     |                       testpattern                         |
 *     :                                                           :
 * wd13|___________________________________________________________|
 *
 *
 * pktsubtype    - packet subtype = RGP_UNACK_IAMALIVE
 * testpattern   - a bit pattern used for testing
 *
 * The 4 subtracted from RGP_UNACK_PKTLEN in the testpattern dimensions
 * equals the size of the pktsubtype byte plus the three filler bytes
 * that precede the union.
 */

#ifdef __TANDEM
#pragma fieldalign shared8 iamalive_pkt
#endif /* __TANDEM */

typedef struct iamalive_pkt
{
   uint8 pktsubtype;
   uint8 filler[3];
   union
   {
      uint8  bytes[RGP_UNACK_PKTLEN - 4];
      uint32 words[(RGP_UNACK_PKTLEN - 4)/4];
   } testpattern;
} iamalive_pkt_t;


/************************************************************************
 * poison_pkt_t
 * ------------
 * This structure is used to send a poison packet to another node to
 * force the other node to halt.
 *
 *      ___________________________________________________________
 * wd0 | pktsubtype  |   unused1     |           reason            |
 *     |_____________|_______________|_____________________________|
 * wd1 |                           seqno                           |
 *     |_____________________________|_____________________________|
 * wd2 |  activa-    | causingnode   |                             |
 *     |  tingnode   |               |           unused2           |
 *     |_____________|_______________|_____________________________|
 * wd3 |          initnodes          |          endnodes           |
 *     |_____________________________|_____________________________|
 *
 *
 * pktsubtype           - packet subtype = RGP_UNACK_POISON
 * reason               - reason for the last activation of regroup
 * seqno                - current regroup sequence number
 *                        (sequence number of last regroup incident)
 * activatingnode       - node which called for last regroup incident
 * causingnode          - node whose poll packet was missed or which
 *                        had a power failure or otherwise caused
 *                        the last regroup incident being called for
 * initnodes            - mask of nodes at beginning of last regroup
 * endnodes             - mask of nodes at end of last regroup
 *
 * The member order below is the on-the-wire layout; do not reorder.
 */

#ifdef __TANDEM
#pragma fieldalign shared8 poison_pkt
#endif /* __TANDEM */

typedef struct poison_pkt
{
   uint8      pktsubtype;
   uint8      unused1;
   uint16     reason;
   uint32     seqno;
   uint8      activatingnode;
   uint8      causingnode;
   uint16     unused2;
   cluster_t  initnodes;
   cluster_t  endnodes;
} poison_pkt_t;

// NOTE(review): the header name of the #include below is missing in this copy
// of the file; presumably it is the counterpart that ends the packing region
// opened before rgp_pkt_t - confirm and restore.
#include

//
// There is no room for a 16 bit ignorescreen mask
// in rgp_pkt_t structure. We use a few bits from several
// fields to store the ignore screen.
// The following routines do packing and unpacking
// of ignorescreen from/into the packet
//
extern void PackIgnoreScreen(rgp_pkt_t* to, cluster_t from);
extern void UnpackIgnoreScreen(rgp_pkt_t* from, cluster_t to);

// NOTE(review): SetMulticastReachable is not described anywhere in this
// header; see its definition for the meaning of "mask".
extern void SetMulticastReachable(uint32 mask);


/*---------------------------------------------------------------------------*/
/* This struct keeps track of the state of each node in the cluster.         */
/*---------------------------------------------------------------------------*/
typedef struct
{
   uint16 status;       /* state of node - alive, dead etc. */
   uint16 pollstate;    /* whether I'm alives have been received */
   uint16 lostHBs;      /* tracks the number of consecutive I'm alives lost */
} node_state_t;

/* The status and pollstate fields of the node_state_t struct can have the
 * following values.
 */

/* Node status of nodes */

#define RGP_NODE_ALIVE           1   /* node is considered alive    */
#define RGP_NODE_COMING_UP       2   /* node is coming up           */
#define RGP_NODE_DEAD            3   /* node has failed             */
#define RGP_NODE_NOT_CONFIGURED  4   /* node is not even configured */

/* IamAlive status codes of nodes */

#define AWAITING_IAMALIVE        1   /* awaiting IamAlives */
#define IAMALIVE_RECEIVED        2   /* got IamAlive       */

#define RGP_IAMALIVE_THRESHOLD   100 /* after getting this many Iam-  *
                                      * Alives, we check if every     *
                                      * node has sent at least one    */


/************************************************************************
 * rgp_control_t (regroup's only global data structure)
 * ----------------------------------------------------
 * This structure holds all the Regroup state and other info.
 * This is the only global data structure used by Regroup.
 *
 * NOTE: The word offsets shown in this picture assume that
 *       MAX_CLUSTER_SIZE is 16.
 *
 *      ___________________________________________________________
 * wd0 |                                                           |
 *     :                    rgpinfo structure                      :
 *     :                                                           :
 *     |___________________________________________________________|
 * wd3 |           mynode            |         tiebreaker          |
 *     |_____________________________|_____________________________|
 * wd4 |                         num_nodes                         |
 *     |___________________________________________________________|
 * wd5 |        clock_ticks          |         rgpcounter          |
 *     |_____________________________|_____________________________|
 * wd6 |        restartcount         |        pruning_ticks        |
 *     |_____________________________|_____________________________|
 * wd7 |        pfail_state          |           flags             |
 *     |_____________________________|_____________________________|
 * wd8 |        outerscreen          |         innerscreen         |
 *     |_____________________________|_____________________________|
 * wd9 |       status_targets        |       poison_targets        |
 *     |_____________________________|_____________________________|
 * wd10|         initnodes           |          endnodes           |
 *     |_____________________________|_____________________________|
 * wd11|     unreachable_nodes       |      arbitration_ticks      |
 *     |_____________________________|_____________________________|
 * wd12|        ignorescreen         |          filler[0]          |
 *     |_____________________________|_____________________________|
 * wd13|         filler[1]           |          filler[2]          |
 *     |_____________________________|_____________________________|
 * wd14|                                                           |
 *     :               node_states[MAX_CLUSTER_SIZE]               :
 *     :                                                           :
 *     |___________________________________________________________|
 * wd30|                   *nodedown_callback()                    |
 *     |___________________________________________________________|
 * wd31|                    *select_cluster()                      |
 *     |___________________________________________________________|
 * wd32|                      *rgp_msgsys_p                        |
 *     |___________________________________________________________|
 * wd33|                    *received_pktaddr                      |
 *     |___________________________________________________________|
 * wd34|                                                           |
 *     :                          rgppkt                           :
 *     :                                                           :
 *     |___________________________________________________________|
 * wd48|                                                           |
 *     :                      rgppkt_to_send                       :
 *     :                                                           :
 *     |___________________________________________________________|
 * wd62|                                                           |
 *     :                       iamalive_pkt                        :
 *     :                                                           :
 *     |___________________________________________________________|
 * wd76|                                                           |
 *     :                        poison_pkt                         :
 *     |___________________________________________________________|
 * wd80|                                                           |
 *     :                                                           :
 *     :               potential_groups[MAX_GROUPS]                :
 *     :                                                           :
 *     |___________________________________________________________|
 *wd208|                                                           |
 *     :                    last_stable_seqno                      :
 *     |___________________________________________________________|
 *wd212|                                                           |
 *     :              internal_connectivity_matrix                 :
 *     |___________________________________________________________|
 *wdyyy|                                                           |
 *     :                   OS_specific_control                     :
 *wdxxx|___________________________________________________________|
 *
 *
 * rgpinfo    - contains regroup timing parameters and mask of
 *              fully-integrated cluster (to send IamAlives and monitor)
 *
 * mynode     - node number of local node
 *
 * tiebreaker - node selected to act as a tie-breaker in the
 *              split-brain avoidance algorithm and to run the
 *              pruning algorithm
 *
 * num_nodes  - number of nodes configured in the system, including
 *              any unused node numbers in the middle; this is equal
 *              to (the largest configured node# in the system -
 *              lowest possible node # + 1).
 *
 * clock_ticks- regroup's internal clock used for checking if it is
 *              time to send IamAlive packets and to check if IamAlives
 *              have been received. It is incremented every
 *              RGP_CLOCK_PERIOD and reset to 0 after checking
 *              for IamAlives.
 *
 * rgpcounter - counts regroup clock ticks in a regroup incident in
 *              order to detect if the algorithm is stalling.
 *              This is reset when a new regroup incident begins and
 *              is incremented at each regroup clock tick while
 *              regroup is perturbed.
 *
 * restartcount - counts # of regroup algorithm restarts in each regroup
 *              incident; the node is halted if there are too many
 *              restarts.
 *
 * pruning_ticks - number of regroup clock ticks after the tie-breaker
 *              has been selected; if there are disconnects, the
 *              tie-breaker should wait a fixed number of ticks
 *              before running the pruning algorithm.
 *
 * pfail_state - set to a +ve value when a pfail event is reported
 *              to regroup. It is decremented at every regroup
 *              clock tick till it reaches zero. While this number
 *              is +ve, missing self IamAlives are ignored and
 *              do not cause the node to halt. This gives the
 *              sending hardware some time to recover from power
 *              failures before self IamAlives are checked.
 *
 * outerscreen - outer recognition mask: nodes not in this mask are
 *              considered dead or outcasts; if they try to contact
 *              us, send them poison packets to make sure they stay down
 *
 * innerscreen - inner recognition mask: nodes not in this mask are
 *              considered tardy. Regroup packets from them will be
 *              ignored. They may survive if they can find some
 *              node which hasn't eliminated them from this screen.
 *
 * status_targets - nodes to send regroup status packets to
 *
 * poison_targets - nodes to send poison packets to
 *
 * initnodes - nodes alive at the beginning of last regroup incident
 *
 * endnodes - nodes alive at the end of last regroup incident
 *
 * unreachable_nodes - stores unreachable_node events till the events
 *              can be processed
 *
 * arbitration_ticks - number of regroup clock ticks after the arbitration
 *              started. If arbitration_ticks counter exceeds
 *              RGP_ARBITRATION_TIMEOUT number of ticks,
 *              the arbitrating node will shoot itself, and the rest
 *              of the group will restart the regroup ignoring stalled
 *              arbitrator
 *
 * ignorescreen - this is a local copy of ignorescreen passed as
 *              a part of the regroup packet. The packets from
 *              the nodes in this screen are ignored and no wait
 *              for the nodes in ignorescreen is performed in stage 1
 *
 * last_stable_seqno - this is a sequence number of the last successful
 *              regroup. It allows to detect really outdated packets
 *
 * flags:
 *
 *      cautiousmode - need to be "cautious"; wait longer in stage 1
 *
 *      sendstage    - This flag is used to indicate whether the
 *                     regroup status packets should indicate we
 *                     are in the current stage. When we enter the
 *                     cleanup stages, we don't let others know we
 *                     are in the stage until the cleanup actions
 *                     are completed.
 *
 *                     This flag is set when a new regroup incident
 *                     is started. It is then cleared when we enter
 *                     a cleanup stage and set again when the
 *                     cleanup operations are completed.
 *
 *      tiebreaker_selected - set in stage 2 after tie-breaker is selected
 *
 *      has_unreachable_nodes - set when a node_unreachable event is detected
 *                              in stages 1 or 2. checked in stage 3.
 *
 *      arbitration_started - NOTE(review): this flag was not described in
 *                            the original comment; presumably it is set once
 *                            arbitration has begun (see arbitration_ticks
 *                            above) - confirm against the state machine.
 *
 *      flags_unused - 11 unused bits
 *
 * node_states[MAX_CLUSTER_SIZE] - state of all the nodes
 *
 * *nodedown_callback() - registered callback routine to be invoked
 *                        to report node failure
 *
 * *select_cluster() - registered callback routine to be invoked
 *                     when multiple cluster options exist
 *
 * *rgp_msgsys_p - pointer to struct shared by regroup and message system
 *
 * *received_pktaddr - address of rgp packet received
 *
 * rgp_lock - lock to serialize access to this struct
 *            NOTE(review): the struct declared below has no member named
 *            rgp_lock; it may live in OS_specific_control or this entry
 *            may be stale - confirm.
 *
 * rgppkt - regroup status in the form of a packet
 *
 * rgppkt_to_send - regroup packet to be broadcast
 *
 * iamalive_pkt - I am alive packet to be broadcast
 *
 * poison_pkt - poison packet to be sent
 *
 * potential_groups[MAX_GROUPS] - scratch pad for pruning algorithm
 *
 * internal_connectivity_matrix - temporary place to collect connectivity
 *              information while sendstage is 0 (see the comment on the
 *              member below)
 *
 * OS_specific_control - OS-specific substruct (defined in wrgpos.h)
 *
 */

#ifdef __TANDEM
#pragma fieldalign shared8 rgp_control
#endif /* __TANDEM */

typedef struct rgp_control
{
   /* timing parameters and cluster membership */
   rgpinfo_t rgpinfo;

   /* node numbers */
   node_t mynode;
   node_t tiebreaker;
   uint32 num_nodes;

   /* various counters counting clock ticks */
   uint16 clock_ticks;
   uint16 rgpcounter;
   uint16 restartcount;
   uint16 pruning_ticks;
   uint16 pfail_state;

   /* rgpflags - five 1-bit flags plus 11 spare bits, 16 bits in total */
   uint16 cautiousmode          : 1;
   uint16 sendstage             : 1;
   uint16 tiebreaker_selected   : 1;
   uint16 has_unreachable_nodes : 1;
   uint16 arbitration_started   : 1;
   uint16 flags_unused          : 11;

   /* cluster masks */
   cluster_t outerscreen;
   cluster_t innerscreen;
   cluster_t status_targets;
   cluster_t poison_targets;
   cluster_t initnodes;
   cluster_t endnodes;
   cluster_t unreachable_nodes;

   uint16    arbitration_ticks;
   cluster_t ignorescreen;

   uint16 filler[3]; /* for alignment and future use */

   /* node states */
   node_state_t node_states[MAX_CLUSTER_SIZE];

   /* callback routines */
   void (*nodedown_callback)(cluster_t failed_nodes);
   int  (*select_cluster)(cluster_t cluster_choices[], int num_clusters);

   /* pointers to other structures */
   rgp_msgsys_p rgp_msgsys_p;       /* member deliberately shares the name of its type */
   rgp_pkt_t *received_pktaddr;

   /* current status in the form of a regroup packet */
   rgp_pkt_t rgppkt;

   /* packets to be sent */
   rgp_pkt_t      rgppkt_to_send;
   iamalive_pkt_t iamalive_pkt;
   poison_pkt_t   poison_pkt;

   /* scratch pad for node pruning algorithm */
   cluster_t potential_groups[MAX_GROUPS];

   /* sequence number of the last successful regroup */
   uint32 last_stable_seqno;

   /* temporary place to collect connectivity information
    * while sendstage = 0. (Can't use rgp_pkt conn.matrix,
    * because we don't want to see our info until we get
    * the first timer tick.)
    */
   connectivity_matrix_t internal_connectivity_matrix;

   /* The rest of the struct is an OS-specific substruct
    * (defined in wrgpos.h).
    */
   OS_specific_rgp_control_t OS_specific_control;

} rgp_control_t;

/*---------------------------------------------------------------------------*/
/* Procedures exported by the Cluster type implementation                    */
/*                                                                           */
/* NOTE(review): only the prototypes are visible here. The names suggest the */
/* usual set operations on cluster_t bit arrays (dst receiving the result of */
/* combining src1 and src2, membership test / insert / delete of one node,   */
/* and so on); consult the implementation for exact semantics and for the    */
/* meaning of the int return values. ClusterEmpty is the only routine        */
/* declared without the _priv _resident attributes - confirm that this is    */
/* intended.                                                                 */

_priv _resident extern void
ClusterInit(cluster_t c);
_priv _resident extern void
ClusterUnion(cluster_t dst, cluster_t src1, cluster_t src2);
_priv _resident extern void
ClusterIntersection(cluster_t dst, cluster_t src1, cluster_t src2);
_priv _resident extern void
ClusterDifference(cluster_t dst, cluster_t src1, cluster_t src2);
_priv _resident extern int
ClusterCompare(cluster_t c1, cluster_t c2);
_priv _resident extern int
ClusterSubsetOf(cluster_t big, cluster_t small);
_priv _resident extern void
ClusterComplement(cluster_t dst, cluster_t src);
_priv _resident extern int
ClusterMember(cluster_t c, node_t i);
_priv _resident extern void
ClusterInsert(cluster_t c, node_t i);
_priv _resident extern void
ClusterDelete(cluster_t c, node_t i);
_priv _resident extern void
ClusterCopy(cluster_t dst, cluster_t src);
_priv _resident extern void
ClusterSwap(cluster_t c1, cluster_t c2);
_priv _resident extern int
ClusterNumMembers(cluster_t c);
extern int
ClusterEmpty(cluster_t c);

/*---------------------------------------------------------------------------*/
/* Function to select the tie-breaker node used in both the split-brain
 * avoidance and node pruning algorithms
 */
_priv _resident extern node_t
rgp_select_tiebreaker(cluster_t cluster);

/*---------------------------------------------------------------------------*/
/* Procedures exported by the node pruning algorithm */

_priv _resident extern void
MatrixInit(connectivity_matrix_t c);
/* Initialize the matrix c to show 0 connectivity. */

_priv _resident extern void
MatrixSet(connectivity_matrix_t c, int row, int column);
/* Set c[row,column] to 1. */

_priv _resident extern void
MatrixOr(connectivity_matrix_t t, connectivity_matrix_t s);
/* OR in s into t. */

_priv _resident extern int
connectivity_complete(connectivity_matrix_t c);
/* Returns 1 if all live nodes are connected to all other live nodes
 * and 0 if there is at least one disconnect.
 */

_priv _resident extern int
find_all_fully_connected_groups(connectivity_matrix_t c,
                                node_t selected_node,
                                cluster_t groups[]);
/* Analyzes the connectivity matrix and comes up with the list of
 * all maximal, fully-connected groups. Returns the number of
 * such groups found. 0 is returned iff there are no live nodes.
 */

/*---------------------------------------------------------------------------*/
/* Declaration of Regroup's global data structure                            */
/*                                                                           */
/* Under NSK, "rgp" is a macro that reads the control block address through  */
/* MSGROOT; otherwise it is an ordinary global pointer defined elsewhere.    */
/*                                                                           */
/* NOTE(review): the header name of the #include in the NSK branch is        */
/* missing in this copy of the file; it must be the header that provides     */
/* MSGROOT - restore it from the original source.                            */

#ifdef NSK
#include
#define rgp ((rgp_control_t *) MSGROOT->RegroupControlAddr)
#else
extern rgp_control_t *rgp;
#endif /* NSK */
/*---------------------------------------------------------------------------*/

#ifdef __cplusplus
}
#endif /* __cplusplus */


#if 0

History of changes to this file:
-------------------------------------------------------------------------
1995, December 13                                           F40:KSK0610          /*F40:KSK06102.1*/

This file is part of the portable Regroup Module used in the NonStop
Kernel (NSK) and Loosely Coupled UNIX (LCU) operating systems. There
are 10 files in the module - jrgp.h, jrgpos.h, wrgp.h, wrgpos.h,
srgpif.c, srgpos.c, srgpsm.c, srgputl.c, srgpcli.c and srgpsvr.c.
The last two are simulation files to test the Regroup Module on a
UNIX workstation in user mode with processes simulating processor nodes
and UDP datagrams used to send unacknowledged datagrams.

This file was first submitted for release into NSK on 12/13/95.
------------------------------------------------------------------------------

#endif    /* 0 - change descriptions */


#endif /* _WRGP_H_ defined */