Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1224 lines
40 KiB

  1. #ifdef __TANDEM
  2. #pragma columns 79
  3. #pragma page "srgpif.c - T9050 - interface routines for Regroup Module"
  4. #endif
  5. /* @@@ START COPYRIGHT @@@
  6. ** Tandem Confidential: Need to Know only
  7. ** Copyright (c) 1995, Tandem Computers Incorporated
  8. ** Protected as an unpublished work.
  9. ** All Rights Reserved.
  10. **
  11. ** The computer program listings, specifications, and documentation
  12. ** herein are the property of Tandem Computers Incorporated and shall
  13. ** not be reproduced, copied, disclosed, or used in whole or in part
  14. ** for any reason without the prior express written permission of
  15. ** Tandem Computers Incorporated.
  16. **
  17. ** @@@ END COPYRIGHT @@@
  18. **/
  19. /*---------------------------------------------------------------------------
  20. * This file (srgpif.c) contains all the external interface routines
  21. * of Regroup.
  22. *---------------------------------------------------------------------------*/
  23. #ifdef __cplusplus
  24. extern "C" {
  25. #endif /* __cplusplus */
  26. #include <wrgp.h>
  27. /************************************************************************
  28. * rgp_estimate_memory
  29. * ===================
  30. *
  31. * Description:
  32. *
  33. * Routine to find the number of bytes of memory needed by regroup.
  34. * The only global memory used by Regroup is for the rgp_control structure.
  35. * The caller must allocate and zero out a chunk of this much memory
  36. * and then call rgp_init() with a pointer to this memory.
  37. *
  38. * Parameters:
  39. *
  40. * None
  41. *
  42. * Returns:
  43. *
  44. * int - number of bytes of locked down and initialized (to 0) memory
  45. * needed by Regroup. The memory must be 4-byte aligned.
  46. *
  47. * Algorithm:
  48. *
  49. * Uses the size of the rgp_control_t to calculate the number of
  50. * bytes needed.
  51. *
  52. ************************************************************************/
  53. _priv _resident int
  54. RGP_ESTIMATE_MEMORY(void)
  55. {
  56. return(sizeof(rgp_control_t));
  57. }
  58. /************************************************************************
  59. * rgp_init
  60. * ========
  61. *
  62. * Description:
  63. *
  64. * Routine to initialize the global Regroup data structures.
  65. *
  66. * Parameters:
  67. *
  68. * node_t this_node -
  69. * node number of local node; regroup uses bit masks to represent
  70. * nodes in the cluster and starts numbering nodes from 0. The OS
  71. * starts numbering at LOWEST_NODENUM. This transformation is
  72. * maintained in all the regroup interfaces to the OS.
  73. *
  74. * unsigned int num_nodes -
  75. * number of nodes in the configured node number space =
  76. * (largest configured node number - LOWEST_NODENUM + 1).
  77. *
  78. * void *rgp_buffer -
  79. * pointer to a block of locked down memory initialized to 0; this is
  80. * for use by Regroup as its global memory; must be 4-byte aligned
  81. *
  82. * int rgp_buflen -
  83. * length in bytes of the locked down buffer *rgp_buffer; must be equal
  84. * to or greater than the number returned by rgp_estimate_memory()
  85. *
  86. * rgp_msgsys_p rgp_msgsys_p -
  87. * pointer to a common struct used by the message system and
  88. * Regroup to co-ordinate regroup related work
  89. *
  90. * Returns:
  91. *
  92. * void - no return value
  93. *
  94. * Algorithm:
  95. *
  96. * Initializes the Regroup global data structure with default initial
  97. * values and the parameters passed in.
  98. *
  99. ************************************************************************/
  100. _priv _resident void
  101. RGP_INIT(node_t this_node, unsigned int num_nodes,
  102. void *rgp_buffer, int rgp_buflen,
  103. rgp_msgsys_p rgp_msgsys_p)
  104. {
  105. this_node = INT_NODE(this_node); /* adjust the node number by the offset */
  106. if ((num_nodes > MAX_CLUSTER_SIZE) ||
  107. (this_node >= (node_t) num_nodes) ||
  108. (rgp_buflen < rgp_estimate_memory()) /* buffer too small */ ||
  109. ((ULONG_PTR)rgp_buffer % 4) /* buffer not 4-byte aligned */
  110. )
  111. RGP_ERROR(RGP_INTERNAL_ERROR);
  112. #ifdef NSK
  113. /* In NSK, the caller must set up the global rgp pointer. */
  114. #else
  115. rgp = (rgp_control_t *) rgp_buffer;
  116. #endif /* NSK */
  117. rgp->num_nodes = num_nodes; /* # of nodes configured */
  118. rgp->rgp_msgsys_p = rgp_msgsys_p; /* ptr to struct shared with Msgsys */
  119. rgp->mynode = this_node;
  120. #if defined (NT)
  121. /* Initialize RGP_LOCK, the CRITICALSECTION object that will be used
  122. * to synchronize access within the regroup procedures */
  123. InitializeCriticalSection( &rgp->OS_specific_control.RgpCriticalSection );
  124. #endif
  125. RGP_CLEANUP();
  126. /* We place a bit pattern in the IamAlive packet. This bit
  127. * pattern toggles all the bits.
  128. */
  129. rgp->iamalive_pkt.testpattern.words[0] = 0x0055FF6D;
  130. rgp->iamalive_pkt.testpattern.words[1] = 0x92CC33E3;
  131. rgp->iamalive_pkt.testpattern.words[2] = 0x718E49F0;
  132. rgp->iamalive_pkt.testpattern.words[3] = 0x92CC33E3;
  133. rgp->iamalive_pkt.testpattern.words[4] = 0x0055FF6D;
  134. rgp->iamalive_pkt.testpattern.words[5] = 0x0055FF6D;
  135. rgp->iamalive_pkt.testpattern.words[6] = 0x92CC33E3;
  136. rgp->iamalive_pkt.testpattern.words[7] = 0x718E49F0;
  137. rgp->iamalive_pkt.testpattern.words[8] = 0x92CC33E3;
  138. rgp->iamalive_pkt.testpattern.words[9] = 0x0055FF6D;
  139. rgp->iamalive_pkt.testpattern.words[10] = 0x55AA55AA;
  140. rgp->iamalive_pkt.testpattern.words[11] = 0x55AA55AA;
  141. rgp->iamalive_pkt.testpattern.words[12] = 0x55AA55AA;
  142. rgp->poison_pkt.pktsubtype = RGP_UNACK_POISON;
  143. rgp_init_OS(); /* OS-specific initializations */
  144. rgp_cleanup_OS(); /* OS-specific cleanup */
  145. /* Trace the call after the data structures have been initialized. */
  146. RGP_TRACE( "RGP Init called ", EXT_NODE(this_node), num_nodes,
  147. PtrToUlong(rgp_buffer), PtrToUlong(rgp_msgsys_p) ); /* TRACE */
  148. }
  149. /**************************************************************************
  150. * rgp_cleanup
  151. * ===========
  152. * Description:
  153. *
  154. * This function cleans up the RGP structure such that this node is
  155. * virtually returned to the state following RGP_INIT and ready to be
  156. * "join"ed into the cluster.
  157. *
  158. * Parameters:
  159. *
  160. * None
  161. *
  162. * Returns:
  163. *
  164. * None
  165. **************************************************************************/
  166. _priv _resident void
  167. RGP_CLEANUP(void)
  168. {
  169. node_t i;
  170. RGP_LOCK;
  171. /* Initialize the state of all possible nodes in the cluster. */
  172. for (i = 0; i < (node_t) rgp->num_nodes; i++)
  173. {
  174. rgp->node_states[i].status = RGP_NODE_DEAD;
  175. rgp->node_states[i].pollstate = AWAITING_IAMALIVE;
  176. rgp->node_states[i].lostHBs = 0;
  177. #if defined( NT )
  178. ClusnetSetNodeMembershipState(NmClusnetHandle,
  179. EXT_NODE( i ),
  180. ClusnetNodeStateDead);
  181. #endif // NT
  182. }
  183. for (i = (node_t)rgp->num_nodes; i < MAX_CLUSTER_SIZE; i++)
  184. {
  185. rgp->node_states[i].status = RGP_NODE_NOT_CONFIGURED;
  186. rgp->node_states[i].pollstate = AWAITING_IAMALIVE;
  187. rgp->node_states[i].lostHBs = 0;
  188. #if defined( NT )
  189. ClusnetSetNodeMembershipState(NmClusnetHandle,
  190. EXT_NODE( i ),
  191. ClusnetNodeStateNotConfigured);
  192. #endif // NT
  193. }
  194. rgp->rgpinfo.version = RGP_VERSION;
  195. rgp->rgpinfo.seqnum = RGP_INITSEQNUM;
  196. rgp->rgpinfo.iamalive_ticks = RGP_IAMALIVE_TICKS;
  197. rgp->rgpinfo.check_ticks = RGP_CHECK_TICKS;
  198. rgp->rgpinfo.Min_Stage1_ticks = RGP_MIN_STAGE1_TICKS;
  199. rgp->rgpinfo.a_tick = RGP_INACTIVE_PERIOD;
  200. ClusterInit(rgp->rgpinfo.cluster);
  201. rgp->rgppkt.stage = RGP_COLDLOADED;
  202. rgp->rgpcounter = 0;
  203. rgp->restartcount = 0;
  204. rgp->tiebreaker = rgp->mynode;
  205. /* Initialize the unacknowledged packet buffers */
  206. rgp->rgppkt.pktsubtype = RGP_UNACK_REGROUP;
  207. rgp->rgppkt.seqno = rgp->rgpinfo.seqnum;
  208. rgp->last_stable_seqno = rgp->rgpinfo.seqnum;
  209. ClusterCopy(rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster);
  210. ClusterCopy(rgp->outerscreen, rgp->rgpinfo.cluster);
  211. #if defined( NT )
  212. ClusnetSetOuterscreen( NmClusnetHandle, (ULONG)*((PUSHORT)rgp->outerscreen) );
  213. #endif
  214. ClusterCopy(rgp->innerscreen, rgp->rgpinfo.cluster);
  215. ClusterCopy(rgp->rgppkt.knownstage1, rgp->rgpinfo.cluster);
  216. ClusterCopy(rgp->rgppkt.knownstage2, rgp->rgpinfo.cluster);
  217. ClusterCopy(rgp->rgppkt.knownstage3, rgp->rgpinfo.cluster);
  218. ClusterCopy(rgp->rgppkt.knownstage4, rgp->rgpinfo.cluster);
  219. ClusterCopy(rgp->rgppkt.knownstage5, rgp->rgpinfo.cluster);
  220. ClusterCopy(rgp->rgppkt.pruning_result, rgp->rgpinfo.cluster);
  221. MatrixInit(rgp->rgppkt.connectivity_matrix);
  222. rgp->rgppkt_to_send.pktsubtype = RGP_UNACK_REGROUP;
  223. rgp->iamalive_pkt.pktsubtype = RGP_UNACK_IAMALIVE;
  224. RGP_UNLOCK;
  225. }
  226. /***************************************************************************
  227. * rgp_sequence_number
  228. * ===================
  229. * Description:
  230. *
  231. * This function returns the regroup sequence number.
  232. *
  233. * This provides only a subset of the functionality provided by
  234. * rgp_getrgpinfo(), but is a simpler function and has no structure
  235. * parameters, making it easier to call from PTAL.
  236. *
  237. * A regroup incident could be in progress when this routine is
  238. * called.
  239. *
  240. * Parameters:
  241. *
  242. * None
  243. *
  244. * Returns:
  245. *
  246. * uint32 - the current regroup sequence number; this reflects
  247. * how many regroup incidents have happened since
  248. * the system came up. Since one incident can result in
  249. * up to RGP_RESTART_MAX restarts each resulting in the
  250. * sequence # being bumped, this number does not always
  251. * equal the number of regroup incidents.
  252. *
  253. ***************************************************************************/
  254. _priv _resident uint32
  255. RGP_SEQUENCE_NUMBER(void)
  256. {
  257. return(rgp->rgpinfo.seqnum);
  258. }
  259. /************************************************************************
  260. * rgp_getrgpinfo
  261. * ==============
  262. *
  263. * Description:
  264. *
  265. * Routine to get Regroup parameters.
  266. *
  267. * Parameters:
  268. *
  269. * rgpinfo_t *rgpinfo - pointer to struct to be filled with Regroup
  270. * parameters.
  271. *
  272. * Returns:
  273. *
  274. * int - 0 if successful; -1 if Regroup is perturbed.
  275. *
  276. * Algorithm:
  277. *
  278. * Copies the rgpinfo struct from the Regroup global memory into the
  279. * struct passed in by the caller.
  280. *
  281. ************************************************************************/
  282. _priv _resident int
  283. RGP_GETRGPINFO(rgpinfo_t *rgpinfo)
  284. {
  285. int error = 0;
  286. /* If no rgpinfo structure is passed OR rgp_init() has not been called
  287. * earlier, halt.
  288. */
  289. if ((rgpinfo == RGP_NULL_PTR) || (rgp == RGP_NULL_PTR))
  290. RGP_ERROR( RGP_INTERNAL_ERROR );
  291. RGP_LOCK;
  292. if (rgp_is_perturbed())
  293. error = -1;
  294. else
  295. /* Copy the rgpinfo structure from regroup's internal struct. */
  296. *rgpinfo = rgp->rgpinfo;
  297. RGP_UNLOCK;
  298. return(error);
  299. }
  300. /************************************************************************
  301. * rgp_setrgpinfo
  302. * ==============
  303. *
  304. * Description:
  305. *
  306. * Routine to set Regroup parameters. This routine is to be called on
  307. * newly booting nodes to set the Regroup parameters to the values
  308. * in the master or reloading node. The parameters to be updated
  309. * include Regroup timing parameters and the cluster membership;
  310. * that is, the current set of nodes in the system.
  311. *
  312. * This routine can also be called on the first node to boot to
  313. * modify the Regroup timing parameters which are set to the default
  314. * values when rgp_init() is called. Such modification has to be done
  315. * before other nodes are added to the system.
  316. *
  317. * Parameters:
  318. *
  319. * rgpinfo_t *rgpinfo - pointer to struct with Regroup parameters to
  320. * be modified.
  321. *
  322. * Returns:
  323. *
  324. * int - 0 if successful; -1 if there is more than one node in the
  325. * cluster. This is to prevent modification of timing parameters
  326. * after the second node is added to the system.
  327. *
  328. * Algorithm:
  329. *
  330. * Copies the contents of the user-passed struct into the one in the
  331. * Regroup global memory and updates related parameters.
  332. *
  333. ************************************************************************/
  334. _priv _resident int
  335. RGP_SETRGPINFO(rgpinfo_t *rgpinfo)
  336. {
  337. int error = 0;
  338. node_t i;
  339. /* If no rgpinfo structure is passed OR the version # of the
  340. * structure is not understood OR rgp_init() has not been called,
  341. * halt.
  342. */
  343. if ((rgpinfo == RGP_NULL_PTR) ||
  344. (rgpinfo->version != RGP_VERSION) ||
  345. (rgp == RGP_NULL_PTR))
  346. RGP_ERROR( RGP_INTERNAL_ERROR );
  347. RGP_LOCK;
  348. /* The following checks must be made before proceeding:
  349. *
  350. * 1. Regroup must not be perturbed.
  351. *
  352. * 2. If rgp_start() has been called (regroup is in the
  353. * RGP_STABILIZED state), only the local node must be in the
  354. * cluster when this routine is called.
  355. *
  356. * 3. If rgp_start() has been called, this routine can be used
  357. * only to modify the timing parameters and not to specify the
  358. * cluster.
  359. *
  360. * If these restrictions are not followed, return -1.
  361. */
  362. RGP_TRACE( "RGP SetRGPInfo ",
  363. rgpinfo->version, /* TRACE */
  364. rgpinfo->seqnum, /* TRACE */
  365. rgpinfo->iamalive_ticks, /* TRACE */
  366. GetCluster( rgpinfo->cluster ) );/* TRACE */
  367. if ( rgp_is_perturbed() ||
  368. ( (rgp->rgppkt.stage == RGP_STABILIZED) &&
  369. ( (ClusterNumMembers(rgp->rgpinfo.cluster) > 1) ||
  370. !ClusterCompare(rgp->rgpinfo.cluster,rgpinfo->cluster)
  371. )
  372. )
  373. )
  374. error = -1;
  375. else
  376. {
  377. /* Copy the rgpinfo structure into regroup's internal struct. */
  378. rgp->rgpinfo = *rgpinfo;
  379. /* If iamalive_ticks is set to 0, use the default value instead. */ /*F40:KSK06102.2*/
  380. if (rgpinfo->iamalive_ticks == 0) /*F40:KSK06102.3*/
  381. rgp->rgpinfo.iamalive_ticks = RGP_IAMALIVE_TICKS; /*F40:KSK06102.4*/
  382. /*F40:KSK06102.5*/
  383. if (rgpinfo->check_ticks == 0)
  384. {
  385. rgp->rgpinfo.check_ticks = RGP_CHECK_TICKS;
  386. }
  387. if (rgpinfo->Min_Stage1_ticks == 0)
  388. rgp->rgpinfo.Min_Stage1_ticks =
  389. (rgp->rgpinfo.iamalive_ticks * rgp->rgpinfo.check_ticks);
  390. if (rgpinfo->a_tick == 0)
  391. rgp->rgpinfo.a_tick = RGP_CLOCK_PERIOD;
  392. // Tell Timer thread to restart RGP timer
  393. SetEvent (rgp->OS_specific_control.TimerSignal);
  394. /* The cluster should include the local node even if the cluster
  395. * field in the rgpinfo structure does not include it.
  396. */
  397. ClusterInsert(rgp->rgpinfo.cluster, rgp->mynode);
  398. /* Copy the sequence number into the regroup packet area. */
  399. rgp->rgppkt.seqno = rgp->rgpinfo.seqnum;
  400. /* If nodes have been added in the cluster field, they must be
  401. * added to all the screens and their status must be set to
  402. * alive.
  403. */
  404. ClusterCopy(rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster);
  405. ClusterCopy(rgp->outerscreen, rgp->rgpinfo.cluster);
  406. #if defined( NT )
  407. ClusnetSetOuterscreen( NmClusnetHandle, (ULONG)*((PUSHORT)rgp->outerscreen) );
  408. ClusterComplement(rgp->ignorescreen, rgp->outerscreen);
  409. #endif
  410. ClusterCopy(rgp->innerscreen, rgp->rgpinfo.cluster);
  411. ClusterCopy(rgp->rgppkt.knownstage1, rgp->rgpinfo.cluster);
  412. ClusterCopy(rgp->rgppkt.knownstage2, rgp->rgpinfo.cluster);
  413. ClusterCopy(rgp->rgppkt.knownstage3, rgp->rgpinfo.cluster);
  414. ClusterCopy(rgp->rgppkt.knownstage4, rgp->rgpinfo.cluster);
  415. ClusterCopy(rgp->rgppkt.knownstage5, rgp->rgpinfo.cluster);
  416. ClusterCopy(rgp->rgppkt.pruning_result, rgp->rgpinfo.cluster);
  417. rgp->tiebreaker = rgp_select_tiebreaker(rgp->rgpinfo.cluster);
  418. for (i = 0; i < (node_t) rgp->num_nodes; i++)
  419. {
  420. if (ClusterMember(rgp->rgpinfo.cluster, i))
  421. {
  422. rgp->node_states[i].pollstate = IAMALIVE_RECEIVED;
  423. rgp->node_states[i].status = RGP_NODE_ALIVE;
  424. #if defined( NT )
  425. ClusnetSetNodeMembershipState(NmClusnetHandle,
  426. EXT_NODE( i ),
  427. ClusnetNodeStateAlive);
  428. #endif // NT
  429. }
  430. }
  431. /* Reset the clock counter so that IamAlives are sent when
  432. * the next timer tick arrives.
  433. */
  434. rgp->clock_ticks = 0;
  435. }
  436. RGP_UNLOCK;
  437. return(error);
  438. }
  439. /************************************************************************
  440. * rgp_start
  441. * =========
  442. *
  443. * Description:
  444. *
  445. * This routine signals the end of node integration into the cluster.
  446. * The node can now start participating in the Regroup algorithm.
  447. *
  448. * Parameters:
  449. *
  450. * void (*rgp_node_failed)()
  451. * pointer to a routine to be called when a node failure is
  452. * detected.
  453. *
  454. * int (*rgp_select_cluster)()
  455. * pointer to an optional routine to be called when link failures
  456. * cause multiple alternative clusters to be formed. This routine
  457. * should select one from a list of suggested clusters.
  458. *
  459. * Returns:
  460. *
  461. * void - no return value
  462. *
  463. * Algorithm:
  464. *
  465. * Installs the callback routines in the global data structure and
  466. * changes the Regroup state to RGP_STABILIZED.
  467. *
  468. ************************************************************************/
  469. _priv _resident void
  470. RGP_START(void (*nodedown_callback)(cluster_t failed_nodes),
  471. int (*select_cluster)(cluster_t cluster_choices[], int num_clusters)
  472. )
  473. {
  474. if (rgp == RGP_NULL_PTR)
  475. RGP_ERROR( RGP_INTERNAL_ERROR );
  476. RGP_LOCK;
  477. RGP_TRACE( "RGP Start called",
  478. rgp->rgppkt.stage, /* TRACE */
  479. PtrToUlong(nodedown_callback), /* TRACE */
  480. PtrToUlong(select_cluster), /* TRACE */
  481. 0 ); /* TRACE */
  482. /* Install callback routines for node failure notification and cluster
  483. * selection. If no routine is given by the caller, use default ones.
  484. */
  485. if (nodedown_callback == RGP_NULL_PTR)
  486. {
  487. #ifdef NSK
  488. /* In NSK, rgp_start() is called from pTAL code and passing routine
  489. * addresses is cumbersome. So, RGP_NULL_PTR is passed and we
  490. * call the routine rgp_node_failed() which must be supplied by
  491. * the message system.
  492. */
  493. rgp->nodedown_callback = rgp_node_failed; /* hardcoded name */
  494. #else
  495. /* A node down callback routine must be supplied. */
  496. RGP_ERROR( RGP_INTERNAL_ERROR );
  497. #endif /* NSK */
  498. }
  499. else
  500. rgp->nodedown_callback = nodedown_callback;
  501. #if 0
  502. /* The select cluster routine is optional. */
  503. if (select_cluster == RGP_NULL_PTR)
  504. rgp->select_cluster = rgp_select_cluster; /* supplied by regroup */
  505. else
  506. #endif
  507. //
  508. // Calling rgp_select_cluster is
  509. // not a good idea since it doesn't take into the consideration
  510. // quorum owner node.
  511. // If rgp->select_cluster == RGP_NULL_PTR, then srgpsm.c uses
  512. // rgp_select_cluster_ex, that will try to select the group
  513. // that contain the current quorum owner node
  514. rgp->select_cluster = select_cluster;
  515. #if defined(NT)
  516. /* Call the node up callback. This is where the local node gets
  517. * the node up callback for itself coming up. Other nodes call
  518. * the callback, for this node coming up, in rgp_monitor_node.
  519. */
  520. ClusterInsert(rgp->rgpinfo.cluster, rgp->mynode);
  521. ClusterCopy(rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster);
  522. if ( rgp->OS_specific_control.UpDownCallback != RGP_NULL_PTR )
  523. {
  524. (*(rgp->OS_specific_control.UpDownCallback))(
  525. EXT_NODE(rgp->mynode),
  526. NODE_UP
  527. );
  528. }
  529. #endif /* NT */
  530. RGP_UNLOCK;
  531. }
  532. /************************************************************************
  533. * rgp_add_node
  534. * ============
  535. *
  536. * Description:
  537. *
  538. * Called to add a newly booting node to the regroup masks. This prevents
  539. * Regroup from sending poison packets to the new node when it tries to
  540. * contact our node by sending IamAlive messages.
  541. *
  542. * Parameters:
  543. *
  544. * node_t node - node to be added to the recognition masks
  545. *
  546. * Returns:
  547. *
  548. * int - 0 on success and -1 on failure. The routine fails only if a
  549. * regroup incident is in progress.
  550. *
  551. * Algorithm:
  552. *
  553. * The node is added to all the recognition masks and its state is
  554. * changed to RGP_NODE_COMING_UP.
  555. *
  556. ************************************************************************/
  557. _priv _resident int
  558. RGP_ADD_NODE(node_t node)
  559. {
  560. int error = 0;
  561. RGP_LOCK;
  562. RGP_TRACE( "RGP Add node ", node, rgp->rgppkt.stage,
  563. GetCluster(rgp->outerscreen), /* TRACE */
  564. GetCluster(rgp->rgpinfo.cluster) ); /* TRACE */
  565. /* Cannot add a node while regroup is perturbed. Return -1 in that case.
  566. * The new node booting should fail due to the regroup incident anyway.
  567. */
  568. if (rgp_is_perturbed())
  569. error = -1;
  570. else
  571. {
  572. node = INT_NODE(node); /* adjust the node number by the offset */
  573. ClusterInsert(rgp->outerscreen, node);
  574. #if defined( NT )
  575. ClusnetSetOuterscreen( NmClusnetHandle, (ULONG)*((PUSHORT)rgp->outerscreen) );
  576. #endif
  577. ClusterInsert(rgp->innerscreen, node);
  578. ClusterInsert(rgp->rgppkt.knownstage1, node);
  579. ClusterInsert(rgp->rgppkt.knownstage2, node);
  580. ClusterInsert(rgp->rgppkt.knownstage3, node);
  581. ClusterInsert(rgp->rgppkt.knownstage4, node);
  582. ClusterInsert(rgp->rgppkt.knownstage5, node);
  583. ClusterInsert(rgp->rgppkt.pruning_result, node);
  584. rgp->node_states[node].pollstate = AWAITING_IAMALIVE;
  585. rgp->node_states[node].status = RGP_NODE_COMING_UP;
  586. rgp->node_states[node].lostHBs = 0;
  587. #if defined( NT )
  588. ClusterDelete( rgp->OS_specific_control.Banished, node );
  589. //
  590. // Remove joining node from ignore screen
  591. //
  592. ClusterDelete( rgp->ignorescreen, node );
  593. PackIgnoreScreen(&rgp->rgppkt, rgp->ignorescreen);
  594. ClusnetSetNodeMembershipState(NmClusnetHandle,
  595. EXT_NODE( node ),
  596. ClusnetNodeStateJoining);
  597. #endif // NT
  598. }
  599. RGP_UNLOCK;
  600. return(error);
  601. }
  602. /************************************************************************
  603. * rgp_monitor_node
  604. * ================
  605. *
  606. * Description:
  607. *
  608. * Called by all running nodes to change the status of a newly booted node
  609. * to UP. Can be called by the new node also; it is a no-op in this case.
  610. *
  611. * Parameters:
  612. *
  613. * node_t node - number of node being declared up
  614. *
  615. * Returns:
  616. *
  617. * int - 0 on success and -1 on failure. The routine fails only if the
  618. * state of the node is neither RGP_NODE_COMING_UP nor RGP_NODE_ALIVE.
  619. *
  620. * Algorithm:
  621. *
  622. * If the node is marked coming up, its state is changed to
  623. * RGP_NODE_ALIVE. If the node has already been marked up,
  624. * nothing is done.
  625. *
  626. ************************************************************************/
  627. _priv _resident int
  628. RGP_MONITOR_NODE(node_t node)
  629. {
  630. int error = 0;
  631. RGP_LOCK;
  632. RGP_TRACE( "RGP Monitor node", node, rgp->rgppkt.stage,
  633. GetCluster(rgp->outerscreen), /* TRACE */
  634. GetCluster(rgp->rgpinfo.cluster) ); /* TRACE */
  635. node = INT_NODE(node); /* adjust the node number by the offset */
  636. /* Accept the request only if the state of the node is COMING_UP or UP. */
  637. if (rgp->node_states[node].status == RGP_NODE_COMING_UP)
  638. {
  639. ClusterInsert(rgp->rgpinfo.cluster, node);
  640. rgp->tiebreaker = rgp_select_tiebreaker(rgp->rgpinfo.cluster);
  641. rgp->node_states[node].pollstate = IAMALIVE_RECEIVED;
  642. rgp->node_states[node].status = RGP_NODE_ALIVE;
  643. #if defined(NT)
  644. ClusterCopy(rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster);
  645. ClusnetSetNodeMembershipState(NmClusnetHandle,
  646. EXT_NODE( node ),
  647. ClusnetNodeStateAlive);
  648. /* A node came up. Call the node up callback. */
  649. if ( rgp->OS_specific_control.UpDownCallback != RGP_NULL_PTR )
  650. {
  651. (*(rgp->OS_specific_control.UpDownCallback))(
  652. EXT_NODE(node),
  653. NODE_UP
  654. );
  655. }
  656. #endif /* NT */
  657. }
  658. else if (rgp->node_states[node].status != RGP_NODE_ALIVE)
  659. /* Perhaps the booting node failed and regroup has already marked
  660. * it down. The cluster manager may have invoked a global update
  661. * resulting in this call before regroup reporetd the failure
  662. * of the node.
  663. */
  664. error = -1;
  665. RGP_UNLOCK;
  666. return(error);
  667. }
  668. /************************************************************************
  669. * rgp_remove_node
  670. * ===============
  671. *
  672. * Description:
  673. *
  674. * Called by the cluster manager to force out a booting node if booting
  675. * fails. Regroup may or may not have already removed the booting node
  676. * from the masks and declared it down, depending on what stage the
  677. * booting is in and when the booting node failed.
  678. *
  679. * Regroup can remove the node from the masks of all nodes in the cluster
  680. * by simply starting a new incident of regroup with any event code. This
  681. * will force all nodes to come to an agreement on cluster membership
  682. * that excludes the booting node. If the booting node is alive, it will
  683. * commit suicide since it will be in the incompetent (RGP_COLDLOADED)
  684. * state.
  685. *
  686. * Removing the new node from our masks is not necessary since regroup
  687. * will detect the node failure and adjust the masks. If we do remove it
  688. * from our masks BEFORE initiating regroup, regroup may complete quicker
  689. * since we will not wait in stage 1 for the node to check in. Also, this
  690. * could allow a node to be removed even after it is fully integrated.
  691. * This is because our node will send a poison packet to the removed node
  692. * if it tries to contact us.
  693. *
  694. * But this "enhancement" is not implemented because it requires a new
  695. * regroup event code which is examined by all nodes and processed
  696. * specially. Currently, the regroup event code is used only for
  697. * debugging info. Also, there is no guarantee that all nodes see the
  698. * same regroup reason code. For instance, some may see a missing
  699. * IamAlive while others may see a power failure.
  700. *
  701. * Parameters:
  702. *
  703. * node_t node - node to be removed from the recognition masks
  704. * (in external format).
  705. *
  706. * Returns:
  707. *
  708. * int - 0 on success and -1 on failure. The routine fails if a
  709. * regroup incident is in progress or rgp_start() has not been
  710. * called (as in a new node where the booting is not complete).
  711. *
  712. * Algorithm:
  713. *
  714. * If the node is still in the recognition masks, a new regroup incident
  715. * is started. This incident will result in all nodes declaring the node
  716. * dead and removing it from the recognition masks.
  717. *
  718. ************************************************************************/
  719. _priv _resident int
  720. RGP_REMOVE_NODE(node_t node)
  721. {
  722. int error = 0;
  723. RGP_LOCK;
  724. RGP_TRACE( "RGP Remove node ", node, rgp->rgppkt.stage,
  725. GetCluster(rgp->outerscreen), /* TRACE */
  726. GetCluster(rgp->rgpinfo.cluster) ); /* TRACE */
  727. if (rgp->rgppkt.stage == RGP_STABILIZED)
  728. {
  729. if (ClusterMember(rgp->outerscreen, INT_NODE(node)))
  730. {
  731. /* Node is currently in our screen. The node may have never come up
  732. * after rgp_add_node() was called OR regroup may not have figured
  733. * out yet that the node is down. In either case, the node must
  734. * be forced out and all nodes in the cluster notified (by a regroup
  735. * incident). If the node is still running, it will commit suicide
  736. * when this regroup incident starts.
  737. */
  738. rgp_event_handler(RGP_EVT_LATEPOLLPACKET, node);
  739. }
  740. else
  741. {
  742. /* Either the node was not added to the cluster OR regroup has
  743. * already figured out that the node is dead and reported this.
  744. * In either case, there is nothing more to do.
  745. */
  746. }
  747. }
  748. else
  749. error = -1;
  750. RGP_UNLOCK;
  751. return(error);
  752. }
  753. /************************************************************************
  754. * rgp_is_perturbed
  755. * ================
  756. *
  757. * Description:
  758. *
  759. * Function to check if a regroup incident is in progress.
  760. *
  761. * Parameters:
  762. *
  763. * None.
  764. *
  765. * Returns:
  766. *
  767. * int - 0 if regroup is quiescent; non-zero if a regroup incident
  768. * is in progress.
  769. *
  770. * Algorithm:
  771. *
  772. * Looks at the current state of the Regroup algorithm.
  773. *
  774. ************************************************************************/
  775. _priv _resident int
  776. RGP_IS_PERTURBED(void)
  777. {
  778. uint8 stage = rgp->rgppkt.stage;
  779. return((stage != RGP_STABILIZED) && (stage != RGP_COLDLOADED));
  780. }
  781. /************************************************************************
  782. * rgp_periodic_check
  783. * ==================
  784. *
  785. * Description:
  786. *
  787. * This routine is invoked every RGP_CLOCK_PERIOD by the timer interrupt
  788. * handler of the native OS. It performs Regroup's periodic operations.
  789. *
  790. * Parameters:
  791. *
  792. * None
  793. *
  794. * Returns:
  795. *
  796. * void - no return value
  797. *
  798. * Algorithm:
  799. *
  800. * This routine requests Iamalive packets to be sent, checks if
  801. * IamAlives have been received (and calls rgp_event_handler() if
  802. * not) and sends a clock tick to the regroup algorithm if it is in
  803. * progress.
  804. *
  805. * IamAlives are checked at twice the IamAlive period. The regroup
  806. * global variable clock_ticks is incremented in each call. After
  807. * the IamAlives are checked, clock_ticks is reset to 0. Thus, the
  808. * ticker counts time modulo twice the IamAlive ticks.
  809. *
  810. ************************************************************************/
  811. _priv _resident void
  812. RGP_PERIODIC_CHECK(void)
  813. {
  814. node_t node;
  815. RGP_LOCK;
  816. /* If regroup is active, give it a shot at each regroup clock tick. */
  817. if ((rgp->rgppkt.stage != RGP_STABILIZED) &&
  818. (rgp->rgppkt.stage != RGP_COLDLOADED))
  819. rgp_event_handler(RGP_EVT_CLOCK_TICK, RGP_NULL_NODE);
  820. #if !defined( NT )
  821. /* Send IamAlive messages at appropriate intervals. */
  822. if ( (rgp->clock_ticks == 0) ||
  823. (rgp->clock_ticks == rgp->rgpinfo.iamalive_ticks) )
  824. {
  825. rgp_broadcast(RGP_UNACK_IAMALIVE);
  826. rgp->clock_ticks++;
  827. }
  828. /* Check for missing IamAlives at IamAlive sending period,
  829. * But flag an error (LATE_POLL) only if "check_ticks" IamAlives missed.
  830. * The checking is offset from the sending by one clock tick.
  831. */
  832. else if ( rgp->clock_ticks >= (rgp->rgpinfo.iamalive_ticks - 1) )
  833. { /* check all nodes for IamAlives received */
  834. for (node = 0; node < (node_t) rgp->num_nodes; node++)
  835. {
  836. if (rgp->node_states[node].status == RGP_NODE_ALIVE)
  837. {
  838. if ( rgp->node_states[node].pollstate == IAMALIVE_RECEIVED )
  839. { /* checked in in time */
  840. #if defined(TDM_DEBUG)
  841. if ( rgp->OS_specific_control.debug.doing_tracing )
  842. {
  843. printf ("Node %d: Node %d is alive. My rgp state=%d\n",
  844. EXT_NODE(rgp->mynode), EXT_NODE(node), rgp->rgppkt.stage );
  845. }
  846. #endif
  847. rgp->node_states[node].pollstate = AWAITING_IAMALIVE;
  848. rgp->node_states[node].lostHBs = 0;
  849. }
  850. else if ( rgp->node_states[node].lostHBs++ < rgp->rgpinfo.check_ticks )
  851. ;// allow upto (check_ticks-1) IamAlives to be lost.
  852. else
  853. {
  854. /* missing IamAlives */
  855. if (node == rgp->mynode) /* missed my own packets */
  856. {
  857. /* We should be lenient if we just had a power failure.
  858. */
  859. if (rgp->pfail_state == 0) /* no recent power failure */
  860. RGP_ERROR( RGP_MISSED_POLL_TO_SELF );
  861. }
  862. else
  863. rgp_event_handler(RGP_EVT_LATEPOLLPACKET, EXT_NODE(node));
  864. }
  865. }
  866. }
  867. /* Reset the regroup tick counter after checking for IamAlives. */
  868. rgp->clock_ticks = 0;
  869. } /* check all nodes for IamAlives received */
  870. else
  871. rgp->clock_ticks++;
  872. /* rgp->pfail_state is set to a non-zero value when a pfail event
  873. * is reported to regroup. It is decremented at every regroup clock
  874. * tick till it reaches zero. While this number is non-zero, missing
  875. * self IamAlives are ignored and do not cause the node to halt.
  876. * This gives the sending hardware some time to recover from power
  877. * failures before self IamAlives are checked.
  878. */
  879. if (rgp->pfail_state)
  880. rgp->pfail_state--;
  881. #endif // NT
  882. RGP_UNLOCK;
  883. } /* rgp_periodic_check */
  884. /************************************************************************
  885. * rgp_received_packet
  886. * ===================
  887. *
  888. * Description:
  889. *
  890. * Routine to be called by the message system when an unacknowledged
  891. * packet sent by the Regroup module is received from any node. These
  892. * packets include IamAlive packets, regroup status packets and poison
  893. * packets.
  894. *
  895. * Parameters:
  896. *
  897. * node_t node - node from which a packet has been received
  898. *
  899. * void *packet - address of the received packet data
  900. *
  901. * int packetlen - length in bytes of the received packet data
  902. *
  903. * Returns:
  904. *
  905. * void - no return value
  906. *
  907. * Algorithm:
  908. *
  909. * Does different things based on the packet subtype.
  910. *
  911. ************************************************************************/
  912. _priv _resident void
  913. RGP_RECEIVED_PACKET(node_t node, void *packet, int packetlen)
  914. {
  915. rgp_unseq_pkt_t *unseq_pkt = (rgp_unseq_pkt_t *) packet;
  916. node = INT_NODE(node);
  917. /* If the packet is from a node that cannot be in our cluster,
  918. * simply ignore it.
  919. */
  920. if (node >= (node_t) rgp->num_nodes)
  921. return;
  922. /* If the sending node is excluded by the outer screen, then it is
  923. * not part of the current (most recently known) configuration.
  924. * Therefore the packet should not be honored, and a poison message
  925. * should be sent to try to kill this renegade processor unless
  926. * it is sending US a poison packet. If it is sending us a poison
  927. * packet, we cannot send it a poison in return because that results
  928. * in an infinite loop. In this case, we just halt because this
  929. * situation implies that there is a split brain situation and our
  930. * split brain avoidance algorithm has failed.
  931. */
  932. /* NT Notes
  933. *
  934. * even with poison pkts being sent and recv'ed in the kernel, we still
  935. * want to make these checks since clusnet doesn't have the regroup stage
  936. * info and regroup packets themselves find there way in here.
  937. */
  938. if (!ClusterMember(rgp->outerscreen, node)
  939. #if defined( NT )
  940. ||
  941. ClusterMember(rgp->OS_specific_control.Banished, node)
  942. #endif
  943. )
  944. {
  945. if (rgp->rgppkt.stage == RGP_COLDLOADED)
  946. {
  947. // We are doing this check in srgpsm.c
  948. // No need to do it here
  949. // RGP_ERROR(RGP_RELOADFAILED);
  950. //
  951. }
  952. else if (unseq_pkt->pktsubtype == RGP_UNACK_POISON)
  953. {
  954. RGP_ERROR((uint16) (RGP_PARIAH + EXT_NODE(node)));
  955. } else {
  956. /* Must send a poison packet to the sender.
  957. */
  958. ClusterInsert(rgp->poison_targets, node);
  959. rgp_broadcast(RGP_UNACK_POISON);
  960. }
  961. return;
  962. }
  963. switch (unseq_pkt->pktsubtype)
  964. {
  965. case RGP_UNACK_IAMALIVE :
  966. {
  967. /* Count the number of IamAlives received */
  968. if ( node == rgp->mynode )
  969. RGP_INCREMENT_COUNTER( RcvdLocalIAmAlive );
  970. else
  971. RGP_INCREMENT_COUNTER( RcvdRemoteIAmAlive );
  972. if (rgp->node_states[node].status == RGP_NODE_ALIVE)
  973. rgp->node_states[node].pollstate = IAMALIVE_RECEIVED;
  974. else if (rgp->node_states[node].status == RGP_NODE_COMING_UP)
  975. {
  976. /* If the node has not yet been marked fully up, it is time to
  977. * do so.
  978. */
  979. rgp_monitor_node(EXT_NODE(node));
  980. /* We must tell the OS that the new node is up in case the
  981. * OS needs the IamAlives to figure that out.
  982. */
  983. rgp_newnode_online(EXT_NODE(node));
  984. }
  985. else
  986. /* If the node state is neither alive nor coming up, it
  987. * must not be in our outerscreen. The outerscreen check
  988. * above must have passed and we should not get here.
  989. */
  990. RGP_ERROR(RGP_INTERNAL_ERROR);
  991. break;
  992. }
  993. case RGP_UNACK_REGROUP :
  994. {
  995. /* Count the number of regroup status packets received. */
  996. RGP_INCREMENT_COUNTER( RcvdRegroup );
  997. /* Any good packet can be treated as an IamAlive packet. */
  998. rgp->node_states[node].pollstate = IAMALIVE_RECEIVED;
  999. RGP_EVENT_HANDLER_EX (RGP_EVT_RECEIVED_PACKET, EXT_NODE(node), (void*)unseq_pkt);
  1000. break;
  1001. }
  1002. case RGP_UNACK_POISON :
  1003. {
  1004. /* If our node is in RGP_PRUNING stage and have been pruned out,
  1005. * the poison packet probably implies that the sender has gone
  1006. * into the next stage and declared us down. In this case, use
  1007. * the more appropriate RGP_PRUNED_OUT halt code. Otherwise,
  1008. * use the poison packet halt code. In either case, we must halt.
  1009. */
  1010. if ( (rgp->rgppkt.stage == RGP_PRUNING) &&
  1011. !ClusterMember(rgp->rgppkt.pruning_result, rgp->mynode) )
  1012. RGP_ERROR(RGP_PRUNED_OUT);
  1013. else
  1014. {
  1015. if (rgp->rgppkt.stage == RGP_COLDLOADED)
  1016. {
  1017. RGP_ERROR(RGP_RELOADFAILED);
  1018. return;
  1019. }
  1020. else
  1021. RGP_ERROR((uint16) (RGP_PARIAH + EXT_NODE(node)));
  1022. }
  1023. break;
  1024. }
  1025. default :
  1026. {
  1027. /* Ignore the unknown packet type. */
  1028. break;
  1029. }
  1030. }
  1031. }
  1032. /*---------------------------------------------------------------------------*/
  1033. #ifdef __cplusplus
  1034. }
  1035. #endif /* __cplusplus */
  1036. #if 0
  1037. History of changes to this file:
  1038. -------------------------------------------------------------------------
  1039. 1995, December 13 F40:KSK0610 /*F40:KSK06102.6*/
  1040. This file is part of the portable Regroup Module used in the NonStop
  1041. Kernel (NSK) and Loosely Coupled UNIX (LCU) operating systems. There
  1042. are 10 files in the module - jrgp.h, jrgpos.h, wrgp.h, wrgpos.h,
  1043. srgpif.c, srgpos.c, srgpsm.c, srgputl.c, srgpcli.c and srgpsvr.c.
  1044. The last two are simulation files to test the Regroup Module on a
  1045. UNIX workstation in user mode with processes simulating processor nodes
  1046. and UDP datagrams used to send unacknowledged datagrams.
  1047. This file was first submitted for release into NSK on 12/13/95.
  1048. ------------------------------------------------------------------------------
  1049. This change occurred on 19 Jan 1996 /*F40:MB06458.1*/
  1050. Changes for phase IV Sierra message system release. Includes: /*F40:MB06458.2*/
  1051. - Some cleanup of the code /*F40:MB06458.3*/
  1052. - Increment KCCB counters to count the number of setup messages and /*F40:MB06458.4*/
  1053. unsequenced messages sent. /*F40:MB06458.5*/
  1054. - Fixed some bugs /*F40:MB06458.6*/
  1055. - Disable interrupts before allocating broadcast sibs. /*F40:MB06458.7*/
  1056. - Change per-packet-timeout to 5ms /*F40:MB06458.8*/
  1057. - Make the regroup and powerfail broadcast use highest priority /*F40:MB06458.9*/
  1058. tnet services queue. /*F40:MB06458.10*/
  1059. - Call the millicode backdoor to get the processor status from SP /*F40:MB06458.11*/
  1060. - Fixed expand bug in msg_listen_ and msg_readctrl_ /*F40:MB06458.12*/
  1061. - Added enhancement to msngr_sendmsg_ so that clients do not need /*F40:MB06458.13*/
  1062. to be unstoppable before calling this routine. /*F40:MB06458.14*/
  1063. - Added new steps in the build file called /*F40:MB06458.15*/
  1064. MSGSYS_C - compiles all the message system C files /*F40:MB06458.16*/
  1065. MSDRIVER - compiles all the MSDriver files /*F40:MB06458.17*/
  1066. REGROUP - compiles all the regroup files /*F40:MB06458.18*/
  1067. - remove #pragma env libspace because we set it as a command line /*F40:MB06458.19*/
  1068. parameter. /*F40:MB06458.20*/
  1069. ----------------------------------------------------------------------- /*F40:MB06458.21*/
  1070. #endif /* 0 - change descriptions */
  1071.