Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1229 lines
42 KiB

  1. #ifdef __TANDEM
  2. #pragma columns 79
  3. #pragma page "srgpif.c - T9050 - interface routines for Regroup Module"
  4. #endif
  5. /* @@@ START COPYRIGHT @@@
  6. ** Tandem Confidential: Need to Know only
  7. ** Copyright (c) 1995, Tandem Computers Incorporated
  8. ** Protected as an unpublished work.
  9. ** All Rights Reserved.
  10. **
  11. ** The computer program listings, specifications, and documentation
  12. ** herein are the property of Tandem Computers Incorporated and shall
  13. ** not be reproduced, copied, disclosed, or used in whole or in part
  14. ** for any reason without the prior express written permission of
  15. ** Tandem Computers Incorporated.
  16. **
  17. ** @@@ END COPYRIGHT @@@
  18. **/
  19. /*---------------------------------------------------------------------------
  20. * This file (srgpif.c) contains all the external interface routines
  21. * of Regroup.
  22. *---------------------------------------------------------------------------*/
  23. #ifdef __cplusplus
  24. extern "C" {
  25. #endif /* __cplusplus */
  26. #include <wrgp.h>
  27. /************************************************************************
  28. * rgp_estimate_memory
  29. * ===================
  30. *
  31. * Description:
  32. *
  33. * Routine to find the number of bytes of memory needed by regroup.
  34. * The only global memory used by Regroup is for the rgp_control structure.
  35. * The caller must allocate and zero out a chunk of this much memory
  36. * and then call rgp_init() with a pointer to this memory.
  37. *
  38. * Parameters:
  39. *
  40. * None
  41. *
  42. * Returns:
  43. *
  44. * int - number of bytes of locked down and initialized (to 0) memory
  45. * needed by Regroup. The memory must be 4-byte aligned.
  46. *
  47. * Algorithm:
  48. *
  49. * Uses the size of the rgp_control_t to calculate the number of
  50. * bytes needed.
  51. *
  52. ************************************************************************/
  53. _priv _resident int
  54. RGP_ESTIMATE_MEMORY(void)
  55. {
  56. return(sizeof(rgp_control_t));
  57. }
  58. /************************************************************************
  59. * rgp_init
  60. * ========
  61. *
  62. * Description:
  63. *
  64. * Routine to initialize the global Regroup data structures.
  65. *
  66. * Parameters:
  67. *
  68. * node_t this_node -
  69. * node number of local node; regroup uses bit masks to represent
  70. * nodes in the cluster and starts numbering nodes from 0. The OS
  71. * starts numbering at LOWEST_NODENUM. This transformation is
  72. * maintained in all the regroup interfaces to the OS.
  73. *
  74. * unsigned int num_nodes -
  75. * number of nodes in the configured node number space =
  76. * (largest configured node number - LOWEST_NODENUM + 1).
  77. *
  78. * void *rgp_buffer -
  79. * pointer to a block of locked down memory initialized to 0; this is
  80. * for use by Regroup as its global memory; must be 4-byte aligned
  81. *
  82. * int rgp_buflen -
  83. * length in bytes of the locked down buffer *rgp_buffer; must be equal
  84. * to or greater than the number returned by rgp_estimate_memory()
  85. *
  86. * rgp_msgsys_p rgp_msgsys_p -
  87. * pointer to a common struct used by the message system and
  88. * Regroup to co-ordinate regroup related work
  89. *
  90. * Returns:
  91. *
  92. * void - no return value
  93. *
  94. * Algorithm:
  95. *
  96. * Initializes the Regroup global data structure with default initial
  97. * values and the parameters passed in.
  98. *
  99. ************************************************************************/
  100. _priv _resident void
  101. RGP_INIT(node_t this_node, unsigned int num_nodes,
  102. void *rgp_buffer, int rgp_buflen,
  103. rgp_msgsys_p rgp_msgsys_p)
  104. {
  105. this_node = INT_NODE(this_node); /* adjust the node number by the offset */
  106. if ((num_nodes > MAX_CLUSTER_SIZE) ||
  107. (this_node >= (node_t) num_nodes) ||
  108. (rgp_buflen < rgp_estimate_memory()) /* buffer too small */ ||
  109. ((ULONG_PTR)rgp_buffer % 4) /* buffer not 4-byte aligned */
  110. )
  111. RGP_ERROR(RGP_INTERNAL_ERROR);
  112. #ifdef NSK
  113. /* In NSK, the caller must set up the global rgp pointer. */
  114. #else
  115. rgp = (rgp_control_t *) rgp_buffer;
  116. #endif /* NSK */
  117. rgp->num_nodes = num_nodes; /* # of nodes configured */
  118. rgp->rgp_msgsys_p = rgp_msgsys_p; /* ptr to struct shared with Msgsys */
  119. rgp->mynode = this_node;
  120. #if defined (NT)
  121. /* Initialize RGP_LOCK, the CRITICALSECTION object that will be used
  122. * to synchronize access within the regroup procedures */
  123. InitializeCriticalSection( &rgp->OS_specific_control.RgpCriticalSection );
  124. #endif
  125. RGP_CLEANUP();
  126. /* We place a bit pattern in the IamAlive packet. This bit
  127. * pattern toggles all the bits.
  128. */
  129. rgp->iamalive_pkt.testpattern.words[0] = 0x0055FF6D;
  130. rgp->iamalive_pkt.testpattern.words[1] = 0x92CC33E3;
  131. rgp->iamalive_pkt.testpattern.words[2] = 0x718E49F0;
  132. rgp->iamalive_pkt.testpattern.words[3] = 0x92CC33E3;
  133. rgp->iamalive_pkt.testpattern.words[4] = 0x0055FF6D;
  134. rgp->iamalive_pkt.testpattern.words[5] = 0x0055FF6D;
  135. rgp->iamalive_pkt.testpattern.words[6] = 0x92CC33E3;
  136. rgp->iamalive_pkt.testpattern.words[7] = 0x718E49F0;
  137. rgp->iamalive_pkt.testpattern.words[8] = 0x92CC33E3;
  138. rgp->iamalive_pkt.testpattern.words[9] = 0x0055FF6D;
  139. rgp->iamalive_pkt.testpattern.words[10] = 0x55AA55AA;
  140. rgp->iamalive_pkt.testpattern.words[11] = 0x55AA55AA;
  141. rgp->iamalive_pkt.testpattern.words[12] = 0x55AA55AA;
  142. rgp->poison_pkt.pktsubtype = RGP_UNACK_POISON;
  143. rgp_init_OS(); /* OS-specific initializations */
  144. rgp_cleanup_OS(); /* OS-specific cleanup */
  145. /* Trace the call after the data structures have been initialized. */
  146. RGP_TRACE( "RGP Init called ", EXT_NODE(this_node), num_nodes,
  147. PtrToUlong(rgp_buffer), PtrToUlong(rgp_msgsys_p) ); /* TRACE */
  148. }
  149. /**************************************************************************
  150. * rgp_cleanup
  151. * ===========
  152. * Description:
  153. *
  154. * This function cleans up the RGP structure such that this node is
  155. * virtually returned to the state following RGP_INIT and ready to be
  156. * "join"ed into the cluster.
  157. *
  158. * Parameters:
  159. *
  160. * None
  161. *
  162. * Returns:
  163. *
  164. * None
  165. **************************************************************************/
  166. _priv _resident void
  167. RGP_CLEANUP(void)
  168. {
  169. node_t i;
  170. RGP_LOCK;
  171. /* Initialize the state of all possible nodes in the cluster. */
  172. for (i = 0; i < (node_t) rgp->num_nodes; i++)
  173. {
  174. rgp->node_states[i].status = RGP_NODE_DEAD;
  175. rgp->node_states[i].pollstate = AWAITING_IAMALIVE;
  176. rgp->node_states[i].lostHBs = 0;
  177. #if defined( NT )
  178. ClusnetSetNodeMembershipState(NmClusnetHandle,
  179. EXT_NODE( i ),
  180. ClusnetNodeStateDead);
  181. #endif // NT
  182. }
  183. for (i = (node_t)rgp->num_nodes; i < MAX_CLUSTER_SIZE; i++)
  184. {
  185. rgp->node_states[i].status = RGP_NODE_NOT_CONFIGURED;
  186. rgp->node_states[i].pollstate = AWAITING_IAMALIVE;
  187. rgp->node_states[i].lostHBs = 0;
  188. #if defined( NT )
  189. ClusnetSetNodeMembershipState(NmClusnetHandle,
  190. EXT_NODE( i ),
  191. ClusnetNodeStateNotConfigured);
  192. #endif // NT
  193. }
  194. rgp->rgpinfo.version = RGP_VERSION;
  195. rgp->rgpinfo.seqnum = RGP_INITSEQNUM;
  196. rgp->rgpinfo.iamalive_ticks = RGP_IAMALIVE_TICKS;
  197. rgp->rgpinfo.check_ticks = RGP_CHECK_TICKS;
  198. rgp->rgpinfo.Min_Stage1_ticks = RGP_MIN_STAGE1_TICKS;
  199. rgp->rgpinfo.a_tick = RGP_INACTIVE_PERIOD;
  200. ClusterInit(rgp->rgpinfo.cluster);
  201. rgp->rgppkt.stage = RGP_COLDLOADED;
  202. rgp->rgpcounter = 0;
  203. rgp->restartcount = 0;
  204. rgp->tiebreaker = rgp->mynode;
  205. /* Initialize the unacknowledged packet buffers */
  206. rgp->rgppkt.pktsubtype = RGP_UNACK_REGROUP;
  207. rgp->rgppkt.seqno = rgp->rgpinfo.seqnum;
  208. rgp->last_stable_seqno = rgp->rgpinfo.seqnum;
  209. ClusterCopy(rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster);
  210. ClusterCopy(rgp->outerscreen, rgp->rgpinfo.cluster);
  211. #if defined( NT )
  212. ClusnetSetOuterscreen( NmClusnetHandle, (ULONG)*((PUSHORT)rgp->outerscreen) );
  213. #endif
  214. ClusterCopy(rgp->innerscreen, rgp->rgpinfo.cluster);
  215. ClusterCopy(rgp->rgppkt.knownstage1, rgp->rgpinfo.cluster);
  216. ClusterCopy(rgp->rgppkt.knownstage2, rgp->rgpinfo.cluster);
  217. ClusterCopy(rgp->rgppkt.knownstage3, rgp->rgpinfo.cluster);
  218. ClusterCopy(rgp->rgppkt.knownstage4, rgp->rgpinfo.cluster);
  219. ClusterCopy(rgp->rgppkt.knownstage5, rgp->rgpinfo.cluster);
  220. ClusterCopy(rgp->rgppkt.pruning_result, rgp->rgpinfo.cluster);
  221. MatrixInit(rgp->rgppkt.connectivity_matrix);
  222. rgp->rgppkt_to_send.pktsubtype = RGP_UNACK_REGROUP;
  223. rgp->iamalive_pkt.pktsubtype = RGP_UNACK_IAMALIVE;
  224. RGP_UNLOCK;
  225. }
  226. /***************************************************************************
  227. * rgp_sequence_number
  228. * ===================
  229. * Description:
  230. *
  231. * This function returns the regroup sequence number.
  232. *
  233. * This provides only a subset of the functionality provided by
  234. * rgp_getrgpinfo(), but is a simpler function and has no structure
  235. * parameters, making it easier to call from PTAL.
  236. *
  237. * A regroup incident could be in progress when this routine is
  238. * called.
  239. *
  240. * Parameters:
  241. *
  242. * None
  243. *
  244. * Returns:
  245. *
  246. * uint32 - the current regroup sequence number; this reflects
  247. * how many regroup incidents have happened since
  248. * the system came up. Since one incident can result in
  249. * upto RGP_RESTART_MAX restarts each resulting in the
  250. * sequence # being bumped, this number does not always
  251. * equal the number of regroup incidents.
  252. *
  253. ***************************************************************************/
  254. _priv _resident uint32
  255. RGP_SEQUENCE_NUMBER(void)
  256. {
  257. return(rgp->rgpinfo.seqnum);
  258. }
  259. /************************************************************************
  260. * rgp_getrgpinfo
  261. * ==============
  262. *
  263. * Description:
  264. *
  265. * Routine to get Regroup parameters.
  266. *
  267. * Parameters:
  268. *
  269. * rgpinfo_t *rgpinfo - pointer to struct to be filled with Regroup
  270. * parameters.
  271. *
  272. * Returns:
  273. *
  274. * int - 0 if successful; -1 if Regroup is perturbed.
  275. *
  276. * Algorithm:
  277. *
  278. * Copies the rgpinfo struct from the Regroup global memory into the
  279. * struct passed in by the caller.
  280. *
  281. ************************************************************************/
  282. _priv _resident int
  283. RGP_GETRGPINFO(rgpinfo_t *rgpinfo)
  284. {
  285. int error = 0;
  286. /* If no rgpinfo structure is passed OR rgp_init() has not been called
  287. * earlier, halt.
  288. */
  289. if ((rgpinfo == RGP_NULL_PTR) || (rgp == RGP_NULL_PTR))
  290. RGP_ERROR( RGP_INTERNAL_ERROR );
  291. RGP_LOCK;
  292. if (rgp_is_perturbed())
  293. error = -1;
  294. else
  295. /* Copy the rgpinfo structure from regroup's internal struct. */
  296. *rgpinfo = rgp->rgpinfo;
  297. RGP_UNLOCK;
  298. return(error);
  299. }
  300. /************************************************************************
  301. * rgp_setrgpinfo
  302. * ==============
  303. *
  304. * Description:
  305. *
  306. * Routine to set Regroup parameters. This routine is to be called on
  307. * newly booting nodes to set the Regroup parameters to the values
  308. * in the master or reloading node. The parameters to be updated
  309. * include Regroup timing parameters and the cluster membership;
  310. * that is, the current set of nodes in the system.
  311. *
  312. * This routine can also be called on the first node to boot to
  313. * modify the Regroup timing parameters which are set to the default
  314. * values when rgp_init() is called. Such modification has to be done
  315. * before other nodes are added to the system.
  316. *
  317. * Parameters:
  318. *
  319. * rgpinfo_t *rgpinfo - pointer to struct with Regroup parameters to
  320. * be modified.
  321. *
  322. * Returns:
  323. *
  324. * int - 0 if successful; -1 if there is more than one node in the
  325. * cluster. This is to prevent modification of timing parameters
  326. * after the second node is added to the system.
  327. *
  328. * Algorithm:
  329. *
  330. * Copies the contents of the user-passed struct into the one in the
  331. * Regroup global memory and updates related parameters.
  332. *
  333. ************************************************************************/
  334. _priv _resident int
  335. RGP_SETRGPINFO(rgpinfo_t *rgpinfo)
  336. {
  337. int error = 0;
  338. node_t i;
  339. /* If no rgpinfo structure is passed OR the version # of the
  340. * structure is not understood OR rgp_init() has not been called,
  341. * halt.
  342. */
  343. if ((rgpinfo == RGP_NULL_PTR) ||
  344. (rgpinfo->version != RGP_VERSION) ||
  345. (rgp == RGP_NULL_PTR))
  346. RGP_ERROR( RGP_INTERNAL_ERROR );
  347. RGP_LOCK;
  348. /* The following checks must be made before proceeding:
  349. *
  350. * 1. Regroup must not be perturbed.
  351. *
  352. * 2. If rgp_start() has been called (regroup is in the
  353. * RGP_STABILIZED state), only the local node must be in the
  354. * cluster when this routine is called.
  355. *
  356. * 3. If rgp_start() has been called, this routine can be used
  357. * only to modify the timing parameters and not to specify the
  358. * cluster.
  359. *
  360. * If these restrictions are not followed, return -1.
  361. */
  362. RGP_TRACE( "RGP SetRGPInfo ",
  363. rgpinfo->version, /* TRACE */
  364. rgpinfo->seqnum, /* TRACE */
  365. rgpinfo->iamalive_ticks, /* TRACE */
  366. GetCluster( rgpinfo->cluster ) );/* TRACE */
  367. if ( rgp_is_perturbed() ||
  368. ( (rgp->rgppkt.stage == RGP_STABILIZED) &&
  369. ( (ClusterNumMembers(rgp->rgpinfo.cluster) > 1) ||
  370. !ClusterCompare(rgp->rgpinfo.cluster,rgpinfo->cluster)
  371. )
  372. )
  373. )
  374. error = -1;
  375. else
  376. {
  377. /* Copy the rgpinfo structure into regroup's internal struct. */
  378. rgp->rgpinfo = *rgpinfo;
  379. /* If iamalive_ticks is set to 0, use the default value instead. */ /*F40:KSK06102.2*/
  380. if (rgpinfo->iamalive_ticks == 0) /*F40:KSK06102.3*/
  381. rgp->rgpinfo.iamalive_ticks = RGP_IAMALIVE_TICKS; /*F40:KSK06102.4*/
  382. /*F40:KSK06102.5*/
  383. if (rgpinfo->check_ticks == 0)
  384. {
  385. rgp->rgpinfo.check_ticks = RGP_CHECK_TICKS;
  386. }
  387. if (rgpinfo->Min_Stage1_ticks == 0)
  388. rgp->rgpinfo.Min_Stage1_ticks =
  389. (rgp->rgpinfo.iamalive_ticks * rgp->rgpinfo.check_ticks);
  390. if (rgpinfo->a_tick == 0)
  391. rgp->rgpinfo.a_tick = RGP_CLOCK_PERIOD;
  392. // Tell Timer thread to restart RGP timer
  393. SetEvent (rgp->OS_specific_control.TimerSignal);
  394. /* The cluster should include the local node even if the cluster
  395. * field in the rgpinfo structure does not include it.
  396. */
  397. ClusterInsert(rgp->rgpinfo.cluster, rgp->mynode);
  398. /* Copy the sequence number into the regroup packet area. */
  399. rgp->rgppkt.seqno = rgp->rgpinfo.seqnum;
  400. /* Give the event epoch number and rgp sequence number to clusnet. */
  401. ClusnetRegroupFinished(NmClusnetHandle,
  402. rgp->OS_specific_control.EventEpoch,
  403. rgp->rgpinfo.seqnum);
  404. /* If nodes have been added in the cluster field, they must be
  405. * added to all the screens and their status must be set to
  406. * alive.
  407. */
  408. ClusterCopy(rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster);
  409. ClusterCopy(rgp->outerscreen, rgp->rgpinfo.cluster);
  410. #if defined( NT )
  411. ClusnetSetOuterscreen( NmClusnetHandle, (ULONG)*((PUSHORT)rgp->outerscreen) );
  412. ClusterComplement(rgp->ignorescreen, rgp->outerscreen);
  413. #endif
  414. ClusterCopy(rgp->innerscreen, rgp->rgpinfo.cluster);
  415. ClusterCopy(rgp->rgppkt.knownstage1, rgp->rgpinfo.cluster);
  416. ClusterCopy(rgp->rgppkt.knownstage2, rgp->rgpinfo.cluster);
  417. ClusterCopy(rgp->rgppkt.knownstage3, rgp->rgpinfo.cluster);
  418. ClusterCopy(rgp->rgppkt.knownstage4, rgp->rgpinfo.cluster);
  419. ClusterCopy(rgp->rgppkt.knownstage5, rgp->rgpinfo.cluster);
  420. ClusterCopy(rgp->rgppkt.pruning_result, rgp->rgpinfo.cluster);
  421. rgp->tiebreaker = rgp_select_tiebreaker(rgp->rgpinfo.cluster);
  422. for (i = 0; i < (node_t) rgp->num_nodes; i++)
  423. {
  424. if (ClusterMember(rgp->rgpinfo.cluster, i))
  425. {
  426. rgp->node_states[i].pollstate = IAMALIVE_RECEIVED;
  427. rgp->node_states[i].status = RGP_NODE_ALIVE;
  428. #if defined( NT )
  429. ClusnetSetNodeMembershipState(NmClusnetHandle,
  430. EXT_NODE( i ),
  431. ClusnetNodeStateAlive);
  432. #endif // NT
  433. }
  434. }
  435. /* Reset the clock counter so that IamAlives are sent when
  436. * the next timer tick arrives.
  437. */
  438. rgp->clock_ticks = 0;
  439. }
  440. RGP_UNLOCK;
  441. return(error);
  442. }
  443. /************************************************************************
  444. * rgp_start
  445. * =========
  446. *
  447. * Description:
  448. *
  449. * This routine signals the end of node integration into the cluster.
  450. * The node can now start participating in the Regroup algorithm.
  451. *
  452. * Parameters:
  453. *
  454. * void (*rgp_node_failed)()
  455. * pointer to a routine to be called when a node failure is
  456. * detected.
  457. *
  458. * int (*rgp_select_cluster)()
  459. * pointer to an optional routine to be called when link failures
  460. * cause multiple alternative clusters to be formed. This routine
  461. * should select one from a list of suggested clusters.
  462. *
  463. * Returns:
  464. *
  465. * void - no return value
  466. *
  467. * Algorithm:
  468. *
  469. * Installs the callback routines in the global data structure and
  470. * changes the Regroup state to RGP_STABILIZED.
  471. *
  472. ************************************************************************/
  473. _priv _resident void
  474. RGP_START(void (*nodedown_callback)(cluster_t failed_nodes),
  475. int (*select_cluster)(cluster_t cluster_choices[], int num_clusters)
  476. )
  477. {
  478. if (rgp == RGP_NULL_PTR)
  479. RGP_ERROR( RGP_INTERNAL_ERROR );
  480. RGP_LOCK;
  481. RGP_TRACE( "RGP Start called",
  482. rgp->rgppkt.stage, /* TRACE */
  483. PtrToUlong(nodedown_callback), /* TRACE */
  484. PtrToUlong(select_cluster), /* TRACE */
  485. 0 ); /* TRACE */
  486. /* Install callback routines for node failure notification and cluster
  487. * selection. If no routine is given by the caller, use default ones.
  488. */
  489. if (nodedown_callback == RGP_NULL_PTR)
  490. {
  491. #ifdef NSK
  492. /* In NSK, rgp_start() is called from pTAL code and passing routine
  493. * addresses is cumbersome. So, RGP_NULL_PTR is passed and we
  494. * call the routine rgp_node_failed() which must be supplied by
  495. * the message system.
  496. */
  497. rgp->nodedown_callback = rgp_node_failed; /* hardcoded name */
  498. #else
  499. /* A node down callback routine must be supplied. */
  500. RGP_ERROR( RGP_INTERNAL_ERROR );
  501. #endif /* NSK */
  502. }
  503. else
  504. rgp->nodedown_callback = nodedown_callback;
  505. #if 0
  506. /* The select cluster routine is optional. */
  507. if (select_cluster == RGP_NULL_PTR)
  508. rgp->select_cluster = rgp_select_cluster; /* supplied by regroup */
  509. else
  510. #endif
  511. //
  512. // Calling rgp_select_cluster is
  513. // not a good idea since it doesn't take into the consideration
  514. // quorum owner node.
  515. // If rgp->select_cluster == RGP_NULL_PTR, then srgpsm.c uses
  516. // rgp_select_cluster_ex, that will try to select the group
  517. // that contain the current quorum owner node
  518. rgp->select_cluster = select_cluster;
  519. #if defined(NT)
  520. /* Call the node up callback. This is where the local node gets
  521. * the node up callback for itself coming up. Other nodes call
  522. * the callback, for this node coming up, in rgp_monitor_node.
  523. */
  524. ClusterInsert(rgp->rgpinfo.cluster, rgp->mynode);
  525. ClusterCopy(rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster);
  526. if ( rgp->OS_specific_control.UpDownCallback != RGP_NULL_PTR )
  527. {
  528. (*(rgp->OS_specific_control.UpDownCallback))(
  529. EXT_NODE(rgp->mynode),
  530. NODE_UP
  531. );
  532. }
  533. #endif /* NT */
  534. RGP_UNLOCK;
  535. }
  536. /************************************************************************
  537. * rgp_add_node
  538. * ============
  539. *
  540. * Description:
  541. *
  542. * Called to add a newly booting node to the regroup masks. This prevents
  543. * Regroup from sending poison packets to the new node when it tries to
  544. * contact our node by sending IamAlive messages.
  545. *
  546. * Parameters:
  547. *
  548. * node_t node - node to be added to the recognition masks
  549. *
  550. * Returns:
  551. *
  552. * int - 0 on success and -1 on failure. The routine fails only if a
  553. * regroup incident is in progress.
  554. *
  555. * Algorithm:
  556. *
  557. * The node is added to all the recognition masks and its state is
  558. * changed to RGP_NODE_COMING_UP.
  559. *
  560. ************************************************************************/
  561. _priv _resident int
  562. RGP_ADD_NODE(node_t node)
  563. {
  564. int error = 0;
  565. RGP_LOCK;
  566. RGP_TRACE( "RGP Add node ", node, rgp->rgppkt.stage,
  567. GetCluster(rgp->outerscreen), /* TRACE */
  568. GetCluster(rgp->rgpinfo.cluster) ); /* TRACE */
  569. /* Cannot add a node while regroup is perturbed. Return -1 in that case.
  570. * The new node booting should fail due to the regroup incident anyway.
  571. */
  572. if (rgp_is_perturbed())
  573. error = -1;
  574. else
  575. {
  576. node = INT_NODE(node); /* adjust the node number by the offset */
  577. ClusterInsert(rgp->outerscreen, node);
  578. #if defined( NT )
  579. ClusnetSetOuterscreen( NmClusnetHandle, (ULONG)*((PUSHORT)rgp->outerscreen) );
  580. #endif
  581. ClusterInsert(rgp->innerscreen, node);
  582. ClusterInsert(rgp->rgppkt.knownstage1, node);
  583. ClusterInsert(rgp->rgppkt.knownstage2, node);
  584. ClusterInsert(rgp->rgppkt.knownstage3, node);
  585. ClusterInsert(rgp->rgppkt.knownstage4, node);
  586. ClusterInsert(rgp->rgppkt.knownstage5, node);
  587. ClusterInsert(rgp->rgppkt.pruning_result, node);
  588. rgp->node_states[node].pollstate = AWAITING_IAMALIVE;
  589. rgp->node_states[node].status = RGP_NODE_COMING_UP;
  590. rgp->node_states[node].lostHBs = 0;
  591. #if defined( NT )
  592. ClusterDelete( rgp->OS_specific_control.Banished, node );
  593. //
  594. // Remove joining node from ignore screen
  595. //
  596. ClusterDelete( rgp->ignorescreen, node );
  597. PackIgnoreScreen(&rgp->rgppkt, rgp->ignorescreen);
  598. ClusnetSetNodeMembershipState(NmClusnetHandle,
  599. EXT_NODE( node ),
  600. ClusnetNodeStateJoining);
  601. #endif // NT
  602. }
  603. RGP_UNLOCK;
  604. return(error);
  605. }
  606. /************************************************************************
  607. * rgp_monitor_node
  608. * ================
  609. *
  610. * Description:
  611. *
  612. * Called by all running nodes to change the status of a newly booted node
  613. * to UP. Can be called by the new node also; it is a no-op in this case.
  614. *
  615. * Parameters:
  616. *
  617. * node_t node - number of node being declared up
  618. *
  619. * Returns:
  620. *
  621. * int - 0 on success and -1 on failure. The routine fails only if the
  622. * state of the node is neither RGP_NODE_COMING_UP nor RGP_NODE_ALIVE.
  623. *
  624. * Algorithm:
  625. *
  626. * If the node is marked coming up, its state is changed to
  627. * RGP_NODE_ALIVE. If the node has already been marked up,
  628. * nothing is done.
  629. *
  630. ************************************************************************/
  631. _priv _resident int
  632. RGP_MONITOR_NODE(node_t node)
  633. {
  634. int error = 0;
  635. RGP_LOCK;
  636. RGP_TRACE( "RGP Monitor node", node, rgp->rgppkt.stage,
  637. GetCluster(rgp->outerscreen), /* TRACE */
  638. GetCluster(rgp->rgpinfo.cluster) ); /* TRACE */
  639. node = INT_NODE(node); /* adjust the node number by the offset */
  640. /* Accept the request only if the state of the node is COMING_UP or UP. */
  641. if (rgp->node_states[node].status == RGP_NODE_COMING_UP)
  642. {
  643. ClusterInsert(rgp->rgpinfo.cluster, node);
  644. rgp->tiebreaker = rgp_select_tiebreaker(rgp->rgpinfo.cluster);
  645. rgp->node_states[node].pollstate = IAMALIVE_RECEIVED;
  646. rgp->node_states[node].status = RGP_NODE_ALIVE;
  647. #if defined(NT)
  648. ClusterCopy(rgp->OS_specific_control.CPUUPMASK, rgp->rgpinfo.cluster);
  649. ClusnetSetNodeMembershipState(NmClusnetHandle,
  650. EXT_NODE( node ),
  651. ClusnetNodeStateAlive);
  652. /* A node came up. Call the node up callback. */
  653. if ( rgp->OS_specific_control.UpDownCallback != RGP_NULL_PTR )
  654. {
  655. (*(rgp->OS_specific_control.UpDownCallback))(
  656. EXT_NODE(node),
  657. NODE_UP
  658. );
  659. }
  660. #endif /* NT */
  661. }
  662. else if (rgp->node_states[node].status != RGP_NODE_ALIVE)
  663. /* Perhaps the booting node failed and regroup has already marked
  664. * it down. The cluster manager may have invoked a global update
  665. * resulting in this call before regroup reporetd the failure
  666. * of the node.
  667. */
  668. error = -1;
  669. RGP_UNLOCK;
  670. return(error);
  671. }
  672. /************************************************************************
  673. * rgp_remove_node
  674. * ===============
  675. *
  676. * Description:
  677. *
  678. * Called by the cluster manager to force out a booting node if booting
  679. * fails. Regroup may or may not have already removed the booting node
  680. * from the masks and declared it down, depending on what stage the
  681. * booting is in and when the booting node failed.
  682. *
  683. * Regroup can remove the node from the masks of all nodes in the cluster
  684. * by simply starting a new incident of regroup with any event code. This
  685. * will force all nodes to come to an agreement on cluster membership
  686. * that excludes the booting node. If the booting node is alive, it will
  687. * commit suicide since it will be in the incompetent (RGP_COLDLOADED)
  688. * state.
  689. *
  690. * Removing the new node from our masks is not necessary since regroup
  691. * will detect the node failure and adjust the masks. If we do remove it
  692. * from our masks BEFORE initiating regroup, regroup may complete quicker
  693. * since we will not wait in stage 1 for the node to check in. Also, this
  694. * could allow a node to be removed even after it is fully integrated.
  695. * This is because our node will send a poison packet to the removed node
  696. * if it tries to contact us.
  697. *
  698. * But this "enhancement" is not implemented because it requires a new
  699. * regroup event code which is examined by all nodes and processed
  700. * specially. Currently, the regroup event code is used only for
  701. * debugging info. Also, there is no guarantee that all nodes see the
  702. * same regroup reason code. For instance, some may see a missing
  703. * IamAlive while others may see a power failure.
  704. *
  705. * Parameters:
  706. *
  707. * node_t node - node to be removed from the recognition masks
  708. * (in external format).
  709. *
  710. * Returns:
  711. *
  712. * int - 0 on success and -1 on failure. The routine fails if a
  713. * regroup incident is in progress or rgp_start() has not been
  714. * called (as in a new node where the booting is not complete).
  715. *
  716. * Algorithm:
  717. *
  718. * If the node is still in the recognition masks, a new regroup incident
  719. * is started. This incident will result in all nodes declaring the node
  720. * dead and removing it from the recognition masks.
  721. *
  722. ************************************************************************/
  723. _priv _resident int
  724. RGP_REMOVE_NODE(node_t node)
  725. {
  726. int error = 0;
  727. RGP_LOCK;
  728. RGP_TRACE( "RGP Remove node ", node, rgp->rgppkt.stage,
  729. GetCluster(rgp->outerscreen), /* TRACE */
  730. GetCluster(rgp->rgpinfo.cluster) ); /* TRACE */
  731. if (rgp->rgppkt.stage == RGP_STABILIZED)
  732. {
  733. if (ClusterMember(rgp->outerscreen, INT_NODE(node)))
  734. {
  735. /* Node is currently in our screen. The node may have never come up
  736. * after rgp_add_node() was called OR regroup may not have figured
  737. * out yet that the node is down. In either case, the node must
  738. * be forced out and all nodes in the cluster notified (by a regroup
  739. * incident). If the node is still running, it will commit suicide
  740. * when this regroup incident starts.
  741. */
  742. rgp_event_handler(RGP_EVT_LATEPOLLPACKET, node);
  743. }
  744. else
  745. {
  746. /* Either the node was not added to the cluster OR regroup has
  747. * already figured out that the node is dead and reported this.
  748. * In either case, there is nothing more to do.
  749. */
  750. }
  751. }
  752. else
  753. error = -1;
  754. RGP_UNLOCK;
  755. return(error);
  756. }
  757. /************************************************************************
  758. * rgp_is_perturbed
  759. * ================
  760. *
  761. * Description:
  762. *
  763. * Function to check if a regroup incident is in progress.
  764. *
  765. * Parameters:
  766. *
  767. * None.
  768. *
  769. * Returns:
  770. *
  771. * int - 0 if no regroup is quiescent; non-zero if a regroup incident
  772. * is in progress.
  773. *
  774. * Algorithm:
  775. *
  776. * Looks at the current state of the Regroup algorithm.
  777. *
  778. ************************************************************************/
  779. _priv _resident int
  780. RGP_IS_PERTURBED(void)
  781. {
  782. uint8 stage = rgp->rgppkt.stage;
  783. return((stage != RGP_STABILIZED) && (stage != RGP_COLDLOADED));
  784. }
  785. /************************************************************************
  786. * rgp_periodic_check
  787. * ==================
  788. *
  789. * Description:
  790. *
  791. * This routine is invoked every RGP_CLOCK_PERIOD by the timer interrupt
  792. * handler of the native OS. It performs Regroup's periodic operations.
  793. *
  794. * Parameters:
  795. *
  796. * None
  797. *
  798. * Returns:
  799. *
  800. * void - no return value
  801. *
  802. * Algorithm:
  803. *
  804. * This routine requests Iamalive packets to be sent, checks if
  805. * IamAlives have been received (and calls rgp_event_handler() if
  806. * not) and sends a clock tick to the regroup algorithm if it is in
  807. * progress.
  808. *
  809. * IamAlives are checked at twice the IamAlive period. The regroup
  810. * global variable clock_ticks is incremented in each call. After
  811. * the IamAlives are checked, clock_ticks is reset to 0. Thus, the
  812. * ticker counts time modulo twice the IamAlive ticks.
  813. *
  814. ************************************************************************/
  815. _priv _resident void
  816. RGP_PERIODIC_CHECK(void)
  817. {
  818. node_t node;
  819. RGP_LOCK;
  820. /* If regroup is active, give it a shot at each regroup clock tick. */
  821. if ((rgp->rgppkt.stage != RGP_STABILIZED) &&
  822. (rgp->rgppkt.stage != RGP_COLDLOADED))
  823. rgp_event_handler(RGP_EVT_CLOCK_TICK, RGP_NULL_NODE);
  824. #if !defined( NT )
  825. /* Send IamAlive messages at appropriate intervals. */
  826. if ( (rgp->clock_ticks == 0) ||
  827. (rgp->clock_ticks == rgp->rgpinfo.iamalive_ticks) )
  828. {
  829. rgp_broadcast(RGP_UNACK_IAMALIVE);
  830. rgp->clock_ticks++;
  831. }
  832. /* Check for missing IamAlives at IamAlive sending period,
  833. * But flag an error (LATE_POLL) only if "check_ticks" IamAlives missed.
  834. * The checking is offset from the sending by one clock tick.
  835. */
  836. else if ( rgp->clock_ticks >= (rgp->rgpinfo.iamalive_ticks - 1) )
  837. { /* check all nodes for IamAlives received */
  838. for (node = 0; node < (node_t) rgp->num_nodes; node++)
  839. {
  840. if (rgp->node_states[node].status == RGP_NODE_ALIVE)
  841. {
  842. if ( rgp->node_states[node].pollstate == IAMALIVE_RECEIVED )
  843. { /* checked in in time */
  844. #if defined(TDM_DEBUG)
  845. if ( rgp->OS_specific_control.debug.doing_tracing )
  846. {
  847. printf ("Node %d: Node %d is alive. My rgp state=%d\n",
  848. EXT_NODE(rgp->mynode), EXT_NODE(node), rgp->rgppkt.stage );
  849. }
  850. #endif
  851. rgp->node_states[node].pollstate = AWAITING_IAMALIVE;
  852. rgp->node_states[node].lostHBs = 0;
  853. }
  854. else if ( rgp->node_states[node].lostHBs++ < rgp->rgpinfo.check_ticks )
  855. ;// allow upto (check_ticks-1) IamAlives to be lost.
  856. else
  857. {
  858. /* missing IamAlives */
  859. if (node == rgp->mynode) /* missed my own packets */
  860. {
  861. /* We should be lenient if we just had a power failure.
  862. */
  863. if (rgp->pfail_state == 0) /* no recent power failure */
  864. RGP_ERROR( RGP_MISSED_POLL_TO_SELF );
  865. }
  866. else
  867. rgp_event_handler(RGP_EVT_LATEPOLLPACKET, EXT_NODE(node));
  868. }
  869. }
  870. }
  871. /* Reset the regroup tick counter after checking for IamAlives. */
  872. rgp->clock_ticks = 0;
  873. } /* check all nodes for IamAlives received */
  874. else
  875. rgp->clock_ticks++;
  876. /* rgp->pfail_state is set to a non-zero value when a pfail event
  877. * is reported to regroup. It is decremented at every regroup clock
  878. * tick till it reaches zero. While this number is non-zero, missing
  879. * self IamAlives are ignored and do not cause the node to halt.
  880. * This gives the sending hardware some time to recover from power
  881. * failures before self IamAlives are checked.
  882. */
  883. if (rgp->pfail_state)
  884. rgp->pfail_state--;
  885. #endif // NT
  886. RGP_UNLOCK;
  887. } /* rgp_periodic_check */
  888. /************************************************************************
  889. * rgp_received_packet
  890. * ===================
  891. *
  892. * Description:
  893. *
  894. * Routine to be called by the message system when an unacknowledged
  895. * packet sent by the Regroup module is received from any node. These
  896. * packets include IamAlive packets, regroup status packets and poison
  897. * packets.
  898. *
  899. * Parameters:
  900. *
  901. * node_t node - node from which a packet has been received
  902. *
  903. * void *packet - address of the received packet data
  904. *
  905. * int packetlen - length in bytes of the received packet data
  906. *
  907. * Returns:
  908. *
  909. * void - no return value
  910. *
  911. * Algorithm:
  912. *
  913. * Does different things based on the packet subtype.
  914. *
  915. ************************************************************************/
  916. _priv _resident void
  917. RGP_RECEIVED_PACKET(node_t node, void *packet, int packetlen)
  918. {
  919. rgp_unseq_pkt_t *unseq_pkt = (rgp_unseq_pkt_t *) packet;
  920. node = INT_NODE(node);
  921. /* If the packet is from a node that cannot be in our cluster,
  922. * simply ignore it.
  923. */
  924. if (node >= (node_t) rgp->num_nodes)
  925. return;
  926. /* If the sending node is excluded by the outer screen, then it is
  927. * not part of the current (most recently known) configuration.
  928. * Therefore the packet should not be honored, and a poison message
  929. * should be sent to try to kill this renegade processor unless
  930. * it is sending US a poison packet. If it is sending us a poison
  931. * packet, we cannot send it a poison in return because that results
  932. * in an infinite loop. In this case, we just halt because this
  933. * situation implies that there is a split brain situation and our
  934. * split brain avoidance algorithm has failed.
  935. */
  936. /* NT Notes
  937. *
  938. * even with poison pkts being sent and recv'ed in the kernel, we still
  939. * want to make these checks since clusnet doesn't have the regroup stage
  940. * info and regroup packets themselves find there way in here.
  941. */
  942. if (!ClusterMember(rgp->outerscreen, node)
  943. #if defined( NT )
  944. ||
  945. ClusterMember(rgp->OS_specific_control.Banished, node)
  946. #endif
  947. )
  948. {
  949. if (rgp->rgppkt.stage == RGP_COLDLOADED)
  950. {
  951. // We are doing this check in srgpsm.c
  952. // No need to do it here
  953. // RGP_ERROR(RGP_RELOADFAILED);
  954. //
  955. }
  956. else if (unseq_pkt->pktsubtype == RGP_UNACK_POISON)
  957. {
  958. RGP_ERROR((uint16) (RGP_PARIAH + EXT_NODE(node)));
  959. } else {
  960. /* Must send a poison packet to the sender.
  961. */
  962. ClusterInsert(rgp->poison_targets, node);
  963. rgp_broadcast(RGP_UNACK_POISON);
  964. }
  965. return;
  966. }
  967. switch (unseq_pkt->pktsubtype)
  968. {
  969. case RGP_UNACK_IAMALIVE :
  970. {
  971. /* Count the number of IamAlives received */
  972. if ( node == rgp->mynode )
  973. RGP_INCREMENT_COUNTER( RcvdLocalIAmAlive );
  974. else
  975. RGP_INCREMENT_COUNTER( RcvdRemoteIAmAlive );
  976. if (rgp->node_states[node].status == RGP_NODE_ALIVE)
  977. rgp->node_states[node].pollstate = IAMALIVE_RECEIVED;
  978. else if (rgp->node_states[node].status == RGP_NODE_COMING_UP)
  979. {
  980. /* If the node has not yet been marked fully up, it is time to
  981. * do so.
  982. */
  983. rgp_monitor_node(EXT_NODE(node));
  984. /* We must tell the OS that the new node is up in case the
  985. * OS needs the IamAlives to figure that out.
  986. */
  987. rgp_newnode_online(EXT_NODE(node));
  988. }
  989. else
  990. /* If the node state is neither alive nor coming up, it
  991. * must not be in our outerscreen. The outerscreen check
  992. * above must have passed and we should not get here.
  993. */
  994. RGP_ERROR(RGP_INTERNAL_ERROR);
  995. break;
  996. }
  997. case RGP_UNACK_REGROUP :
  998. {
  999. /* Count the number of regroup status packets received. */
  1000. RGP_INCREMENT_COUNTER( RcvdRegroup );
  1001. /* Any good packet can be treated as an IamAlive packet. */
  1002. rgp->node_states[node].pollstate = IAMALIVE_RECEIVED;
  1003. RGP_EVENT_HANDLER_EX (RGP_EVT_RECEIVED_PACKET, EXT_NODE(node), (void*)unseq_pkt);
  1004. break;
  1005. }
  1006. case RGP_UNACK_POISON :
  1007. {
  1008. /* If our node is in RGP_PRUNING stage and have been pruned out,
  1009. * the poison packet probably implies that the sender has gone
  1010. * into the next stage and declared us down. In this case, use
  1011. * the more appropriate RGP_PRUNED_OUT halt code. Otherwise,
  1012. * use the poison packet halt code. In either case, we must halt.
  1013. */
  1014. if ( (rgp->rgppkt.stage == RGP_PRUNING) &&
  1015. !ClusterMember(rgp->rgppkt.pruning_result, rgp->mynode) )
  1016. RGP_ERROR(RGP_PRUNED_OUT);
  1017. else
  1018. {
  1019. if (rgp->rgppkt.stage == RGP_COLDLOADED)
  1020. {
  1021. RGP_ERROR(RGP_RELOADFAILED);
  1022. return;
  1023. }
  1024. else
  1025. RGP_ERROR((uint16) (RGP_PARIAH + EXT_NODE(node)));
  1026. }
  1027. break;
  1028. }
  1029. default :
  1030. {
  1031. /* Ignore the unknown packet type. */
  1032. break;
  1033. }
  1034. }
  1035. }
  1036. /*---------------------------------------------------------------------------*/
  1037. #ifdef __cplusplus
  1038. }
  1039. #endif /* __cplusplus */
  1040. #if 0
  1041. History of changes to this file:
  1042. -------------------------------------------------------------------------
  1043. 1995, December 13 F40:KSK0610 /*F40:KSK06102.6*/
  1044. This file is part of the portable Regroup Module used in the NonStop
  1045. Kernel (NSK) and Loosely Coupled UNIX (LCU) operating systems. There
  1046. are 10 files in the module - jrgp.h, jrgpos.h, wrgp.h, wrgpos.h,
  1047. srgpif.c, srgpos.c, srgpsm.c, srgputl.c, srgpcli.c and srgpsvr.c.
  1048. The last two are simulation files to test the Regroup Module on a
  1049. UNIX workstation in user mode with processes simulating processor nodes
  1050. and UDP datagrams used to send unacknowledged datagrams.
  1051. This file was first submitted for release into NSK on 12/13/95.
  1052. ------------------------------------------------------------------------------
  1053. This change occurred on 19 Jan 1996 /*F40:MB06458.1*/
  1054. Changes for phase IV Sierra message system release. Includes: /*F40:MB06458.2*/
  1055. - Some cleanup of the code /*F40:MB06458.3*/
  1056. - Increment KCCB counters to count the number of setup messages and /*F40:MB06458.4*/
  1057. unsequenced messages sent. /*F40:MB06458.5*/
  1058. - Fixed some bugs /*F40:MB06458.6*/
  1059. - Disable interrupts before allocating broadcast sibs. /*F40:MB06458.7*/
  1060. - Change per-packet-timeout to 5ms /*F40:MB06458.8*/
  1061. - Make the regroup and powerfail broadcast use highest priority /*F40:MB06458.9*/
  1062. tnet services queue. /*F40:MB06458.10*/
  1063. - Call the millicode backdoor to get the processor status from SP /*F40:MB06458.11*/
  1064. - Fixed expand bug in msg_listen_ and msg_readctrl_ /*F40:MB06458.12*/
  1065. - Added enhancement to msngr_sendmsg_ so that clients do not need /*F40:MB06458.13*/
  1066. to be unstoppable before calling this routine. /*F40:MB06458.14*/
  1067. - Added new steps in the build file called /*F40:MB06458.15*/
  1068. MSGSYS_C - compiles all the message system C files /*F40:MB06458.16*/
  1069. MSDRIVER - compiles all the MSDriver files /*F40:MB06458.17*/
  1070. REGROUP - compiles all the regroup files /*F40:MB06458.18*/
  1071. - remove #pragma env libspace because we set it as a command line /*F40:MB06458.19*/
  1072. parameter. /*F40:MB06458.20*/
  1073. ----------------------------------------------------------------------- /*F40:MB06458.21*/
  1074. #endif /* 0 - change descriptions */