Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

695 lines
16 KiB

  1. /*++
  2. Copyright (c) 1996-1999 Microsoft Corporation
  3. Module Name:
  4. member.c
  5. Abstract:
  6. Cluster membership management routines for the Node Manager.
  7. Author:
  8. Mike Massa (mikemas) 12-Mar-1996
  9. Revision History:
  10. --*/
  11. #include "nmp.h"
  12. #include <clusrtl.h>
  //
  // Data
  //

  //
  // TRUE once NmpMembershipInit has successfully initialized the membership
  // engine. NmpMembershipShutdown calls MMShutdown only when this is TRUE
  // and then resets it to FALSE.
  //
  BOOLEAN NmpMembershipCleanupOk = FALSE;

  //
  // Set of node IDs currently marked up. NmpMarkNodeUp adds to it and
  // NmpMultiNodeDownEventHandler subtracts the downed nodes from it.
  //
  BITSET NmpUpNodeSet = 0;

  //
  // List of entries for threads waiting to be told about a leader change.
  // Statically NULL here; NmpMembershipInit initializes it as an empty list
  // with InitializeListHead. NmpMultiNodeDownEventHandler drains it and
  // signals each entry's event when the leader node goes down.
  //
  LIST_ENTRY NmpLeaderChangeWaitList = {NULL, NULL};
  19. //
  20. // Routines
  21. //
  22. VOID
  23. NmpMarkNodeUp(
  24. CL_NODE_ID NodeId
  25. )
  26. /*++
  27. Notes:
  28. Called with the NmpLock held.
  29. --*/
  30. {
  31. BitsetAdd(NmpUpNodeSet, NodeId);
  32. return;
  33. }
  34. VOID
  35. NmpNodeUpEventHandler(
  36. IN PNM_NODE Node
  37. )
  38. /*++
  39. Notes:
  40. Called with the NmpLock held.
  41. --*/
  42. {
  43. NmpMarkNodeUp(Node->NodeId);
  44. //
  45. // Don't declare the local node to be up. The join code will
  46. // take care of this.
  47. //
  48. if ((Node != NmLocalNode) && (Node->State == ClusterNodeJoining)) {
  49. ClRtlLogPrint(LOG_UNUSUAL,
  50. "[NMJOIN] Joining node %1!u! is now participating in the cluster membership.\n",
  51. Node->NodeId
  52. );
  53. CL_ASSERT(NmpJoinerNodeId == Node->NodeId);
  54. CL_ASSERT(Node->State == ClusterNodeJoining);
  55. CL_ASSERT(NmpJoinTimer == 0);
  56. CL_ASSERT(NmpJoinAbortPending == FALSE);
  57. CL_ASSERT(NmpJoinerUp == FALSE);
  58. NmpJoinerUp = TRUE;
  59. }
  60. return;
  61. } // NmpNodeUpEventHandler
  62. VOID
  63. NmpNodeDownEventHandler(
  64. IN PNM_NODE Node
  65. )
  66. {
  67. NmpMultiNodeDownEventHandler( BitsetFromUnit(Node->NodeId) );
  68. }
  69. DWORD
  70. NmpMultiNodeDownEventHandler(
  71. IN BITSET DownedNodeSet
  72. )
  73. {
  74. CL_NODE_ID i;
  75. PNM_NODE node;
  76. DWORD status;
  77. BOOLEAN iAmNewLeader = FALSE;
  78. PNM_LEADER_CHANGE_WAIT_ENTRY waitEntry;
  79. PLIST_ENTRY listEntry;
  80. ClRtlLogPrint(LOG_NOISE, "[NM] Down node set: %1!04X!.\n", DownedNodeSet);
  81. NmpAcquireLock();
  82. //
  83. // Compute the new up node set
  84. //
  85. BitsetSubtract(NmpUpNodeSet, DownedNodeSet);
  86. ClRtlLogPrint(LOG_NOISE, "[NM] New up node set: %1!04X!.\n", NmpUpNodeSet);
  87. //
  88. // Check for failure of a joining node.
  89. //
  90. if (NmpJoinerNodeId != ClusterInvalidNodeId) {
  91. if (NmpJoinerNodeId == NmLocalNodeId) {
  92. //
  93. // The joining node is the local node. Halt.
  94. //
  95. ClRtlLogPrint(LOG_NOISE,
  96. "[NMJOIN] Aborting join because of change in membership.\n"
  97. );
  98. CsInconsistencyHalt(ERROR_CLUSTER_JOIN_ABORTED);
  99. }
  100. else if ( (BitsetIsMember(NmpJoinerNodeId, DownedNodeSet))
  101. ||
  102. ( (BitsetIsMember(NmpSponsorNodeId, DownedNodeSet)) &&
  103. (!BitsetIsMember(NmpJoinerNodeId, DownedNodeSet))
  104. )
  105. )
  106. {
  107. //
  108. // The joining node is down or the sponsor is down and the joiner
  109. // is not yet an active member. Cleanup the join state. If the
  110. // sponsor is down and the joiner is an active member, we will
  111. // clean up when we detect that the joiner has perished.
  112. //
  113. ClRtlLogPrint(LOG_NOISE,
  114. "[NMJOIN] Aborting join of node %1!u! sponsored by node %2!u!\n",
  115. NmpJoinerNodeId,
  116. NmpSponsorNodeId
  117. );
  118. //
  119. // Reset joiner state if sponsor died
  120. //
  121. if (BitsetIsMember(NmpSponsorNodeId, DownedNodeSet)) {
  122. node = NmpIdArray[NmpJoinerNodeId];
  123. node->State = ClusterNodeDown;
  124. // [GorN 4/3/2000]
  125. // Without a node down, cluadmin won't refresh the state.
  126. // If this code is to be changed to emit CLUSTER_NODE_CHANGE_EVENT or
  127. // some other event, NmpUpdateJoinAbort has to be changed as well,
  128. // so that we will have the same join cleanup behavior
  129. BitsetAdd(DownedNodeSet, NmpJoinerNodeId);
  130. }
  131. NmpJoinerNodeId = ClusterInvalidNodeId;
  132. NmpSponsorNodeId = ClusterInvalidNodeId;
  133. NmpJoinTimer = 0;
  134. NmpJoinAbortPending = FALSE;
  135. NmpJoinSequence = 0;
  136. NmpJoinerUp = FALSE;
  137. NmpJoinerOutOfSynch = FALSE;
  138. }
  139. else {
  140. //
  141. // Mark that the joiner is out of synch with the cluster
  142. // state. The sponsor will eventually abort the join.
  143. //
  144. ClRtlLogPrint(LOG_NOISE,
  145. "[NMJOIN] Joiner node %1!u! is now out of synch with the cluster state.\n",
  146. NmpJoinerNodeId
  147. );
  148. NmpJoinerOutOfSynch = TRUE;
  149. }
  150. }
  151. //
  152. // Check if the leader node went down
  153. //
  154. if (BitsetIsMember(NmpLeaderNodeId, DownedNodeSet)) {
  155. BOOL isEventSet;
  156. //
  157. // Elect a new leader - active node with the smallest ID.
  158. //
  159. for (i = ClusterMinNodeId; i <= NmMaxNodeId; i++) {
  160. if (BitsetIsMember(i, NmpUpNodeSet)) {
  161. NmpLeaderNodeId = i;
  162. break;
  163. }
  164. }
  165. CL_ASSERT(i <= NmMaxNodeId);
  166. if (NmpLeaderNodeId == NmLocalNodeId) {
  167. //
  168. // The local node is the new leader.
  169. //
  170. ClRtlLogPrint(LOG_NOISE,
  171. "[NM] This node is the new leader.\n"
  172. );
  173. iAmNewLeader = TRUE;
  174. }
  175. else {
  176. ClRtlLogPrint(LOG_NOISE,
  177. "[NM] Node %1!u! is the new leader.\n",
  178. NmpLeaderNodeId
  179. );
  180. }
  181. //
  182. // Wake up any threads waiting for an RPC call to the leader to
  183. // complete.
  184. //
  185. while (!IsListEmpty(&NmpLeaderChangeWaitList)) {
  186. listEntry = RemoveHeadList(&NmpLeaderChangeWaitList);
  187. //
  188. // NULL out the entry's links to indicate that it has been
  189. // dequeued. The users of the notification feature depend
  190. // on this action.
  191. //
  192. listEntry->Flink = NULL; listEntry->Blink = NULL;
  193. //
  194. // Wake up the waiting thread.
  195. //
  196. waitEntry = (PNM_LEADER_CHANGE_WAIT_ENTRY) listEntry;
  197. isEventSet = SetEvent(waitEntry->LeaderChangeEvent);
  198. CL_ASSERT(isEventSet != 0);
  199. }
  200. }
  201. //
  202. // First recovery pass - clean up node states and disable communication
  203. //
  204. for (i = ClusterMinNodeId; i <= NmMaxNodeId; i++) {
  205. node = NmpIdArray[i];
  206. if ( (node != NULL) && (BitsetIsMember(i, DownedNodeSet)) ) {
  207. node->State = ClusterNodeDown;
  208. status = ClusnetOfflineNodeComm(
  209. NmClusnetHandle,
  210. node->NodeId
  211. );
  212. CL_ASSERT(
  213. (status == ERROR_SUCCESS) ||
  214. (status == ERROR_CLUSTER_NODE_ALREADY_DOWN)
  215. );
  216. }
  217. }
  218. //
  219. // Inform the rest of the service that these nodes are gone
  220. //
  221. ClusterEventEx(
  222. CLUSTER_EVENT_NODE_DOWN_EX,
  223. EP_CONTEXT_VALID,
  224. ULongToPtr(DownedNodeSet)
  225. );
  226. //
  227. // Second recovery pass - clean up network states and issue old-style
  228. // node down events
  229. //
  230. for (i = ClusterMinNodeId; i <= NmMaxNodeId; i++) {
  231. node = NmpIdArray[i];
  232. if ( (node != NULL) && (BitsetIsMember(i, DownedNodeSet)) ) {
  233. //
  234. // Issue an individual node down event.
  235. //
  236. ClusterEvent(CLUSTER_EVENT_NODE_DOWN, node);
  237. //
  238. // Now do Intracluster RPC cleanup...
  239. //
  240. NmpTerminateRpcsToNode(node->NodeId);
  241. //
  242. // Update the network and interface information.
  243. //
  244. NmpUpdateNetworkConnectivityForDownNode(node);
  245. //
  246. // Log an event
  247. //
  248. if (NmpLeaderNodeId == NmLocalNodeId) {
  249. LPCWSTR nodeName = OmObjectName(node);
  250. CsLogEvent1(
  251. LOG_UNUSUAL,
  252. NM_EVENT_NODE_DOWN,
  253. nodeName
  254. );
  255. }
  256. }
  257. }
  258. //
  259. // If this node is the new leader, schedule a state computation for all
  260. // networks. State reports may have been received before this node
  261. // assumed leadership duties.
  262. //
  263. if (iAmNewLeader) {
  264. NmpRecomputeNT5NetworkAndInterfaceStates();
  265. }
  266. NmpReleaseLock();
  267. return(ERROR_SUCCESS);
  268. } // NmpNodesDownEventHandler //
  269. DWORD
  270. NmpNodeChange(
  271. IN DWORD NodeId,
  272. IN NODESTATUS NewStatus
  273. )
  274. {
  275. PNM_NODE node;
  276. CL_ASSERT(
  277. (NodeId >= ClusterMinNodeId) &&
  278. (NodeId <= NmMaxNodeId)
  279. );
  280. NmpAcquireLock();
  281. node = NmpIdArray[NodeId];
  282. CL_ASSERT(node != NULL);
  283. if (node != NULL) {
  284. if (NewStatus == NODE_DOWN) {
  285. NmpNodeDownEventHandler(node);
  286. }
  287. else {
  288. CL_ASSERT(NewStatus == NODE_UP);
  289. NmpNodeUpEventHandler(node);
  290. }
  291. }
  292. NmpReleaseLock();
  293. return(ERROR_SUCCESS);
  294. } // NmpNodeChange
  295. VOID
  296. NmpHoldIoEventHandler(
  297. VOID
  298. )
  299. {
  300. ClRtlLogPrint(LOG_NOISE,
  301. "[NM] Holding I/O.\n"
  302. );
  303. #if defined(HOLD_IO_IS_SAFE_NOW)
  304. FmHoldIO();
  305. #endif
  306. return;
  307. }
  308. VOID
  309. NmpResumeIoEventHandler(
  310. VOID
  311. )
  312. {
  313. ClRtlLogPrint(LOG_NOISE,
  314. "[NM] Resuming I/O.\n"
  315. );
  316. #if defined(HOLD_IO_IS_SAFE_NOW)
  317. FmResumeIO();
  318. #endif
  319. return;
  320. }
  321. BOOL
  322. NmpCheckQuorumEventHandler(
  323. VOID
  324. )
  325. {
  326. BOOL haveQuorum;
  327. //
  328. // daviddio 06/19/2000
  329. //
  330. // Before asking FM to arbitrate, determine if we have any
  331. // viable network interfaces. If not, return failure to MM
  332. // and allow other cluster nodes to arbitrate. The SCM
  333. // will restart the cluster service, so that if no nodes
  334. // successfully arbitrate, we will get another shot.
  335. //
  336. if (NmpCheckForNetwork()) {
  337. ClRtlLogPrint(LOG_NOISE,
  338. "[NM] Checking if we own the quorum resource.\n"
  339. );
  340. haveQuorum = FmArbitrateQuorumResource();
  341. if (haveQuorum) {
  342. ClRtlLogPrint(LOG_NOISE,
  343. "[NM] We own the quorum resource.\n"
  344. );
  345. }
  346. else {
  347. ClRtlLogPrint(LOG_NOISE,
  348. "[NM] We do not own the quorum resource, status %1!u!.\n",
  349. GetLastError()
  350. );
  351. //[GN] ClusnetHalt( NmClusnetHandle ); => NmpHaltEventHandler
  352. //
  353. }
  354. } else {
  355. ClRtlLogPrint(LOG_CRITICAL,
  356. "[NM] Abdicating quorum because no valid network "
  357. "interfaces were detected.\n"
  358. );
  359. haveQuorum = FALSE;
  360. }
  361. return(haveQuorum);
  362. } // NmpCheckQuorumEventHandler
  363. void
  364. NmpMsgCleanup1(
  365. IN DWORD DeadNodeId
  366. )
  367. {
  368. ClRtlLogPrint(LOG_NOISE,
  369. "[NM] Phase 1 message cleanup - node %1!u!.\n",
  370. DeadNodeId
  371. );
  372. return;
  373. }
  374. void
  375. NmpMsgCleanup2(
  376. IN BITSET DownedNodeSet
  377. )
  378. {
  379. ClRtlLogPrint(LOG_NOISE,
  380. "[NM] Phase 2 message cleanup - node %1!04X!.\n",
  381. DownedNodeSet
  382. );
  383. NmpAcquireLock();
  384. if ( NmpCleanupIfJoinAborted &&
  385. (NmpJoinerNodeId != ClusterInvalidNodeId) &&
  386. BitsetIsMember(NmpJoinerNodeId, DownedNodeSet) )
  387. {
  388. //
  389. // Since the joiner is in the DownedNodeSet mask
  390. // the node down will be delivered on this node by a regroup engine.
  391. // No need for NmpUpdateAbortJoin to issue a node down.
  392. //
  393. NmpCleanupIfJoinAborted = FALSE;
  394. ClRtlLogPrint(LOG_NOISE,
  395. "[NM] NmpCleanupIfJoinAborted is set to false. Joiner - %1!u!.\n",
  396. NmpJoinerNodeId
  397. );
  398. }
  399. NmpReleaseLock();
  400. //
  401. // Inform the rest of the service that these nodes are gone
  402. //
  403. ClusterSyncEventEx(
  404. CLUSTER_EVENT_NODE_DOWN_EX,
  405. EP_CONTEXT_VALID,
  406. ULongToPtr(DownedNodeSet)
  407. );
  408. return;
  409. }
  410. VOID
  411. NmpHaltEventHandler(
  412. IN DWORD HaltCode
  413. )
  414. {
  415. WCHAR string[16];
  416. // Do a graceful stop if we are shutting down //
  417. if (HaltCode == MM_STOP_REQUESTED) {
  418. DWORD Status = ERROR_SUCCESS;
  419. ClRtlLogPrint(LOG_UNUSUAL,
  420. "[NM] Prompt shutdown is requested by a membership engine\n"
  421. );
  422. ClusnetHalt( NmClusnetHandle );
  423. CsLogEvent(LOG_NOISE, SERVICE_SUCCESSFUL_TERMINATION);
  424. CsServiceStatus.dwCurrentState = SERVICE_STOPPED;
  425. CsServiceStatus.dwControlsAccepted = 0;
  426. CsServiceStatus.dwCheckPoint = 0;
  427. CsServiceStatus.dwWaitHint = 0;
  428. CsServiceStatus.dwWin32ExitCode = Status;
  429. CsServiceStatus.dwServiceSpecificExitCode = Status;
  430. CsAnnounceServiceStatus();
  431. ExitProcess(Status);
  432. } else {
  433. wsprintfW(&(string[0]), L"%u", HaltCode);
  434. ClRtlLogPrint(LOG_CRITICAL,
  435. "[NM] Halting this node due to membership or communications error. Halt code = %1!u!\n",
  436. HaltCode
  437. );
  438. ClusnetHalt( NmClusnetHandle );
  439. //
  440. // Adjust membership code to win32 error code. (If mapping exits)
  441. //
  442. HaltCode = MMMapHaltCodeToDosError( HaltCode );
  443. CsInconsistencyHalt(HaltCode);
  444. }
  445. }
  void
  NmpJoinFailed(
      void
      )
  /*++

  Routine Description:

      Join failure callback registered with the membership engine by
      NmpMembershipInit. Takes no action.

  Arguments:

      None.

  Return Value:

      None.

  --*/
  {
      //
      // Nothing to do.
      //

  }  // NmpJoinFailed
  453. DWORD
  454. NmpGumUpdateHandler(
  455. IN DWORD Context,
  456. IN BOOL SourceNode,
  457. IN DWORD BufferLength,
  458. IN PVOID Buffer
  459. )
  460. /*++
  461. Routine Description:
  462. Handles GUM updates for membership events.
  463. Arguments:
  464. Context - Supplies the update context. This is the message type
  465. SourceNode - Supplies whether or not the update originated on this node.
  466. BufferLength - Supplies the length of the update.
  467. Buffer - Supplies a pointer to the buffer.
  468. Return Value:
  469. ERROR_SUCCESS if successful
  470. Win32 error code otherwise
  471. --*/
  472. {
  473. DWORD status;
  474. if (Context == NmUpdateJoinComplete) {
  475. status = NmpUpdateJoinComplete(Buffer);
  476. }
  477. else {
  478. status = ERROR_SUCCESS;
  479. ClRtlLogPrint(LOG_UNUSUAL,
  480. "[NM] Discarding unknown gum request %1!u!\n",
  481. Context
  482. );
  483. }
  484. return(status);
  485. } // NmpUpdateGumHandler
  486. DWORD
  487. NmpMembershipInit(
  488. VOID
  489. )
  490. {
  491. DWORD status;
  492. ClRtlLogPrint(LOG_NOISE,"[NM] Initializing membership...\n");
  493. InitializeListHead(&NmpLeaderChangeWaitList);
  494. //
  495. // Initialize membership engine.
  496. //
  497. status = MMInit(
  498. NmLocalNodeId,
  499. NmMaxNodes,
  500. NmpNodeChange,
  501. NmpCheckQuorumEventHandler,
  502. NmpHoldIoEventHandler,
  503. NmpResumeIoEventHandler,
  504. NmpMsgCleanup1,
  505. NmpMsgCleanup2,
  506. NmpHaltEventHandler,
  507. NmpJoinFailed,
  508. NmpMultiNodeDownEventHandler
  509. );
  510. if (status != MM_OK) {
  511. status = MMMapStatusToDosError(status);
  512. ClRtlLogPrint(LOG_CRITICAL,
  513. "[NM] Membership initialization failed, status %1!u!.\n",
  514. status
  515. );
  516. return(status);
  517. }
  518. NmpMembershipCleanupOk = TRUE;
  519. ClRtlLogPrint(LOG_NOISE,"[NM] Membership initialization complete.\n");
  520. return(ERROR_SUCCESS);
  521. } // NmpMembershipInit
  522. VOID
  523. NmpMembershipShutdown(
  524. VOID
  525. )
  526. {
  527. if (NmpMembershipCleanupOk) {
  528. ClRtlLogPrint(LOG_NOISE,"[NM] Shutting down membership...\n");
  529. MMShutdown();
  530. NmpMembershipCleanupOk = FALSE;
  531. ClRtlLogPrint(LOG_NOISE,"[NM] Membership shutdown complete.\n");
  532. }
  533. return;
  534. } // NmpMembershipShutdown
  535.