Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

715 lines
17 KiB

  1. /*++
  2. Copyright (c) 1996-1999 Microsoft Corporation
  3. Module Name:
  4. member.c
  5. Abstract:
  6. Cluster membership management routines for the Node Manager.
  7. Author:
  8. Mike Massa (mikemas) 12-Mar-1996
  9. Revision History:
  10. --*/
  11. #include "nmp.h"
  12. #include <clusrtl.h>
  13. //
  14. // Data
  15. //
  16. BOOLEAN NmpMembershipCleanupOk = FALSE;
  17. BITSET NmpUpNodeSet = 0;
  18. LIST_ENTRY NmpLeaderChangeWaitList = {NULL, NULL};
  19. //
  20. // Routines
  21. //
  22. VOID
  23. NmpMarkNodeUp(
  24. CL_NODE_ID NodeId
  25. )
  26. /*++
  27. Notes:
  28. Called with the NmpLock held.
  29. --*/
  30. {
  31. BitsetAdd(NmpUpNodeSet, NodeId);
  32. return;
  33. }
  34. VOID
  35. NmpNodeUpEventHandler(
  36. IN PNM_NODE Node
  37. )
  38. /*++
  39. Notes:
  40. Called with the NmpLock held.
  41. --*/
  42. {
  43. NmpMarkNodeUp(Node->NodeId);
  44. // MM has Declared the node to be up. Reset The node down event.
  45. if (!ResetEvent(Node->MmNodeStateDownEvent)) {
  46. DWORD status = GetLastError();
  47. ClRtlLogPrint(LOG_CRITICAL,
  48. "[NMJOIN] Failed to reset node down event for Node= %1!u! status= %2!u!.\n",
  49. Node->NodeId,
  50. status
  51. );
  52. CsInconsistencyHalt(status);
  53. }
  54. //
  55. // Don't declare the local node to be up. The join code will
  56. // take care of this.
  57. //
  58. if ((Node != NmLocalNode) && (Node->State == ClusterNodeJoining)) {
  59. ClRtlLogPrint(LOG_UNUSUAL,
  60. "[NMJOIN] Joining node %1!u! is now participating in the cluster membership.\n",
  61. Node->NodeId
  62. );
  63. CL_ASSERT(NmpJoinerNodeId == Node->NodeId);
  64. CL_ASSERT(Node->State == ClusterNodeJoining);
  65. CL_ASSERT(NmpJoinTimer == 0);
  66. CL_ASSERT(NmpJoinAbortPending == FALSE);
  67. CL_ASSERT(NmpJoinerUp == FALSE);
  68. NmpJoinerUp = TRUE;
  69. }
  70. return;
  71. } // NmpNodeUpEventHandler
  72. VOID
  73. NmpNodeDownEventHandler(
  74. IN PNM_NODE Node
  75. )
  76. {
  77. NmpMultiNodeDownEventHandler( BitsetFromUnit(Node->NodeId) );
  78. }
  79. DWORD
  80. NmpMultiNodeDownEventHandler(
  81. IN BITSET DownedNodeSet
  82. )
  83. {
  84. CL_NODE_ID i;
  85. PNM_NODE node;
  86. DWORD status;
  87. BOOLEAN iAmNewLeader = FALSE;
  88. PNM_LEADER_CHANGE_WAIT_ENTRY waitEntry;
  89. PLIST_ENTRY listEntry;
  90. ClRtlLogPrint(LOG_NOISE, "[NM] Down node set: %1!04X!.\n", DownedNodeSet);
  91. NmpAcquireLock();
  92. //
  93. // Compute the new up node set
  94. //
  95. BitsetSubtract(NmpUpNodeSet, DownedNodeSet);
  96. ClRtlLogPrint(LOG_NOISE, "[NM] New up node set: %1!04X!.\n", NmpUpNodeSet);
  97. //
  98. // Check for failure of a joining node.
  99. //
  100. if (NmpJoinerNodeId != ClusterInvalidNodeId) {
  101. if (NmpJoinerNodeId == NmLocalNodeId) {
  102. //
  103. // The joining node is the local node. Halt.
  104. //
  105. ClRtlLogPrint(LOG_NOISE,
  106. "[NMJOIN] Aborting join because of change in membership.\n"
  107. );
  108. CsInconsistencyHalt(ERROR_CLUSTER_JOIN_ABORTED);
  109. }
  110. else if ( (BitsetIsMember(NmpJoinerNodeId, DownedNodeSet))
  111. ||
  112. ( (BitsetIsMember(NmpSponsorNodeId, DownedNodeSet)) &&
  113. (!BitsetIsMember(NmpJoinerNodeId, DownedNodeSet))
  114. )
  115. )
  116. {
  117. //
  118. // The joining node is down or the sponsor is down and the joiner
  119. // is not yet an active member. Cleanup the join state. If the
  120. // sponsor is down and the joiner is an active member, we will
  121. // clean up when we detect that the joiner has perished.
  122. //
  123. ClRtlLogPrint(LOG_NOISE,
  124. "[NMJOIN] Aborting join of node %1!u! sponsored by node %2!u!\n",
  125. NmpJoinerNodeId,
  126. NmpSponsorNodeId
  127. );
  128. //
  129. // Reset joiner state if sponsor died
  130. //
  131. if (BitsetIsMember(NmpSponsorNodeId, DownedNodeSet)) {
  132. node = NmpIdArray[NmpJoinerNodeId];
  133. node->State = ClusterNodeDown;
  134. // [GorN 4/3/2000]
  135. // Without a node down, cluadmin won't refresh the state.
  136. // If this code is to be changed to emit CLUSTER_NODE_CHANGE_EVENT or
  137. // some other event, NmpUpdateJoinAbort has to be changed as well,
  138. // so that we will have the same join cleanup behavior
  139. BitsetAdd(DownedNodeSet, NmpJoinerNodeId);
  140. }
  141. NmpJoinerNodeId = ClusterInvalidNodeId;
  142. NmpSponsorNodeId = ClusterInvalidNodeId;
  143. NmpJoinTimer = 0;
  144. NmpJoinAbortPending = FALSE;
  145. NmpJoinSequence = 0;
  146. NmpJoinerUp = FALSE;
  147. NmpJoinerOutOfSynch = FALSE;
  148. }
  149. else {
  150. //
  151. // Mark that the joiner is out of synch with the cluster
  152. // state. The sponsor will eventually abort the join.
  153. //
  154. ClRtlLogPrint(LOG_NOISE,
  155. "[NMJOIN] Joiner node %1!u! is now out of synch with the cluster state.\n",
  156. NmpJoinerNodeId
  157. );
  158. NmpJoinerOutOfSynch = TRUE;
  159. }
  160. }
  161. //
  162. // Check if the leader node went down
  163. //
  164. if (BitsetIsMember(NmpLeaderNodeId, DownedNodeSet)) {
  165. BOOL isEventSet;
  166. //
  167. // Elect a new leader - active node with the smallest ID.
  168. //
  169. for (i = ClusterMinNodeId; i <= NmMaxNodeId; i++) {
  170. if (BitsetIsMember(i, NmpUpNodeSet)) {
  171. NmpLeaderNodeId = i;
  172. break;
  173. }
  174. }
  175. CL_ASSERT(i <= NmMaxNodeId);
  176. if (NmpLeaderNodeId == NmLocalNodeId) {
  177. //
  178. // The local node is the new leader.
  179. //
  180. ClRtlLogPrint(LOG_NOISE,
  181. "[NM] This node is the new leader.\n"
  182. );
  183. iAmNewLeader = TRUE;
  184. }
  185. else {
  186. ClRtlLogPrint(LOG_NOISE,
  187. "[NM] Node %1!u! is the new leader.\n",
  188. NmpLeaderNodeId
  189. );
  190. }
  191. //
  192. // Wake up any threads waiting for an RPC call to the leader to
  193. // complete.
  194. //
  195. while (!IsListEmpty(&NmpLeaderChangeWaitList)) {
  196. listEntry = RemoveHeadList(&NmpLeaderChangeWaitList);
  197. //
  198. // NULL out the entry's links to indicate that it has been
  199. // dequeued. The users of the notification feature depend
  200. // on this action.
  201. //
  202. listEntry->Flink = NULL; listEntry->Blink = NULL;
  203. //
  204. // Wake up the waiting thread.
  205. //
  206. waitEntry = (PNM_LEADER_CHANGE_WAIT_ENTRY) listEntry;
  207. isEventSet = SetEvent(waitEntry->LeaderChangeEvent);
  208. CL_ASSERT(isEventSet != 0);
  209. }
  210. }
  211. //
  212. // First recovery pass - clean up node states and disable communication
  213. //
  214. for (i = ClusterMinNodeId; i <= NmMaxNodeId; i++) {
  215. node = NmpIdArray[i];
  216. if ( (node != NULL) && (BitsetIsMember(i, DownedNodeSet)) ) {
  217. node->State = ClusterNodeDown;
  218. //MM has declared the node to be down. Set the node down event.
  219. if (!SetEvent(node->MmNodeStateDownEvent)) {
  220. status = GetLastError();
  221. ClRtlLogPrint(LOG_CRITICAL,
  222. "[NMJOIN] Failed to set node down event for Node= %1!u! status= %2!u!.\n",
  223. node->NodeId,
  224. status
  225. );
  226. CsInconsistencyHalt(status);
  227. }
  228. status = ClusnetOfflineNodeComm(
  229. NmClusnetHandle,
  230. node->NodeId
  231. );
  232. CL_ASSERT(
  233. (status == ERROR_SUCCESS) ||
  234. (status == ERROR_CLUSTER_NODE_ALREADY_DOWN)
  235. );
  236. }
  237. }
  238. //
  239. // Inform the rest of the service that these nodes are gone
  240. //
  241. ClusterEventEx(
  242. CLUSTER_EVENT_NODE_DOWN_EX,
  243. EP_CONTEXT_VALID,
  244. ULongToPtr(DownedNodeSet)
  245. );
  246. //
  247. // Second recovery pass - clean up network states and issue old-style
  248. // node down events
  249. //
  250. for (i = ClusterMinNodeId; i <= NmMaxNodeId; i++) {
  251. node = NmpIdArray[i];
  252. if ( (node != NULL) && (BitsetIsMember(i, DownedNodeSet)) ) {
  253. //
  254. // Issue an individual node down event.
  255. //
  256. ClusterEvent(CLUSTER_EVENT_NODE_DOWN, node);
  257. //
  258. // Now do Intracluster RPC cleanup...
  259. //
  260. NmpTerminateRpcsToNode(node->NodeId);
  261. //
  262. // Update the network and interface information.
  263. //
  264. NmpUpdateNetworkConnectivityForDownNode(node);
  265. //
  266. // Log an event
  267. //
  268. if (NmpLeaderNodeId == NmLocalNodeId) {
  269. LPCWSTR nodeName = OmObjectName(node);
  270. CsLogEvent1(
  271. LOG_UNUSUAL,
  272. NM_EVENT_NODE_DOWN,
  273. nodeName
  274. );
  275. }
  276. }
  277. }
  278. //
  279. // If this node is the new leader, schedule a state computation for all
  280. // networks. State reports may have been received before this node
  281. // assumed leadership duties.
  282. //
  283. if (iAmNewLeader) {
  284. NmpRecomputeNT5NetworkAndInterfaceStates();
  285. }
  286. NmpReleaseLock();
  287. return(ERROR_SUCCESS);
  288. } // NmpNodesDownEventHandler //
  289. DWORD
  290. NmpNodeChange(
  291. IN DWORD NodeId,
  292. IN NODESTATUS NewStatus
  293. )
  294. {
  295. PNM_NODE node;
  296. CL_ASSERT(
  297. (NodeId >= ClusterMinNodeId) &&
  298. (NodeId <= NmMaxNodeId)
  299. );
  300. NmpAcquireLock();
  301. node = NmpIdArray[NodeId];
  302. CL_ASSERT(node != NULL);
  303. if (node != NULL) {
  304. if (NewStatus == NODE_DOWN) {
  305. NmpNodeDownEventHandler(node);
  306. }
  307. else {
  308. CL_ASSERT(NewStatus == NODE_UP);
  309. NmpNodeUpEventHandler(node);
  310. }
  311. }
  312. NmpReleaseLock();
  313. return(ERROR_SUCCESS);
  314. } // NmpNodeChange
  315. VOID
  316. NmpHoldIoEventHandler(
  317. VOID
  318. )
  319. {
  320. ClRtlLogPrint(LOG_NOISE,
  321. "[NM] Holding I/O.\n"
  322. );
  323. #if defined(HOLD_IO_IS_SAFE_NOW)
  324. FmHoldIO();
  325. #endif
  326. return;
  327. }
  328. VOID
  329. NmpResumeIoEventHandler(
  330. VOID
  331. )
  332. {
  333. ClRtlLogPrint(LOG_NOISE,
  334. "[NM] Resuming I/O.\n"
  335. );
  336. #if defined(HOLD_IO_IS_SAFE_NOW)
  337. FmResumeIO();
  338. #endif
  339. return;
  340. }
  341. BOOL
  342. NmpCheckQuorumEventHandler(
  343. VOID
  344. )
  345. {
  346. BOOL haveQuorum;
  347. //
  348. // daviddio 06/19/2000
  349. //
  350. // Before asking FM to arbitrate, determine if we have any
  351. // viable network interfaces. If not, return failure to MM
  352. // and allow other cluster nodes to arbitrate. The SCM
  353. // will restart the cluster service, so that if no nodes
  354. // successfully arbitrate, we will get another shot.
  355. //
  356. if (NmpCheckForNetwork()) {
  357. ClRtlLogPrint(LOG_NOISE,
  358. "[NM] Checking if we own the quorum resource.\n"
  359. );
  360. haveQuorum = FmArbitrateQuorumResource();
  361. if (haveQuorum) {
  362. ClRtlLogPrint(LOG_NOISE,
  363. "[NM] We own the quorum resource.\n"
  364. );
  365. }
  366. else {
  367. ClRtlLogPrint(LOG_NOISE,
  368. "[NM] We do not own the quorum resource, status %1!u!.\n",
  369. GetLastError()
  370. );
  371. //[GN] ClusnetHalt( NmClusnetHandle ); => NmpHaltEventHandler
  372. //
  373. }
  374. } else {
  375. ClRtlLogPrint(LOG_CRITICAL,
  376. "[NM] Abdicating quorum because no valid network "
  377. "interfaces were detected.\n"
  378. );
  379. haveQuorum = FALSE;
  380. }
  381. return(haveQuorum);
  382. } // NmpCheckQuorumEventHandler
  383. void
  384. NmpMsgCleanup1(
  385. IN DWORD DeadNodeId
  386. )
  387. {
  388. ClRtlLogPrint(LOG_NOISE,
  389. "[NM] Phase 1 message cleanup - node %1!u!.\n",
  390. DeadNodeId
  391. );
  392. return;
  393. }
  394. void
  395. NmpMsgCleanup2(
  396. IN BITSET DownedNodeSet
  397. )
  398. {
  399. ClRtlLogPrint(LOG_NOISE,
  400. "[NM] Phase 2 message cleanup - node %1!04X!.\n",
  401. DownedNodeSet
  402. );
  403. NmpAcquireLock();
  404. if ( NmpCleanupIfJoinAborted &&
  405. (NmpJoinerNodeId != ClusterInvalidNodeId) &&
  406. BitsetIsMember(NmpJoinerNodeId, DownedNodeSet) )
  407. {
  408. //
  409. // Since the joiner is in the DownedNodeSet mask
  410. // the node down will be delivered on this node by a regroup engine.
  411. // No need for NmpUpdateAbortJoin to issue a node down.
  412. //
  413. NmpCleanupIfJoinAborted = FALSE;
  414. ClRtlLogPrint(LOG_NOISE,
  415. "[NM] NmpCleanupIfJoinAborted is set to false. Joiner - %1!u!.\n",
  416. NmpJoinerNodeId
  417. );
  418. }
  419. NmpReleaseLock();
  420. //
  421. // Inform the rest of the service that these nodes are gone
  422. //
  423. ClusterSyncEventEx(
  424. CLUSTER_EVENT_NODE_DOWN_EX,
  425. EP_CONTEXT_VALID,
  426. ULongToPtr(DownedNodeSet)
  427. );
  428. return;
  429. }
  430. VOID
  431. NmpHaltEventHandler(
  432. IN DWORD HaltCode
  433. )
  434. {
  435. WCHAR string[16];
  436. // Do a graceful stop if we are shutting down //
  437. if (HaltCode == MM_STOP_REQUESTED) {
  438. DWORD Status = ERROR_SUCCESS;
  439. ClRtlLogPrint(LOG_UNUSUAL,
  440. "[NM] Prompt shutdown is requested by a membership engine\n"
  441. );
  442. ClusnetHalt( NmClusnetHandle );
  443. CsLogEvent(LOG_NOISE, SERVICE_SUCCESSFUL_TERMINATION);
  444. CsServiceStatus.dwCurrentState = SERVICE_STOPPED;
  445. CsServiceStatus.dwControlsAccepted = 0;
  446. CsServiceStatus.dwCheckPoint = 0;
  447. CsServiceStatus.dwWaitHint = 0;
  448. CsServiceStatus.dwWin32ExitCode = Status;
  449. CsServiceStatus.dwServiceSpecificExitCode = Status;
  450. CsAnnounceServiceStatus();
  451. ExitProcess(Status);
  452. } else {
  453. wsprintfW(&(string[0]), L"%u", HaltCode);
  454. ClRtlLogPrint(LOG_CRITICAL,
  455. "[NM] Halting this node due to membership or communications error. Halt code = %1!u!\n",
  456. HaltCode
  457. );
  458. ClusnetHalt( NmClusnetHandle );
  459. //
  460. // Adjust membership code to win32 error code. (If mapping exits)
  461. //
  462. HaltCode = MMMapHaltCodeToDosError( HaltCode );
  463. CsInconsistencyHalt(HaltCode);
  464. }
  465. }
  466. void
  467. NmpJoinFailed(
  468. void
  469. )
  470. {
  471. return;
  472. }
  473. DWORD
  474. NmpGumUpdateHandler(
  475. IN DWORD Context,
  476. IN BOOL SourceNode,
  477. IN DWORD BufferLength,
  478. IN PVOID Buffer
  479. )
  480. /*++
  481. Routine Description:
  482. Handles GUM updates for membership events.
  483. Arguments:
  484. Context - Supplies the update context. This is the message type
  485. SourceNode - Supplies whether or not the update originated on this node.
  486. BufferLength - Supplies the length of the update.
  487. Buffer - Supplies a pointer to the buffer.
  488. Return Value:
  489. ERROR_SUCCESS if successful
  490. Win32 error code otherwise
  491. --*/
  492. {
  493. DWORD status;
  494. if (Context == NmUpdateJoinComplete) {
  495. status = NmpUpdateJoinComplete(Buffer);
  496. }
  497. else {
  498. status = ERROR_SUCCESS;
  499. ClRtlLogPrint(LOG_UNUSUAL,
  500. "[NM] Discarding unknown gum request %1!u!\n",
  501. Context
  502. );
  503. }
  504. return(status);
  505. } // NmpUpdateGumHandler
  506. DWORD
  507. NmpMembershipInit(
  508. VOID
  509. )
  510. {
  511. DWORD status;
  512. ClRtlLogPrint(LOG_NOISE,"[NM] Initializing membership...\n");
  513. InitializeListHead(&NmpLeaderChangeWaitList);
  514. //
  515. // Initialize membership engine.
  516. //
  517. status = MMInit(
  518. NmLocalNodeId,
  519. NmMaxNodes,
  520. NmpNodeChange,
  521. NmpCheckQuorumEventHandler,
  522. NmpHoldIoEventHandler,
  523. NmpResumeIoEventHandler,
  524. NmpMsgCleanup1,
  525. NmpMsgCleanup2,
  526. NmpHaltEventHandler,
  527. NmpJoinFailed,
  528. NmpMultiNodeDownEventHandler
  529. );
  530. if (status != MM_OK) {
  531. status = MMMapStatusToDosError(status);
  532. ClRtlLogPrint(LOG_CRITICAL,
  533. "[NM] Membership initialization failed, status %1!u!.\n",
  534. status
  535. );
  536. return(status);
  537. }
  538. NmpMembershipCleanupOk = TRUE;
  539. ClRtlLogPrint(LOG_NOISE,"[NM] Membership initialization complete.\n");
  540. return(ERROR_SUCCESS);
  541. } // NmpMembershipInit
  542. VOID
  543. NmpMembershipShutdown(
  544. VOID
  545. )
  546. {
  547. if (NmpMembershipCleanupOk) {
  548. ClRtlLogPrint(LOG_NOISE,"[NM] Shutting down membership...\n");
  549. MMShutdown();
  550. NmpMembershipCleanupOk = FALSE;
  551. ClRtlLogPrint(LOG_NOISE,"[NM] Membership shutdown complete.\n");
  552. }
  553. return;
  554. } // NmpMembershipShutdown