Leaked source code of Windows Server 2003


/*++

Copyright (c) 1996 Microsoft Corporation

Module Name:

    fmevent.c

Abstract:

    Event Handler for the Failover Manager component of the
    NT Cluster Service

Author:

    Rod Gamache (rodga) 19-Mar-1996

Revision History:

--*/

#include "fmp.h"

#define LOG_MODULE EVENT

//
// Global data initialized in this module
//

//
// Local functions
//


DWORD
WINAPI
FmpEventHandler(
    IN CLUSTER_EVENT Event,
    IN PVOID Context
    )

/*++

Routine Description:

    This routine handles events for the Failover Manager.

    In many cases the request is posted to the FM's work queue, so
    that the mainline event processing is not blocked.

Arguments:

    Event - The event to be processed. Only one event is delivered at a
        time. If the event is not handled, return ERROR_SUCCESS.

    Context - A pointer to context associated with the particular event.

Returns:

    ERROR_SHUTDOWN_CLUSTER - if the Cluster must be shut down.

    A Win32 error code on other errors.

Notes:

    The conservation of energy and laws of inertia apply here.
    If a resource comes online it is because someone requested it to be so.
    Therefore, the energy from that request goes into the state of the Group,
    by requesting the Group to go online.

    However, if a resource goes offline, it could be because of a failure.
    We therefore only mark the state of a Group as offline if all resources
    contained within the group are offline.

--*/

{
    DWORD status;

    switch ( Event ) {

    case CLUSTER_EVENT_NODE_ADDED:
        CL_ASSERT( Context != NULL );
        FmpPostWorkItem( FM_EVENT_NODE_ADDED, Context, 0 );
        break;

    case CLUSTER_EVENT_NODE_UP:
        ClRtlLogPrint(LOG_NOISE,"[FM] Node up event\n");
        //
        // FM no longer cares about node up events.
        //
        break;

    case CLUSTER_EVENT_NODE_DOWN:
        FmpMajorEvent = TRUE; // Node Down is a major event.
        ClRtlLogPrint(LOG_NOISE,"[FM] FmpEventHandler::Node down event\n");
        FmpHandleNodeDownEvent( Context );
        break;

    case CLUSTER_EVENT_NODE_DELETED:
        FmpHandleNodeEvictEvent( Context );
        break;

    default:
        break;
    }

    return(ERROR_SUCCESS);

} // FmpEventHandler



DWORD
WINAPI
FmpSyncEventHandler(
    IN CLUSTER_EVENT Event,
    IN PVOID Context
    )

/*++

Routine Description:

    Processes node down cluster events. Updates the locker/locking node
    state and decides whether the last update needs to be replayed in the
    async handler.

Arguments:

    Event - Supplies the type of cluster event.

    Context - Supplies the event-specific context.

Return Value:

    ERROR_SUCCESS

--*/

{
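    //
    // Note: for CLUSTER_EVENT_NODE_DOWN_EX the event context is not an
    // object pointer; it is a bitset of downed node IDs packed directly
    // into the PVOID value, which is unpacked below.
    //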
    BITSET DownedNodes = (BITSET)((ULONG_PTR)Context);
    DWORD NodeId;

    if (Event != CLUSTER_EVENT_NODE_DOWN_EX) {
        return(ERROR_SUCCESS);
    }

    CL_ASSERT(BitsetIsNotMember(NmLocalNodeId, DownedNodes));

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpSyncEventHandler:: %1!04X!.\n",
        DownedNodes);

    //
    // Mark the nodes that went down. Until the worker thread finishes
    // processing the groups that belonged to such a node, we will block a
    // join from that same node.
    //
    for(NodeId = ClusterMinNodeId; NodeId <= NmMaxNodeId; ++NodeId)
    {
        if (BitsetIsMember(NodeId, DownedNodes))
        {
            gFmpNodeArray[NodeId].dwNodeDownProcessingInProgress = 1;
        }
    }

    return(ERROR_SUCCESS);
}



VOID
FmpHandleGroupFailure(
    IN PFM_GROUP Group,
    IN PFM_RESOURCE pResource OPTIONAL
    )

/*++

Routine Description:

    Handles Group failure notifications from the resource manager. If the
    Group can be moved to some other system and we are within the failover
    threshold, then move it. Otherwise, just leave the Group (partially)
    online on this system.

Arguments:

    Group - A pointer to the Group object for the failed Group.

    pResource - A pointer to the failed resource which caused the group
        failure. OPTIONAL

Returns:

    None.

--*/

{
    DWORD status;
    DWORD tickCount;
    DWORD withinFailoverPeriod;
    DWORD failoverPeriodInMs;
    BOOL newTime;
    PFM_RESOURCE Resource;
    PLIST_ENTRY listEntry;

    //
    // Chittur Subbaraman (chitturs) - 6/10/2001
    //
    // Changed the function to optionally take in a pResource and notify the
    // group if we decide to fail over the group.
    //
    FmpAcquireLocalGroupLock( Group );

    if ( ( !IS_VALID_FM_GROUP( Group ) ) || ( Group->OwnerNode != NmLocalNode ) ) {
        FmpReleaseLocalGroupLock( Group );
        return;
    }

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpHandleGroupFailure, Entry: Group failure for %1!ws!...\n",
        OmObjectId(Group));

    //
    // Convert the Group's failover period from hours to milliseconds.
    //
    failoverPeriodInMs = Group->FailoverPeriod * (3600*1000);

    //
    // Get the current time (in tick counts). We can save about 1193 hours
    // worth of milliseconds (or almost 50 days) in one DWORD.
    //
    tickCount = GetTickCount();

    //
    // Compute a boolean that indicates whether we are within the failover
    // period.
    //
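    // Note: the unsigned DWORD subtraction below stays correct even if
    // GetTickCount() has wrapped once since FailureTime was recorded, as
    // long as the real elapsed time is under the ~49.7 day wrap interval.
    //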
    withinFailoverPeriod = ( ((tickCount - Group->FailureTime) <=
                              failoverPeriodInMs ) ? TRUE : FALSE);

    //
    // Tally another failure.
    //
    if ( withinFailoverPeriod ) {
        ++Group->NumberOfFailures;
        newTime = FALSE;
    } else {
        Group->FailureTime = tickCount;
        Group->NumberOfFailures = 1;
        newTime = TRUE;
    }

    //
    // Tell everyone about our new FailureCount. Propagate failure count.
    //
    FmpPropagateFailureCount( Group, newTime );

    //
    // If this group is the same as the quorum group and the quorum
    // resource has failed...
    //
    if ( ( gpQuoResource->Group == Group ) &&
         ( gpQuoResource->State == ClusterResourceFailed ) )
    {
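        //
        // The quorum resource itself has failed on this node. Terminate the
        // failing resource (if supplied), clean up the quorum resource, and
        // then halt the cluster service; it cannot usefully continue without
        // the quorum resource.
        //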
        if ( pResource != NULL ) FmpTerminateResource( pResource );
        FmpCleanupQuorumResource(gpQuoResource);
#if DBG
        if (IsDebuggerPresent())
        {
            DebugBreak();
        }
#endif
        CsInconsistencyHalt(ERROR_QUORUM_RESOURCE_ONLINE_FAILED);
    }

    //
    // First check if we can move the Group someplace else.
    //
    if ( FmpGroupCanMove( Group ) &&
         (Group->NumberOfFailures <= Group->FailoverThreshold) ) {
        //
        // Chittur Subbaraman (chitturs) - 4/13/99
        //
        // Now create the FmpDoMoveGroupOnFailure thread to handle the
        // group move. The thread will wait until the group state becomes
        // stable and then initiate the move.
        //
        if( !( Group->dwStructState &
               FM_GROUP_STRUCT_MARKED_FOR_MOVE_ON_FAIL ) )
        {
            PMOVE_GROUP pContext = NULL;
            DWORD dwThreadId = 0;
            HANDLE hThread = NULL;

            //
            // The decision to fail over the group has been made (more or
            // less). So, notify all the group's resources of this decision.
            //
            FmpNotifyGroupStateChangeReason( Group, eResourceStateChangeReasonFailover );

            if ( pResource != NULL ) FmpTerminateResource( pResource );

            pContext = LocalAlloc( LMEM_FIXED, sizeof( MOVE_GROUP ) );
            if ( pContext == NULL ) {
                status = ERROR_NOT_ENOUGH_MEMORY;
                ClRtlLogPrint(LOG_UNUSUAL,
                    "[FM] Group failure for group <%1!ws!>. Unable to allocate memory.\n",
                    OmObjectId(Group));
                FmpReleaseLocalGroupLock( Group );
                return;
            }

            ClRtlLogPrint(LOG_UNUSUAL,
                "[FM] Group failure for group <%1!ws!>. Create thread to take offline and move.\n",
                OmObjectId(Group));

            //
            // Reference the Group object. You don't want the group object
            // to be deleted at the time the FmpDoMoveGroupOnFailure thread
            // executes.
            //
            OmReferenceObject( Group );

            pContext->Group = Group;
            pContext->DestinationNode = NULL;

            hThread = CreateThread( NULL,
                                    0,
                                    FmpDoMoveGroupOnFailure,
                                    pContext,
                                    0,
                                    &dwThreadId );

            if ( hThread == NULL ) {
                status = GetLastError();
                ClRtlLogPrint(LOG_UNUSUAL,
                    "[FM] Failed to create FmpDoMoveGroupOnFailure thread for group <%1!ws!>. Error %2!u!.\n",
                    OmObjectId(Group),
                    status);
                LocalFree( pContext );
                OmDereferenceObject( Group );
            } else {
                CloseHandle( hThread );
                //
                // Mark the group as being moved on failure. This is necessary
                // so that you don't spawn new FmpDoMoveGroupOnFailure threads
                // which try to concurrently move the group. Note that the
                // worker thread which calls this function may deliver multiple
                // failure notifications.
                //
                Group->dwStructState |= FM_GROUP_STRUCT_MARKED_FOR_MOVE_ON_FAIL;
            }
        }
        else {
            if ( pResource != NULL ) FmpTerminateResource( pResource );
        }
    } else {
        ClRtlLogPrint(LOG_UNUSUAL,
            "[FM] Group failure for %1!ws!, but can't move. Failure count = %2!d!.\n",
            OmObjectId(Group), Group->NumberOfFailures);

        if ( pResource != NULL ) FmpTerminateResource( pResource );

        //
        // All attempts to bring the group online failed - start the watchdog
        // timer to attempt a restart of all failed resources in this group.
        //
        for ( listEntry = Group->Contains.Flink;
              listEntry != &(Group->Contains);
              listEntry = listEntry->Flink )
        {
            Resource = CONTAINING_RECORD(listEntry, FM_RESOURCE, ContainsLinkage);
            FmpDelayedStartRes(Resource);
        }
    }

    FmpReleaseLocalGroupLock( Group );

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpHandleGroupFailure, Exit: Group failure for %1!ws!...\n",
        OmObjectId(Group));

    return;

} // FmpHandleGroupFailure



BOOL
FmpGroupCanMove(
    IN PFM_GROUP Group
    )

/*++

Routine Description:

    Indicates whether there is another system in the preferred owner list
    that can take the Group.

Arguments:

    Group - The Group to check if it can move.

Returns:

    TRUE - The Group can (probably) move to another system.

    FALSE - There is no place to move this Group.

--*/

{
    DWORD status;
    PNM_NODE node;

    node = FmpFindAnotherNode( Group, FALSE );
    if (node != NULL ) {
        return(TRUE);
    }

    return(FALSE);

} // FmpGroupCanMove



DWORD
FmpNodeDown(
    PVOID Context
    )

/*++

Routine Description:

    This routine handles a node down event from the NM layer.

Arguments:

    Context - The node that went down.

Returns:

    ERROR_SUCCESS if everything was handled okay.

    ERROR_SHUTDOWN_CLUSTER if catastrophe happens.

    A Win32 error code otherwise (???).

--*/

{
    PNM_NODE pNode = (PNM_NODE)Context;
    DWORD dwStatus;
    LPCWSTR pszNodeId;
    DWORD dwNodeLen;
    DWORD dwClusterHighestVersion;

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpNodeDown::Node down %1!ws!\n",
        OmObjectId(pNode));

    //
    // Chittur Subbaraman (chitturs) - 3/30/99
    //
    // Acquire the global group lock to synchronize with the shutdown.
    //
    FmpAcquireGroupLock();

    if (!FmpFMOnline || FmpShutdown)
    {
        //
        // We don't care about membership changes until we have finished
        // initializing and we're not shutting down.
        //
        FmpReleaseGroupLock();
        ClRtlLogPrint(LOG_CRITICAL,
            "[FM] FmpNodeDown - ignore node down event.\n" );
        goto FnExit;
    }

    FmpReleaseGroupLock();

    //SS: Note that all nodes will send this update.
    //The later updates should not find any groups that belong to
    //this node.
    //We can't rely on only the locker node making this update,
    //since the locker node may die before it is able to do this, and
    //that can result in these groups being orphaned.
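    //
    // The payload for the node down GUM updates below is the dead node's ID
    // string; dwNodeLen counts its size in bytes, including the terminating
    // NUL.
    //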
    pszNodeId = OmObjectId(pNode);
    dwNodeLen = (lstrlenW(pszNodeId)+1)*sizeof(WCHAR);

    NmGetClusterOperationalVersion( &dwClusterHighestVersion,
                                    NULL,
                                    NULL );

    //
    // If this is not a mixed-mode Win2K-Whistler cluster, attempt to
    // randomize the group preferred owners list and send it as a part of
    // the node down GUM update.
    //
    if ( CLUSTER_GET_MAJOR_VERSION( dwClusterHighestVersion ) >=
         NT51_MAJOR_VERSION )
    {
        PFM_GROUP_NODE_LIST pGroupNodeList = NULL;

        //
        // Attempt to get a contiguous buffer containing the list of group
        // IDs and suggested owners for them.
        //
        dwStatus = FmpPrepareGroupNodeList( &pGroupNodeList );

        if ( dwStatus != ERROR_SUCCESS )
        {
            //
            // If the call returns ERROR_CLUSTER_INVALID_REQUEST, it means a
            // user has turned off the randomization algorithm.
            //
            if ( dwStatus != ERROR_CLUSTER_INVALID_REQUEST )
                ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpNodeDown: FmpPrepareGroupNodeList returns %1!u!...\n",
                    dwStatus);
            LocalFree( pGroupNodeList );
            goto use_old_gum;
        }

        //
        // If the list does not even contain any entries, just switch to the
        // old GUM update. No point in sending the list header around.
        //
        if ( pGroupNodeList->cbGroupNodeList < sizeof ( FM_GROUP_NODE_LIST ) )
        {
            ClRtlLogPrint(LOG_NOISE, "[FM] FmpNodeDown: FmpPrepareGroupNodeList returns empty list...\n");
            LocalFree( pGroupNodeList );
            goto use_old_gum;
        }

        //
        // Invoke GUM to pass around the dead node ID and the randomized
        // group node list.
        //
        dwStatus = GumSendUpdateEx( GumUpdateFailoverManager,
                                    FmUpdateUseRandomizedNodeListForGroups,
                                    2,
                                    dwNodeLen,
                                    pszNodeId,
                                    pGroupNodeList->cbGroupNodeList,
                                    pGroupNodeList );

        if ( dwStatus != ERROR_SUCCESS )
        {
            ClRtlLogPrint(LOG_CRITICAL,
                "[FM] FmpNodeDown: GUM update FmUpdateUseRandomizedNodeListForGroups failed %1!d!\n",
                dwStatus);
        }

        LocalFree( pGroupNodeList );

        goto FnExit;
    }
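    //
    // Fallback path: a mixed-mode (Win2K) cluster, a failure to build the
    // randomized list, or an empty list all end up here and use the original
    // node down GUM update, which carries only the dead node's ID.
    //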
use_old_gum:
    dwStatus = GumSendUpdateEx(GumUpdateFailoverManager,
                               FmUpdateAssignOwnerToGroups,
                               1,
                               dwNodeLen,
                               pszNodeId);

    if (dwStatus != ERROR_SUCCESS)
    {
        ClRtlLogPrint(LOG_CRITICAL,
            "[FM] FmpNodeDown: Gumupdate failed %1!d!\n",
            dwStatus);
    }

FnExit:
    gFmpNodeArray[NmGetNodeId(pNode)].dwNodeDownProcessingThreadId = 0;
    OmDereferenceObject ( pNode );
    return(ERROR_SUCCESS);

} // FmpNodeDown



BOOL
WINAPI
FmVerifyNodeDown(
    IN PNM_NODE Node,
    OUT LPBOOL IsDown
    )

/*++

Routine Description:

    This routine attempts to verify whether a given node is down. This can
    only be done if there is some shared resource that the other system
    currently 'owns'. We will attempt to negotiate the shared resource and
    if we 'win' the negotiation we'll declare the other system down. If we
    lose arbitration, we declare the other system as still up.

Arguments:

    Node - A pointer to the node structure for the other system.

    IsDown - If we can perform the verification, this indicates the result
        of that verification.

Returns:

    TRUE - If we can perform the verification.

    FALSE - If we can't perform the verification.

--*/

{
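    //
    // Arbitration-based verification is not implemented here; always report
    // that the verification could not be performed.
    //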
    return(FALSE);

} // FmVerifyNodeDown



DWORD
FmpHandleNodeDownEvent(
    IN PVOID pContext
    )

/*++

Routine Description:

    This function creates a thread to handle the node down event.

Arguments:

    pContext - Pointer to the context structure.

Returns:

    ERROR_SUCCESS

--*/

{
    HANDLE hThread = NULL;
    DWORD dwError;

    //
    // Chittur Subbaraman (chitturs) - 7/31/99
    //
    // Create a thread to handle the FM node down event. Let us not
    // rely on the FM worker thread to handle this. This is because
    // the FM worker thread could be trying to bring some resource online
    // and that could get stuck for some time since the quorum resource
    // is not online. Now in some cases, only after the node down event
    // is processed can the quorum resource come online. (This is
    // highly likely, especially in a 2 node cluster.)
    //
    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpHandleNodeDownEvent - Create thread to handle node down event....\n"
        );

    //
    // Reference the node object.
    //
    OmReferenceObject( pContext );

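    //
    // CreateThread records the new thread's ID in the per-node array;
    // FmpHandleNodeEvictEvent uses that ID to wait for node down processing
    // to complete before evicting the node, and FmpNodeDown clears it when
    // it is done.
    //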
    hThread = CreateThread( NULL,
                            0,
                            FmpNodeDown,
                            pContext,
                            0,
                            &gFmpNodeArray[NmGetNodeId(pContext)].dwNodeDownProcessingThreadId );

    if ( hThread == NULL )
    {
        OmDereferenceObject( pContext );
        dwError = GetLastError();
        ClRtlLogPrint(LOG_CRITICAL,
            "[FM] FmpHandleNodeDownEvent - Unable to create thread to handle node down event. Error=0x%1!08lx!\r\n",
            dwError);
        CsInconsistencyHalt( dwError );
    }

    if ( hThread != NULL )
    {
        CloseHandle( hThread );
    }

    return( ERROR_SUCCESS );

} // FmpHandleNodeDownEvent



VOID
FmpHandleNodeEvictEvent(
    IN PVOID pContext
    )

/*++

Routine Description:

    This function synchronizes the FM evict processing with the node down
    event handler.

Arguments:

    pContext - Pointer to the context structure (just contains the node
        object).

Returns:

    None

--*/

{
    HANDLE hThread;
    PNM_NODE pNode = ( PNM_NODE ) pContext;
    DWORD dwWaitStatus;

    ClRtlLogPrint(LOG_NOISE,
        "[FM] FmpHandleNodeEvictEvent: Handle node %1!u! evict event\n",
        NmGetNodeId(pNode));

    //
    // Chittur Subbaraman (chitturs) - 10/8/2001
    //
    // This event handler was designed to solve the synchronization problem
    // between FM node down processing and FM node evict processing. In the
    // past, the NM evict GUM handler used to directly post a work item to
    // the FM worker thread to perform the evict processing. NM posts the
    // node down event to FM via the event processing mechanism. Since these
    // two activities were independent of each other, the node evict
    // processing could complete before the node down processing began. This
    // caused the node down processing (which claims the groups owned by the
    // evicted, downed node) to fail horribly, since it could no longer
    // reference the evicted node. To solve this problem, NM no longer posts
    // any work item to the FM worker thread to perform evict processing.
    // Instead, it uses the event processing mechanism to post the
    // CLUSTER_EVENT_NODE_DELETED event to the FmpEventHandler, which invokes
    // this function. In this function, we detect if the node down processing
    // is in progress and, if so, we wait until the node down processing
    // thread completes its job. Then we go ahead and do the FM evict
    // processing (which involves posting a work item to the FM worker
    // thread). This function is designed based on the assumption that NM
    // ALWAYS posts the CLUSTER_EVENT_NODE_DOWN event BEFORE the
    // CLUSTER_EVENT_NODE_DELETED event. Note also that since the
    // FmpEventHandler handles both these cluster events serially, one after
    // another, we are guaranteed not to have any races between the evict
    // processing code and the node down handling code.
    //

    //
    // Check if the FM node down handler is working on a node down event for
    // the node that is being evicted.
    //
    if ( gFmpNodeArray[NmGetNodeId(pNode)].dwNodeDownProcessingThreadId != 0 )
    {
        ClRtlLogPrint(LOG_NOISE,
            "[FM] FmpHandleNodeEvictEvent: Thread 0x%1!08lx! is currently processing node down, try opening it for wait\n",
            gFmpNodeArray[NmGetNodeId(pNode)].dwNodeDownProcessingThreadId);

        //
        // Get a handle to that thread.
        //
        hThread = OpenThread ( SYNCHRONIZE,   // Desired access
                               FALSE,         // Inherit handles
                               gFmpNodeArray[NmGetNodeId(pNode)].dwNodeDownProcessingThreadId ); // Thread ID

        if ( hThread != NULL )
        {
            //
            // Wait until that thread terminates. Is it better to wait until
            // a timeout and do an inconsistency halt if the thread doesn't
            // terminate? If so, how well can you determine the time for the
            // node down processing (including GUMs) to complete?
            //
            dwWaitStatus = WaitForSingleObject ( hThread, INFINITE );

            CloseHandle ( hThread );

            ClRtlLogPrint(LOG_NOISE,
                "[FM] FmpHandleNodeEvictEvent: Returning from wait, wait status %1!u!, continue with eviction\n",
                dwWaitStatus);
        } else
        {
            ClRtlLogPrint(LOG_NOISE,
                "[FM] FmpHandleNodeEvictEvent: Unable to open thread 0x%1!08lx!, proceed with eviction\n",
                gFmpNodeArray[NmGetNodeId(pNode)].dwNodeDownProcessingThreadId);
        }
    } // if ( gFmpNodeArray[NmGetNodeId(pNode)] ... )

    //
    // Invoke the FM API to evict the node.
    //
    FmEvictNode ( pNode );

} // FmpHandleNodeEvictEvent