Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

655 lines
17 KiB

  1. /*++
  2. Copyright (c) 1996 Microsoft Corporation
  3. Module Name:
  4. fmevent.c
  5. Abstract:
  6. Event Handler for the Failover Manager component of the
  7. NT Cluster Service
  8. Author:
  9. Rod Gamache (rodga) 19-Mar-1996
  10. Revision History:
  11. --*/
  12. #include "fmp.h"
  13. #define LOG_MODULE EVENT
  14. //
  15. // Global data initialized in this module
  16. //
  17. //
  18. // Local functions
  19. //
  20. DWORD
  21. WINAPI
  22. FmpEventHandler(
  23. IN CLUSTER_EVENT Event,
  24. IN PVOID Context
  25. )
  26. /*++
  27. Routine Description:
  28. This routine handles events for the Failover Manager.
  29. In many cases the request is posted to the FM's work queue, so
  30. that the mainline event process is not blocked.
  31. Arguments:
  32. Event - The event to be processed. Only one event at a time.
  33. If the event is not handled, return ERROR_SUCCESS.
  34. Context - A pointer to context associated with the particular event.
  35. Returns:
  36. ERROR_SHUTDOWN_CLUSTER - if the Cluster must be shutdown.
  37. A Win32 error code on other errors.
  38. Notes:
  39. The conservation of energy, and laws of inertia apply here.
  40. If a resource comes online it is because someone requested it to be so.
  41. Therefore, the energy from that request goes into the state of the Group,
  42. by requesting the Group to go online.
  43. However, if a resource goes offline, it could be because of a failure.
  44. We therefore only mark the state of a Group as offline if all resources
  45. contained within the group are offline.
  46. --*/
  47. {
  48. DWORD status;
  49. switch ( Event ) {
  50. case CLUSTER_EVENT_GROUP_FAILED:
  51. CL_ASSERT( Context != NULL );
  52. FmpPostWorkItem( FM_EVENT_GROUP_FAILED, Context, 0 );
  53. break;
  54. case CLUSTER_EVENT_NODE_ADDED:
  55. CL_ASSERT( Context != NULL );
  56. FmpPostWorkItem( FM_EVENT_NODE_ADDED, Context, 0 );
  57. break;
  58. case CLUSTER_EVENT_NODE_UP:
  59. ClRtlLogPrint(LOG_NOISE,"[FM] Node up event\n");
  60. //
  61. // FM no longer cares about node up events.
  62. //
  63. break;
  64. case CLUSTER_EVENT_NODE_DOWN:
  65. FmpMajorEvent = TRUE; // Node Down is a major event.
  66. ClRtlLogPrint(LOG_NOISE,"[FM] FmpEventHandler::Node down event\n");
  67. FmpHandleNodeDownEvent( Context );
  68. break;
  69. default:
  70. break;
  71. }
  72. return(ERROR_SUCCESS);
  73. } // FmEventHandler
  74. DWORD
  75. WINAPI
  76. FmpSyncEventHandler(
  77. IN CLUSTER_EVENT Event,
  78. IN PVOID Context
  79. )
  80. /*++
  81. Routine Description:
  82. Processes nodes down cluster events. Update locker/locking nodes
  83. state and decide if we need to replay last update in async handler.
  84. Arguments:
  85. Event - Supplies the type of cluster event.
  86. Context - Supplies the event-specific context
  87. Return Value:
  88. ERROR_SUCCESS
  89. --*/
  90. {
  91. BITSET DownedNodes = (BITSET)((ULONG_PTR)Context);
  92. DWORD NodeId;
  93. if (Event != CLUSTER_EVENT_NODE_DOWN_EX) {
  94. return(ERROR_SUCCESS);
  95. }
  96. CL_ASSERT(BitsetIsNotMember(NmLocalNodeId, DownedNodes));
  97. ClRtlLogPrint(LOG_NOISE,
  98. "[FM] FmpSyncEventHandler:: %1!04X!.\n",
  99. DownedNodes);
  100. //
  101. // mark the nodes that go down
  102. // till the worker thread finishes processing the groups that belonged
  103. // to this node, we will block a join from the same node
  104. //
  105. for(NodeId = ClusterMinNodeId; NodeId <= NmMaxNodeId; ++NodeId)
  106. {
  107. if (BitsetIsMember(NodeId, DownedNodes))
  108. {
  109. gFmpNodeArray[NodeId].dwNodeDownProcessingInProgress = 1;
  110. }
  111. }
  112. return(ERROR_SUCCESS);
  113. }
  114. VOID
  115. FmpHandleGroupFailure(
  116. IN PFM_GROUP Group
  117. )
  118. /*++
  119. Routine Description:
  120. Handles Group failure notifications from the resource manager. If the
  121. Group can be moved to some other system and we are within the failover
  122. threshold, then move it. Otherwise, just leave the Group (partially)
  123. online on this system.
  124. Arguments:
  125. Group - a pointer to the Group object for the failed Group.
  126. Returns:
  127. None.
  128. --*/
  129. {
  130. DWORD status;
  131. DWORD tickCount;
  132. DWORD withinFailoverPeriod;
  133. DWORD failoverPeriodInMs;
  134. BOOL newTime;
  135. PFM_RESOURCE Resource;
  136. PLIST_ENTRY listEntry;
  137. FmpAcquireLocalGroupLock( Group );
  138. if ( ( !IS_VALID_FM_GROUP( Group ) ) || ( Group->OwnerNode != NmLocalNode ) ) {
  139. FmpReleaseLocalGroupLock( Group );
  140. return;
  141. }
  142. ClRtlLogPrint(LOG_NOISE,
  143. "[FM] FmpHandleGroupFailure, Entry: Group failure for %1!ws!...\n",
  144. OmObjectId(Group));
  145. //
  146. // Convert Group's failover period from hours to milliseconds.
  147. //
  148. failoverPeriodInMs = Group->FailoverPeriod * (3600*1000);
  149. //
  150. // Get current time (in tick counts). We can save about 1193 hours worth
  151. // of milliseconds (or almost 50 days) in one DWORD.
  152. //
  153. tickCount = GetTickCount();
  154. //
  155. // Compute boolean that indicates if we are whithin the failover period.
  156. //
  157. withinFailoverPeriod = ( ((tickCount - Group->FailureTime) <=
  158. failoverPeriodInMs ) ? TRUE : FALSE);
  159. //
  160. // Tally another failure.
  161. //
  162. if ( withinFailoverPeriod ) {
  163. ++Group->NumberOfFailures;
  164. newTime = FALSE;
  165. } else {
  166. Group->FailureTime = tickCount;
  167. Group->NumberOfFailures = 1;
  168. newTime = TRUE;
  169. }
  170. //
  171. // Tell everyone about our new FailureCount. Propagate failure
  172. // count
  173. //
  174. FmpPropagateFailureCount( Group, newTime );
  175. //
  176. // If this group is the same as the quorum group and the quorum
  177. // resource has failed
  178. //
  179. if ( ( gpQuoResource->Group == Group ) &&
  180. ( gpQuoResource->State == ClusterResourceFailed ) )
  181. {
  182. FmpCleanupQuorumResource(gpQuoResource);
  183. #if DBG
  184. if (IsDebuggerPresent())
  185. {
  186. DebugBreak();
  187. }
  188. #endif
  189. CsInconsistencyHalt(ERROR_QUORUM_RESOURCE_ONLINE_FAILED);
  190. }
  191. //
  192. // First check if we can move the Group someplace else.
  193. //
  194. if ( FmpGroupCanMove( Group ) &&
  195. (Group->NumberOfFailures <= Group->FailoverThreshold) ) {
  196. //
  197. // Chittur Subbaraman (chitturs) - 4/13/99
  198. //
  199. // Now create the FmpDoMoveGroupOnFailure thread to handle the
  200. // group move. The thread will wait until the group state becomes
  201. // stable and then initiate the move.
  202. //
  203. if( !( Group->dwStructState &
  204. FM_GROUP_STRUCT_MARKED_FOR_MOVE_ON_FAIL ) )
  205. {
  206. PMOVE_GROUP pContext = NULL;
  207. DWORD dwThreadId = 0;
  208. HANDLE hThread = NULL;
  209. pContext = LocalAlloc( LMEM_FIXED, sizeof( MOVE_GROUP ) );
  210. if ( pContext == NULL ) {
  211. status = ERROR_NOT_ENOUGH_MEMORY;
  212. ClRtlLogPrint(LOG_UNUSUAL,
  213. "[FM] Group failure for group <%1!ws!>. Unable to allocate memory.\n",
  214. OmObjectId(Group));
  215. FmpReleaseLocalGroupLock( Group );
  216. return;
  217. }
  218. ClRtlLogPrint(LOG_UNUSUAL,
  219. "[FM] Group failure for group <%1!ws!>. Create thread to take offline and move.\n",
  220. OmObjectId(Group));
  221. //
  222. // Reference the Group object. You don't want the group object
  223. // to be deleted at the time the FmpDoMoveGroupOnFailure thread
  224. // executes.
  225. //
  226. OmReferenceObject( Group );
  227. pContext->Group = Group;
  228. pContext->DestinationNode = NULL;
  229. hThread = CreateThread( NULL,
  230. 0,
  231. FmpDoMoveGroupOnFailure,
  232. pContext,
  233. 0,
  234. &dwThreadId );
  235. if ( hThread == NULL ) {
  236. status = GetLastError();
  237. ClRtlLogPrint(LOG_UNUSUAL,
  238. "[FM] Failed to create FmpDoMoveGroupOnFailure thread for group <%1!ws!>. Error %2!u!.\n",
  239. OmObjectId(Group),
  240. status);
  241. LocalFree( pContext );
  242. OmDereferenceObject( Group );
  243. } else {
  244. CloseHandle( hThread );
  245. //
  246. // Mark the group as being moved on failure. This is necessary
  247. // so that you don't spawn new FmpDoMoveGroupOnFailure threads
  248. // which try to concurrently move the group. Note that the
  249. // worker thread which calls this function may deliver multiple
  250. // failure notifications.
  251. //
  252. Group->dwStructState |= FM_GROUP_STRUCT_MARKED_FOR_MOVE_ON_FAIL;
  253. }
  254. }
  255. } else {
  256. ClRtlLogPrint(LOG_UNUSUAL,
  257. "[FM] Group failure for %1!ws!, but can't move. Failure count = %2!d!.\n",
  258. OmObjectId(Group), Group->NumberOfFailures);
  259. // All attempts to bring group online failed - start the watchdog timer
  260. // to attempt a restart of all failed resources in this group.
  261. for ( listEntry = Group->Contains.Flink;
  262. listEntry != &(Group->Contains);
  263. listEntry = listEntry->Flink )
  264. {
  265. Resource = CONTAINING_RECORD(listEntry, FM_RESOURCE, ContainsLinkage);
  266. FmpDelayedStartRes(Resource);
  267. }
  268. }
  269. FmpReleaseLocalGroupLock( Group );
  270. ClRtlLogPrint(LOG_NOISE,
  271. "[FM] FmpHandleGroupFailure, Exit: Group failure for %1!ws!...\n",
  272. OmObjectId(Group));
  273. return;
  274. } // FmpHandleGroupFailure
  275. BOOL
  276. FmpGroupCanMove(
  277. IN PFM_GROUP Group
  278. )
  279. /*++
  280. Routine Description:
  281. Indicates whether there is another system that is in the preferred owner
  282. list that can take a Group.
  283. Arguments:
  284. Group - the Group to check if it can move.
  285. Returns:
  286. TRUE - the Group can (probably) move to another system.
  287. FALSE - there is no place to move this Group.
  288. --*/
  289. {
  290. DWORD status;
  291. PNM_NODE node;
  292. node = FmpFindAnotherNode( Group, FALSE );
  293. if (node != NULL ) {
  294. return(TRUE);
  295. }
  296. return(FALSE);
  297. } // FmpGroupCanMove
  298. DWORD
  299. FmpNodeDown(
  300. PVOID Context
  301. )
  302. /*++
  303. Routine Description:
  304. This routine handles a node down event from the NM layer.
  305. Arguments:
  306. Context - The node that went down.
  307. Returns:
  308. ERROR_SUCCESS if everything was handled okay.
  309. ERROR_SHUTDOWN_CLUSTER if catastrophy happens.
  310. Win32 error code otherwise (???).
  311. --*/
  312. {
  313. PNM_NODE pNode = (PNM_NODE)Context;
  314. DWORD dwStatus;
  315. LPCWSTR pszNodeId;
  316. DWORD dwNodeLen;
  317. DWORD dwClusterHighestVersion;
  318. ClRtlLogPrint(LOG_NOISE,
  319. "[FM] FmpNodeDown::Node down %1!ws!\n",
  320. OmObjectId(pNode));
  321. //
  322. // Chittur Subbaraman (chitturs) - 3/30/99
  323. //
  324. // Acquire the global group lock to synchronize with the shutdown
  325. //
  326. FmpAcquireGroupLock();
  327. if (!FmpFMOnline || FmpShutdown)
  328. {
  329. //
  330. // We don't care about membership changes until we have finished
  331. // initializing and we're not shutting down.
  332. //
  333. FmpReleaseGroupLock();
  334. ClRtlLogPrint(LOG_CRITICAL,
  335. "[FM] FmpNodeDown - ignore node down event.\n" );
  336. return(ERROR_SUCCESS);
  337. }
  338. FmpReleaseGroupLock();
  339. //SS: Note all nodes will send this update
  340. //The latter updates should not find any groups that belong to
  341. //this node
  342. //We cant rely on only the locker node making this update
  343. //since the locker node may die before it is able to do this and
  344. //that can result in these groups being orphaned
  345. pszNodeId = OmObjectId(pNode);
  346. dwNodeLen = (lstrlenW(pszNodeId)+1)*sizeof(WCHAR);
  347. NmGetClusterOperationalVersion( &dwClusterHighestVersion,
  348. NULL,
  349. NULL );
  350. //
  351. // If this is a non Win2k-Whistler mixed mode cluster, attempt to randomize the
  352. // group preferred owners list and send it as a part of node down GUM.
  353. //
  354. if ( CLUSTER_GET_MAJOR_VERSION( dwClusterHighestVersion ) >=
  355. NT51_MAJOR_VERSION )
  356. {
  357. PFM_GROUP_NODE_LIST pGroupNodeList = NULL;
  358. //
  359. // Attempt to get a contiguous buffer containing the list of group IDs and suggested
  360. // owners for them.
  361. //
  362. dwStatus = FmpPrepareGroupNodeList( &pGroupNodeList );
  363. if ( dwStatus != ERROR_SUCCESS )
  364. {
  365. //
  366. // If the call returns ERROR_CLUSTER_INVALID_REQUEST, it means a user has turned
  367. // off the randomization algorithm.
  368. //
  369. if ( dwStatus != ERROR_CLUSTER_INVALID_REQUEST )
  370. ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpNodeDown: FmpPrepareGroupNodeList returns %1!u!...\n",
  371. dwStatus);
  372. LocalFree( pGroupNodeList );
  373. goto use_old_gum;
  374. }
  375. //
  376. // If the list does not even contain any entries, just switch to the old gum. No point in
  377. // sending the list header around.
  378. //
  379. if ( pGroupNodeList->cbGroupNodeList < sizeof ( FM_GROUP_NODE_LIST ) )
  380. {
  381. ClRtlLogPrint(LOG_NOISE, "[FM] FmpNodeDown: FmpPrepareGroupNodeList returns empty list...\n");
  382. LocalFree( pGroupNodeList );
  383. goto use_old_gum;
  384. }
  385. //
  386. // Invoke GUM to pass around the dead node ID and the randomized group node list
  387. //
  388. dwStatus = GumSendUpdateEx( GumUpdateFailoverManager,
  389. FmUpdateUseRandomizedNodeListForGroups,
  390. 2,
  391. dwNodeLen,
  392. pszNodeId,
  393. pGroupNodeList->cbGroupNodeList,
  394. pGroupNodeList );
  395. if ( dwStatus != ERROR_SUCCESS )
  396. {
  397. ClRtlLogPrint(LOG_CRITICAL,
  398. "[FM] FmpNodeDown: GUM update FmUpdateUseRandomizedNodeListForGroups failed %1!d!\n",
  399. dwStatus);
  400. }
  401. LocalFree( pGroupNodeList );
  402. return( ERROR_SUCCESS );
  403. }
  404. use_old_gum:
  405. dwStatus = GumSendUpdateEx(GumUpdateFailoverManager,
  406. FmUpdateAssignOwnerToGroups,
  407. 1,
  408. dwNodeLen,
  409. pszNodeId);
  410. if (dwStatus != ERROR_SUCCESS)
  411. {
  412. ClRtlLogPrint(LOG_CRITICAL,
  413. "[FM] FmpNodeDown: Gumupdate failed %1!d!\n",
  414. dwStatus);
  415. }
  416. return(ERROR_SUCCESS);
  417. } // FmpNodeDown
  418. BOOL
  419. WINAPI
  420. FmVerifyNodeDown(
  421. IN PNM_NODE Node,
  422. OUT LPBOOL IsDown
  423. )
  424. /*++
  425. Routine Description:
  426. This routine attempts to verify whether a given node is down. This can
  427. only be done if there is some shared resource that the other system
  428. currently 'owns'. We will attempt to negotiate the shared resource and
  429. if we 'win' the negotiation we'll declare that other system down. If we
  430. loose arbitration, we declare the other system as still up.
  431. Arguments:
  432. Node - A pointer to the node structure for the other system.
  433. IsDown - A we can perform the verification, this indicates the results of
  434. that verification.
  435. Returns:
  436. TRUE - If we can perform the verification.
  437. FALSE - If we can't perform the verification.
  438. --*/
  439. {
  440. return(FALSE);
  441. } // FmVerifyNodeDown
  442. DWORD
  443. FmpHandleNodeDownEvent(
  444. IN PVOID pContext
  445. )
  446. /*++
  447. Routine Description:
  448. This function creates a thread to handle the node down event.
  449. Arguments:
  450. pContext - Pointer to the context structure
  451. Returns:
  452. ERROR_SUCCESS
  453. --*/
  454. {
  455. HANDLE hThread = NULL;
  456. DWORD dwThreadId;
  457. DWORD dwError;
  458. //
  459. // Chittur Subbaraman (chitturs) - 7/31/99
  460. //
  461. // Create a thread to handle the FM node down event. Let us not
  462. // rely on the FM worker thread to handle this. This is because
  463. // the FM worker thread could be trying to online some resource
  464. // and that could get stuck for some time since the quorum resource
  465. // is not online. Now in some cases, only after the node down event
  466. // is processed the quorum resource could come online. (This is
  467. // highly likely especially in a 2 node cluster.)
  468. //
  469. ClRtlLogPrint(LOG_NOISE,
  470. "[FM] FmpHandleNodeDownEvent - Create thread to handle node down event....\n"
  471. );
  472. hThread = CreateThread( NULL,
  473. 0,
  474. FmpNodeDown,
  475. pContext,
  476. 0,
  477. &dwThreadId );
  478. if ( hThread == NULL )
  479. {
  480. dwError = GetLastError();
  481. ClRtlLogPrint(LOG_CRITICAL,
  482. "[FM] FmpHandleNodeDownEvent - Unable to create thread to handle node down event. Error=0x%1!08lx!\r\n",
  483. dwError);
  484. CsInconsistencyHalt( dwError );
  485. }
  486. CloseHandle( hThread );
  487. return( ERROR_SUCCESS );
  488. } // FmpHandleNodeDownEvent