Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

5604 lines
171 KiB

  1. /*++
  2. Copyright (c) 1996 Microsoft Corporation
  3. Module Name:
  4. group.c
  5. Abstract:
  6. Cluster group management routines.
  7. Author:
  8. Rod Gamache (rodga) 8-Mar-1996
  9. Notes:
  10. WARNING: All of the routines in this file assume that the group
  11. lock is held when they are called.
  12. Revision History:
  13. --*/
  14. #include "fmp.h"
  15. #define LOG_MODULE GROUP
  16. //
  17. // Global Data
  18. //
  19. CRITICAL_SECTION FmpGroupLock;
  20. //
  21. // Local function prototypes
  22. //
  23. /////////////////////////////////////////////////////////////////////////////
  24. //
  25. // Group Management Routines
  26. //
  27. /////////////////////////////////////////////////////////////////////////////
  28. BOOL
  29. FmpInPreferredList(
  30. IN PFM_GROUP Group,
  31. IN PNM_NODE Node,
  32. IN BOOL bRecalc,
  33. IN PFM_RESOURCE pRefResource
  34. )
  35. /*++
  36. Routine Description:
  37. Check if a node is in the preferred list for the Group.
  38. Arguments:
  39. Group - Pointer to the group object with the preferred owners list.
  40. Node - The Node to check for.
  41. bRecalc - If set to TRUE, we recalculate the preferred list for the group
  42. based on the possible node list for the reference resource.
  43. pRefResource - If NULL, we walk all the resources in the
  44. group and calculate their possible node list to see
  45. if it has since expanded due to the fact that dlls
  46. were copied to nodes.
  47. Return Value:
  48. TRUE - if the node is in the list.
  49. FALSE - if the node is NOT in the list.
  50. --*/
  51. {
  52. PLIST_ENTRY listEntry;
  53. PPREFERRED_ENTRY preferredEntry;
  54. BOOL bRet = FALSE;
  55. //
  56. // For each entry in the Preferred list, it must exist in the possible
  57. // list.
  58. //
  59. ChkInPrefList:
  60. for ( listEntry = Group->PreferredOwners.Flink;
  61. listEntry != &(Group->PreferredOwners);
  62. listEntry = listEntry->Flink ) {
  63. preferredEntry = CONTAINING_RECORD( listEntry,
  64. PREFERRED_ENTRY,
  65. PreferredLinkage );
  66. if ( preferredEntry->PreferredNode == Node ) {
  67. return(TRUE);
  68. }
  69. }
  70. if (bRecalc)
  71. {
  72. PFM_RESOURCE pResource;
  73. DWORD dwStatus;
  74. LPWSTR lpszOwners = NULL;
  75. DWORD dwMaxSize=0;
  76. HDMKEY hGroupKey;
  77. DWORD dwSize = 0;
  78. hGroupKey = DmOpenKey(DmGroupsKey, OmObjectId(Group),
  79. KEY_READ);
  80. if (hGroupKey == NULL)
  81. {
  82. dwStatus = GetLastError();
  83. ClRtlLogPrint(LOG_CRITICAL,
  84. "[FM] FmInPreferredList: Couldnt open group key\r\n",
  85. dwStatus);
  86. CL_UNEXPECTED_ERROR(dwStatus);
  87. goto FnExit;
  88. }
  89. //the group preferred list must not be set by the user
  90. //if it is then there is no point in doing this recalculation
  91. dwStatus = DmQueryMultiSz( hGroupKey,
  92. CLUSREG_NAME_GRP_PREFERRED_OWNERS,
  93. &lpszOwners,
  94. &dwMaxSize,
  95. &dwSize );
  96. if (lpszOwners)
  97. LocalFree(lpszOwners);
  98. DmCloseKey(hGroupKey);
  99. if (dwStatus == ERROR_FILE_NOT_FOUND)
  100. {
  101. DWORD dwUserModified;
  102. for (listEntry = Group->Contains.Flink;
  103. listEntry != &(Group->Contains);
  104. listEntry = listEntry->Flink)
  105. {
  106. pResource = CONTAINING_RECORD(listEntry, FM_RESOURCE, ContainsLinkage);
  107. //the resource possible node list must not be set by the user
  108. //if it is, then we can skip this resource
  109. dwStatus = DmQueryDword( pResource->RegistryKey,
  110. CLUSREG_NAME_RES_USER_MODIFIED_POSSIBLE_LIST,
  111. &dwUserModified,
  112. NULL );
  113. if (dwStatus == ERROR_FILE_NOT_FOUND)
  114. {
  115. FmpSetPossibleNodeForResType(OmObjectId(pResource->Type),
  116. TRUE);
  117. if (FmpInPossibleListForResType(pResource->Type,Node) &&
  118. !FmpInPossibleListForResource(pResource, Node))
  119. {
  120. //add to the resource possible node list
  121. //this will or add to the pref list of the group
  122. FmChangeResourceNode(pResource, Node, TRUE);
  123. }
  124. }
  125. }
  126. //set bRecalc to be FALSE so that we dont evaluate this again
  127. bRecalc = FALSE;
  128. goto ChkInPrefList;
  129. }
  130. }
  131. FnExit:
  132. return(bRet);
  133. } // FmpInPreferredList
  134. BOOL
  135. FmpHigherInPreferredList(
  136. IN PFM_GROUP Group,
  137. IN PNM_NODE Node1,
  138. IN PNM_NODE Node2
  139. )
  140. /*++
  141. Routine Description:
  142. Check if Node1 is higher (in priority) in the preferred owners list than
  143. Node1.
  144. Arguments:
  145. Group - Pointer to the group object with the preferred owners list.
  146. Node1 - The Node that should be higher in the list.
  147. Node2 - The Node that should be lower in the list.
  148. Return Value:
  149. TRUE - if Node1 is higher in the list.
  150. FALSE - if Node2 is higher in the list, or Node1 is not in the list at all.
  151. --*/
  152. {
  153. PLIST_ENTRY listEntry;
  154. PPREFERRED_ENTRY preferredEntry;
  155. DWORD orderedOwners = 0;
  156. //
  157. // For each entry in the Preferred list, check whether Node1 or Node2 is
  158. // higher.
  159. //
  160. for ( listEntry = Group->PreferredOwners.Flink;
  161. listEntry != &(Group->PreferredOwners),
  162. orderedOwners < Group->OrderedOwners;
  163. listEntry = listEntry->Flink ) {
  164. preferredEntry = CONTAINING_RECORD( listEntry,
  165. PREFERRED_ENTRY,
  166. PreferredLinkage );
  167. if ( preferredEntry->PreferredNode == Node1 ) {
  168. return(TRUE);
  169. }
  170. if ( preferredEntry->PreferredNode == Node2 ) {
  171. return(FALSE);
  172. }
  173. orderedOwners++;
  174. }
  175. return(FALSE);
  176. } // FmpHigherInPreferredList
  177. DWORD
  178. FmpSetPreferredEntry(
  179. IN PFM_GROUP Group,
  180. IN PNM_NODE Node
  181. )
  182. /*++
  183. Routine Description:
  184. Add a node to the preferred list for the Group.
  185. Arguments:
  186. Group - Pointer to the group object with the preferred owners list.
  187. Node - The Node to add.
  188. Return Value:
  189. ERROR_SUCCESS if node is added.
  190. ERROR_NOT_ENOUGH_MEMORY on failure.
  191. --*/
  192. {
  193. PLIST_ENTRY listEntry;
  194. PPREFERRED_ENTRY preferredEntry;
  195. //
  196. // Make sure entry is not already present in list.
  197. //
  198. if ( FmpInPreferredList( Group, Node, FALSE, NULL ) ) {
  199. return(ERROR_SUCCESS);
  200. }
  201. //
  202. // Create the Preferred Owners List entry.
  203. //
  204. preferredEntry = LocalAlloc( LMEM_FIXED, sizeof(PREFERRED_ENTRY) );
  205. if ( preferredEntry == NULL ) {
  206. ClRtlLogPrint( LOG_CRITICAL,
  207. "[FM] Error allocating preferred owner entry for group %1!ws!. Stopped adding.\n",
  208. OmObjectId(Group));
  209. return(ERROR_NOT_ENOUGH_MEMORY);
  210. }
  211. //
  212. // Create the preferred owner entry and keep a reference on the node object.
  213. //
  214. OmReferenceObject( Node );
  215. preferredEntry->PreferredNode = Node;
  216. InsertTailList( &Group->PreferredOwners,
  217. &preferredEntry->PreferredLinkage );
  218. return(ERROR_SUCCESS);
  219. } // FmpSetPreferredEntry
  220. BOOL FmpFindNodeThatMightBeAddedToPrefList(
  221. IN PFM_GROUP pGroup,
  222. IN PNM_NODE *pDestNode,
  223. IN PVOID pNode,
  224. IN LPCWSTR szName)
  225. {
  226. BOOL bRet = TRUE; //assume we will continue enumeration
  227. *pDestNode = NULL;
  228. //if this node is not up or if this is the local node, continue
  229. if ((pNode == NmLocalNode) || (NmGetNodeState(pNode) != ClusterNodeUp))
  230. {
  231. return(bRet);
  232. }
  233. if (FmpInPreferredList(pGroup, pNode, TRUE, NULL))
  234. {
  235. bRet = FALSE;
  236. *pDestNode = pNode;
  237. }
  238. return(bRet);
  239. }
  240. PNM_NODE
  241. FmpFindAnotherNode(
  242. IN PFM_GROUP Group,
  243. IN BOOL bChooseMostPreferredNode
  244. )
  245. /*++
  246. Routine Description:
  247. Check if another node is up that can take the group.
  248. Arguments:
  249. Group - Pointer to the group object we're checking.
  250. bChooseMostPreferredNode - Whether to choose the most preferred node or not.
  251. Return Value:
  252. Pointer to node object that the group can move to.
  253. NULL if another system is not found.
  254. --*/
  255. {
  256. PLIST_ENTRY listEntry;
  257. PPREFERRED_ENTRY preferredEntry;
  258. PNM_NODE first = NULL;
  259. BOOLEAN flag = FALSE;
  260. //
  261. // First, let us give the anti-affinity algorithm a shot at picking the node.
  262. //
  263. first = FmpGetNodeNotHostingUndesiredGroups ( Group,
  264. TRUE, // Rule out local node
  265. bChooseMostPreferredNode );
  266. if ( first != NULL )
  267. {
  268. goto FnExit;
  269. }
  270. //
  271. // For each entry in the Preferred list, find a system (other than the
  272. // local system that is up).
  273. //
  274. if ( bChooseMostPreferredNode )
  275. {
  276. first = FmpGetNonLocalPreferredNode( Group );
  277. //
  278. // In this case in which you are doing a user-initiated move, give the randomized
  279. // preferred list algorithm a chance to pick the node. Note that if the randomized
  280. // algorithm could not pick a node, it will return the supplied suggested node itself.
  281. //
  282. if ( first != NULL )
  283. {
  284. first = FmpPickNodeFromPreferredListAtRandom ( Group,
  285. first, // Suggested default
  286. TRUE, // Dont choose local node
  287. TRUE ); // Check whether randomization
  288. // should be disabled
  289. }
  290. }
  291. else
  292. {
  293. for ( listEntry = Group->PreferredOwners.Flink;
  294. listEntry != &(Group->PreferredOwners);
  295. listEntry = listEntry->Flink ) {
  296. preferredEntry = CONTAINING_RECORD( listEntry,
  297. PREFERRED_ENTRY,
  298. PreferredLinkage );
  299. if ( (preferredEntry->PreferredNode != NmLocalNode) &&
  300. (NmGetExtendedNodeState(preferredEntry->PreferredNode) == ClusterNodeUp) ) {
  301. if (flag == TRUE)
  302. return(preferredEntry->PreferredNode);
  303. else if (first == NULL)
  304. first = preferredEntry->PreferredNode;
  305. } else if (preferredEntry->PreferredNode == NmLocalNode) {
  306. flag = TRUE;
  307. }
  308. }
  309. }
  310. //if we couldnt find a node, we retry again since the user might have
  311. //expanded the possible node list for resource type since then
  312. //if the group preferred list is not set by the user,
  313. //we recalculate it since it could have
  314. if (first == NULL)
  315. {
  316. LPWSTR lpszOwners = NULL;
  317. DWORD dwMaxSize=0;
  318. HDMKEY hGroupKey;
  319. DWORD dwSize = 0;
  320. DWORD dwStatus;
  321. hGroupKey = DmOpenKey(DmGroupsKey, OmObjectId(Group),
  322. KEY_READ);
  323. if (hGroupKey == NULL)
  324. {
  325. dwStatus = GetLastError();
  326. ClRtlLogPrint(LOG_CRITICAL,
  327. "[FM] FmInPreferredList: Couldnt open group key\r\n",
  328. dwStatus);
  329. CL_UNEXPECTED_ERROR(dwStatus);
  330. goto FnExit;
  331. }
  332. //the group preferred list must not be set by the user
  333. //if it is then there is no point in doing this recalculation
  334. dwStatus = DmQueryMultiSz( hGroupKey,
  335. CLUSREG_NAME_GRP_PREFERRED_OWNERS,
  336. &lpszOwners,
  337. &dwMaxSize,
  338. &dwSize );
  339. if (lpszOwners)
  340. LocalFree(lpszOwners);
  341. DmCloseKey(hGroupKey);
  342. if (dwStatus == ERROR_FILE_NOT_FOUND)
  343. OmEnumObjects(ObjectTypeNode, FmpFindNodeThatMightBeAddedToPrefList,
  344. Group, &first);
  345. }
  346. FnExit:
  347. return(first);
  348. } // FmpFindAnotherNode
  349. PNM_NODE
  350. FmpGetPreferredNode(
  351. IN PFM_GROUP Group
  352. )
  353. /*++
  354. Routine Description:
  355. Find best node that can take the group
  356. Arguments:
  357. Group - Pointer to the group object we're checking.
  358. Return Value:
  359. Pointer to node object that the group can move to.
  360. NULL if another system is not found.
  361. --*/
  362. {
  363. PLIST_ENTRY listEntry;
  364. PPREFERRED_ENTRY preferredEntry;
  365. PNM_NODE pNode = NULL;
  366. //
  367. // First, let us give the anti-affinity algorithm a shot at picking the node.
  368. //
  369. pNode = FmpGetNodeNotHostingUndesiredGroups ( Group,
  370. FALSE, // Don't rule out local node
  371. TRUE ); // Choose preferred node if possible
  372. if ( pNode != NULL )
  373. {
  374. return ( pNode );
  375. }
  376. //
  377. // For each entry in the Preferred list, find a system that is up.
  378. //
  379. for ( listEntry = Group->PreferredOwners.Flink;
  380. listEntry != &(Group->PreferredOwners);
  381. listEntry = listEntry->Flink ) {
  382. preferredEntry = CONTAINING_RECORD( listEntry,
  383. PREFERRED_ENTRY,
  384. PreferredLinkage );
  385. if (NmGetNodeState(preferredEntry->PreferredNode) == ClusterNodeUp ) {
  386. return(preferredEntry->PreferredNode);
  387. }
  388. }
  389. return(NULL);
  390. } // FmpGetPreferredNode
  391. PNM_NODE
  392. FmpGetNonLocalPreferredNode(
  393. IN PFM_GROUP Group
  394. )
  395. /*++
  396. Routine Description:
  397. Find best node that can take the group which is not the local node.
  398. Arguments:
  399. Group - Pointer to the group object we're checking.
  400. Return Value:
  401. Pointer to node object that the group can move to.
  402. NULL if another system is not found.
  403. --*/
  404. {
  405. PLIST_ENTRY listEntry;
  406. PPREFERRED_ENTRY preferredEntry;
  407. //
  408. // For each entry in the Preferred list, find a system (other than the
  409. // local system that is up).
  410. //
  411. for ( listEntry = Group->PreferredOwners.Flink;
  412. listEntry != &(Group->PreferredOwners);
  413. listEntry = listEntry->Flink ) {
  414. preferredEntry = CONTAINING_RECORD( listEntry,
  415. PREFERRED_ENTRY,
  416. PreferredLinkage );
  417. if ( preferredEntry->PreferredNode == NmLocalNode ) {
  418. continue;
  419. }
  420. if (NmGetNodeState(preferredEntry->PreferredNode) == ClusterNodeUp ) {
  421. return(preferredEntry->PreferredNode);
  422. }
  423. }
  424. return(NULL);
  425. } // FmpGetNonLocalPreferredNode
  426. BOOL
  427. FmpIsGroupQuiet(
  428. IN PFM_GROUP Group,
  429. IN CLUSTER_GROUP_STATE WantedState
  430. )
  431. /*++
  432. Routine Description:
  433. Checks if the group has any pending resources.
  434. Arguments:
  435. Group - the Group to check.
  436. WantedState - the state the Group wants to get to.
  437. Return Value:
  438. TRUE - if the Group is not doing anything now.
  439. FALSE otherwise.
  440. --*/
  441. {
  442. DWORD status;
  443. PLIST_ENTRY listEntry;
  444. PFM_RESOURCE Resource;
  445. if ( Group->MovingList ) {
  446. return(FALSE);
  447. }
  448. //
  449. // Check all of the resources contained within this group.
  450. //
  451. for ( listEntry = Group->Contains.Flink;
  452. listEntry != &(Group->Contains);
  453. listEntry = listEntry->Flink ) {
  454. Resource = CONTAINING_RECORD(listEntry, FM_RESOURCE, ContainsLinkage);
  455. switch ( WantedState ) {
  456. case ClusterGroupOnline:
  457. // if resource is pending, then offline pending is bad
  458. if ( Resource->State == ClusterResourceOfflinePending ) {
  459. return(FALSE);
  460. }
  461. break;
  462. case ClusterGroupOffline:
  463. // if resource is pending, then online pending is bad
  464. if ( Resource->State == ClusterResourceOnlinePending ) {
  465. return(FALSE);
  466. }
  467. break;
  468. default:
  469. // any pending state is bad
  470. if ( Resource->State >= ClusterResourcePending ) {
  471. return(FALSE);
  472. }
  473. break;
  474. }
  475. }
  476. return(TRUE);
  477. } // FmpIsGroupQuiet
  478. VOID
  479. FmpSetGroupPersistentState(
  480. IN PFM_GROUP Group,
  481. IN CLUSTER_GROUP_STATE State
  482. )
  483. /*++
  484. Routine Description:
  485. Sets the PersistentState of a Group. This includes the registry.
  486. Arguments:
  487. Group - The Group to set the state for.
  488. State - The new state for the Group.
  489. Returns:
  490. ERROR_SUCCESS if successful.
  491. A Win32 error code on failure.
  492. Notes:
  493. The LocalGroupLock must be held.
  494. --*/
  495. {
  496. DWORD persistentState;
  497. LPWSTR persistentStateName = CLUSREG_NAME_GRP_PERSISTENT_STATE;
  498. if (!gbIsQuoResEnoughSpace)
  499. return;
  500. FmpAcquireLocalGroupLock( Group );
  501. //
  502. // If the current state has changed, then do the work. Otherwise,
  503. // skip the effort.
  504. //
  505. if ( Group->PersistentState != State ) {
  506. Group->PersistentState = State;
  507. CL_ASSERT( Group->RegistryKey != NULL );
  508. //
  509. // Set the new value, but only if it is online or offline.
  510. //
  511. if ( State == ClusterGroupOnline ) {
  512. persistentState = 1;
  513. DmSetValue( Group->RegistryKey,
  514. persistentStateName,
  515. REG_DWORD,
  516. (LPBYTE)&persistentState,
  517. sizeof(DWORD) );
  518. } else if ( State == ClusterGroupOffline ) {
  519. persistentState = 0;
  520. DmSetValue( Group->RegistryKey,
  521. persistentStateName,
  522. REG_DWORD,
  523. (LPBYTE)&persistentState,
  524. sizeof(DWORD) );
  525. }
  526. }
  527. FmpReleaseLocalGroupLock( Group );
  528. } // FmpSetGroupPersistentState
  529. DWORD
  530. FmpOnlineGroup(
  531. IN PFM_GROUP Group,
  532. IN BOOL ForceOnline
  533. )
  534. /*++
  535. Routine Description:
  536. Bring the specified group online. This means bringing all of the
  537. individual resources contained within the group online. This is an
  538. atomic operation - so either all resources contained within the group
  539. are brought online, or none of them are.
  540. Arguments:
  541. Group - Supplies a pointer to the group structure to bring online.
  542. ForceOnline - TRUE if all resources in the Group should be forced online.
  543. Retruns:
  544. ERROR_SUCCESS if the request was successful.
  545. A Win32 error code on failure.
  546. --*/
  547. {
  548. DWORD status, retstatus = ERROR_SUCCESS;
  549. PLIST_ENTRY listEntry;
  550. PFM_RESOURCE Resource;
  551. BOOL bPending = FALSE;
  552. ClRtlLogPrint(LOG_NOISE,
  553. "[FM] OnlineGroup for %1!ws! owner %2!ws!\n",
  554. OmObjectId(Group), OmObjectId(Group->OwnerNode));
  555. FmpAcquireLocalGroupLock( Group );
  556. //
  557. // Check if we are the owner... if not, return failure.
  558. //
  559. if ( gpQuoResource->Group != Group &&
  560. ((Group->OwnerNode != NmLocalNode) ||
  561. !FmpInPreferredList( Group, Group->OwnerNode, TRUE, NULL) ) ) {
  562. FmpReleaseLocalGroupLock( Group );
  563. return(ERROR_HOST_NODE_NOT_RESOURCE_OWNER);
  564. }
  565. //
  566. // Make sure the group is quiet
  567. //
  568. if ( !FmpIsGroupQuiet( Group, ClusterGroupOnline ) ) {
  569. FmpReleaseLocalGroupLock( Group );
  570. return(ERROR_INVALID_STATE);
  571. }
  572. //log an event saying we are trying on online a group
  573. FmpLogGroupInfoEvent1( FM_EVENT_GROUP_START_ONLINE, OmObjectName(Group));
  574. //if the quorum group is in this group bring it online first
  575. //This is called when a node goes down and its groups are
  576. //being reclaimed, the order in which the resoures are brought
  577. //online is important
  578. if ( gpQuoResource->Group == Group)
  579. {
  580. //SS:: if the quorum resource is in the group, it must be
  581. //brought online irrespective of the persistent state
  582. //so we will pass in true here
  583. //Apps can mess with persistent state via the common
  584. //properties and then cause havoc so we need to force the
  585. //quorum resource online despite that
  586. status = FmpDoOnlineResource( gpQuoResource,
  587. TRUE );
  588. if ( (status != ERROR_SUCCESS) &&
  589. (status != ERROR_IO_PENDING) ) {
  590. ClRtlLogPrint(LOG_NOISE,
  591. "[FM] OnlineGroup: Failed on resource %1!ws!. Status %2!u!\n",
  592. OmObjectId(gpQuoResource),
  593. status);
  594. CL_UNEXPECTED_ERROR(status);
  595. }
  596. }
  597. //
  598. // Bring online all of the resources contained within this group.
  599. //
  600. for ( listEntry = Group->Contains.Flink;
  601. listEntry != &(Group->Contains);
  602. listEntry = listEntry->Flink ) {
  603. Resource = CONTAINING_RECORD(listEntry, FM_RESOURCE, ContainsLinkage);
  604. status = FmpDoOnlineResource( Resource,
  605. ForceOnline );
  606. if (status == ERROR_IO_PENDING) {
  607. bPending = TRUE;
  608. }
  609. if ( (status != ERROR_SUCCESS) &&
  610. (status != ERROR_NODE_CANT_HOST_RESOURCE) &&
  611. (status != ERROR_IO_PENDING) ) {
  612. ClRtlLogPrint(LOG_NOISE,
  613. "[FM] OnlineGroup: Failed on resource %1!ws!. Status %2!u!\n",
  614. OmObjectId(Resource),
  615. status);
  616. retstatus = status;
  617. }
  618. }
  619. //
  620. // Normally bringing the resources online propagates the group state,
  621. // but in order to get the state right for a group with no resources,
  622. // manually propagate the state here.
  623. //
  624. FmpPropagateGroupState(Group);
  625. ClRtlLogPrint(LOG_NOISE,
  626. "[FM] OnlineGroup: setting group state to Online for %1!ws!\n",
  627. OmObjectId(Group));
  628. if (retstatus == ERROR_SUCCESS) {
  629. if (bPending)
  630. {
  631. Group->dwStructState |= FM_GROUP_STRUCT_MARKED_FOR_COMPLETION_EVENT;
  632. retstatus = ERROR_IO_PENDING;
  633. }
  634. else
  635. {
  636. //log an event saying we are the onlinegroup request was completed
  637. FmpLogGroupInfoEvent1( FM_EVENT_GROUP_COMPLETE_ONLINE, OmObjectName(Group));
  638. }
  639. }
  640. else
  641. {
  642. //log an event saying we are the onlinegroup request was completed
  643. FmpLogGroupInfoEvent1( FM_EVENT_GROUP_FAILED_ONLINE_OFFLINE, OmObjectName(Group));
  644. }
  645. FmpReleaseLocalGroupLock( Group );
  646. return(retstatus);
  647. } // FmpOnlineGroup
  648. DWORD
  649. FmpOfflineGroup(
  650. IN PFM_GROUP Group,
  651. IN BOOL OfflineQuorum,
  652. IN BOOL SetPersistent
  653. )
  654. /*++
  655. Routine Description:
  656. Bring the specified group offline. This means bringing all of the
  657. individual resources contained within the group offline.
  658. Arguments:
  659. Group - Supplies a pointer to the group structure to bring offline.
  660. OfflineQuorum - TRUE if any quorum resource in this group should
  661. be taken offline. FALSE if the quorum resource should be left online.
  662. SetPersistent - TRUE if the persistent state of each resource should be
  663. updated.
  664. Returns:
  665. ERROR_SUCCESS if the request was successful.
  666. A Win32 error code on failure.
  667. --*/
  668. {
  669. DWORD status;
  670. PLIST_ENTRY listEntry;
  671. PFM_RESOURCE Resource;
  672. DWORD returnStatus = ERROR_SUCCESS;
  673. PRESOURCE_ENUM ResourceEnum=NULL;
  674. DWORD i;
  675. FmpAcquireLocalGroupLock( Group );
  676. //if the group has been marked for delete, then fail this call
  677. if (!IS_VALID_FM_GROUP(Group))
  678. {
  679. FmpReleaseLocalGroupLock( Group);
  680. return (ERROR_GROUP_NOT_AVAILABLE);
  681. }
  682. ClRtlLogPrint(LOG_NOISE,
  683. "[FM] FmpOfflineGroup, Group=%1!ws!\n",
  684. OmObjectId(Group));
  685. //
  686. // Check if we are the owner... if not, return failure.
  687. //
  688. if ( Group->OwnerNode != NmLocalNode ) {
  689. returnStatus = ERROR_HOST_NODE_NOT_RESOURCE_OWNER;
  690. goto error_exit;
  691. }
  692. //
  693. // Make sure the group is quiet
  694. //
  695. if ( !FmpIsGroupQuiet( Group, ClusterGroupOffline ) ) {
  696. returnStatus = ERROR_INVALID_STATE;
  697. goto error_exit;
  698. }
  699. //
  700. // Get the list of resources in the group and their states.
  701. //
  702. returnStatus = FmpGetResourceList( &ResourceEnum, Group );
  703. if ( returnStatus != ERROR_SUCCESS ) {
  704. goto error_exit;
  705. }
  706. //log an event saying we are trying on offline a group
  707. FmpLogGroupInfoEvent1( FM_EVENT_GROUP_START_OFFLINE, OmObjectName(Group));
  708. // offline all resources except the quorum resource
  709. for ( i = 0; i < ResourceEnum->EntryCount; i++ ) {
  710. Resource = OmReferenceObjectById( ObjectTypeResource,
  711. ResourceEnum->Entry[i].Id );
  712. if ( Resource == NULL ) {
  713. returnStatus = ERROR_RESOURCE_NOT_FOUND;
  714. goto error_exit;
  715. }
  716. //quorum resource is brought offline last
  717. if (Resource->QuorumResource)
  718. {
  719. OmDereferenceObject(Resource);
  720. continue;
  721. }
  722. if (SetPersistent) {
  723. FmpSetResourcePersistentState( Resource, ClusterResourceOffline );
  724. }
  725. status = FmpOfflineResource( Resource, FALSE);
  726. OmDereferenceObject( Resource );
  727. if ( (status != ERROR_SUCCESS) &&
  728. (status != ERROR_IO_PENDING) ) {
  729. returnStatus = status;
  730. goto error_exit;
  731. }
  732. if ( status == ERROR_IO_PENDING ) {
  733. returnStatus = ERROR_IO_PENDING;
  734. }
  735. }
  736. // bring the quorum resource offline now, if asked to bring quorum offline
  737. // This allows other resources to come offline and save their checkpoints
  738. // The quorum resource offline should block till the resources have
  739. // finished saving the checkpoint
  740. if (ResourceEnum->ContainsQuorum >= 0)
  741. {
  742. if (!OfflineQuorum)
  743. {
  744. //if the quorum resource should not be taken offline
  745. returnStatus = ERROR_QUORUM_RESOURCE;
  746. }
  747. else if (returnStatus == ERROR_SUCCESS)
  748. {
  749. CL_ASSERT((DWORD)ResourceEnum->ContainsQuorum < ResourceEnum->EntryCount);
  750. Resource = OmReferenceObjectById( ObjectTypeResource,
  751. ResourceEnum->Entry[ResourceEnum->ContainsQuorum].Id );
  752. if ( Resource == NULL ) {
  753. returnStatus = ERROR_RESOURCE_NOT_FOUND;
  754. goto error_exit;
  755. }
  756. ClRtlLogPrint(LOG_NOISE,
  757. "[FM] FmpOfflineGroup: Bring quorum resource offline\n");
  758. if ( !(Resource->Flags & RESOURCE_WAITING) ) {
  759. if (Resource->State != ClusterResourceOffline) {
  760. Resource->State = ClusterResourceOnline; // [HACKHACK]
  761. }
  762. status = FmpOfflineResource( Resource , FALSE);
  763. OmDereferenceObject( Resource );
  764. if ( (status != ERROR_SUCCESS) &&
  765. (status != ERROR_IO_PENDING) )
  766. {
  767. returnStatus = status;
  768. goto error_exit;
  769. }
  770. if ( status == ERROR_IO_PENDING )
  771. returnStatus = ERROR_IO_PENDING;
  772. } else {
  773. OmDereferenceObject( Resource );
  774. }
  775. }
  776. }
  777. //
  778. // Normally bringing the resources offline propagates the group state,
  779. // but in order to get the state right for a group with no resources,
  780. // manually propagate the state here.
  781. //
  782. if (SetPersistent)
  783. FmpPropagateGroupState(Group);
  784. error_exit:
  785. if (returnStatus == ERROR_SUCCESS)
  786. {
  787. FmpLogGroupInfoEvent1( FM_EVENT_GROUP_COMPLETE_OFFLINE, OmObjectName(Group));
  788. }
  789. else if (returnStatus == ERROR_IO_PENDING)
  790. {
  791. Group->dwStructState |= FM_GROUP_STRUCT_MARKED_FOR_COMPLETION_EVENT;
  792. }
  793. else
  794. {
  795. //log an event saying that we failed to offline the group
  796. FmpLogGroupInfoEvent1( FM_EVENT_GROUP_FAILED_ONLINE_OFFLINE, OmObjectName(Group));
  797. }
  798. FmpReleaseLocalGroupLock( Group );
  799. if (ResourceEnum)
  800. FmpDeleteResourceEnum( ResourceEnum );
  801. return(returnStatus);
  802. } // FmpOfflineGroup
  803. CLUSTER_GROUP_STATE
  804. FmpGetGroupState(
  805. IN PFM_GROUP Group,
  806. IN BOOL IsNormalized
  807. )
  808. /*++
  809. Routine Description:
  810. Get the Group state, either normalized to ClusterGroupOnline or
  811. ClusterGroupOffline or not normalized.
  812. Arguments:
  813. Group - The Group we're interested in.
  814. IsNormalized - Should the Group state be normalized ?
  815. Returns:
  816. The current Group state which is one of (in increasing order of
  817. precedence)
  818. ClusterGroupOnline, ClusterGroupOffline
  819. ClusterGroupPartialOnline
  820. ClusterGroupPending (only if IsNormalized is FALSE)
  821. ClusterGroupFailed (only if IsNormalized is FALSE)
  822. --*/
  823. {
  824. PLIST_ENTRY listEntry;
  825. PFM_RESOURCE resource;
  826. CLUSTER_GROUP_STATE state;
  827. CLUSTER_RESOURCE_STATE firstResourceState;
  828. CLUSTER_RESOURCE_STATE resourceState;
  829. // Chittur Subbaraman (chitturs) - 09/16/98 (Modified this function
  830. // to work with IsNormalized flag)
  831. FmpAcquireLocalGroupLock( Group );
  832. if ( !IsListEmpty(&Group->Contains) ) {
  833. listEntry = Group->Contains.Flink;
  834. resource = CONTAINING_RECORD(listEntry,
  835. FM_RESOURCE,
  836. ContainsLinkage);
  837. //
  838. // Get the first resource's state
  839. //
  840. firstResourceState = resource->State;
  841. if ( IsNormalized == FALSE ) {
  842. BOOL IsPending = FALSE;
  843. BOOL IsPartialOnline = FALSE;
  844. //
  845. // First check whether any resource in the group has
  846. // failed. If so, set the group state to ClusterGroupFailed
  847. // and exit immediately. If no resource in the group has
  848. // failed, but at least one of them is in the pending state,
  849. // then set the group state to ClusterGroupPending and exit
  850. // immediately. If no resource in the group is in either
  851. // the failed or in the pending state, then check whether
  852. // some resources in the group are in online and some in the
  853. // offline state. Then, set the group state to
  854. // ClusterGroupPartialOnline and exit immediately.
  855. //
  856. for ( ;
  857. listEntry != &(Group->Contains);
  858. listEntry = listEntry->Flink ) {
  859. resource = CONTAINING_RECORD(listEntry,
  860. FM_RESOURCE,
  861. ContainsLinkage);
  862. resourceState = resource->State;
  863. if ( resourceState == ClusterResourceFailed ) {
  864. state = ClusterGroupFailed;
  865. //
  866. // This state has the highest precedence, so
  867. // exit immediately.
  868. //
  869. goto FnExit;
  870. } else if ( (resourceState == ClusterResourceOnlinePending) ||
  871. (resourceState == ClusterResourceOfflinePending) ) {
  872. IsPending = TRUE;
  873. } else {
  874. CL_ASSERT( (resourceState == ClusterResourceOffline) ||
  875. (resourceState == ClusterResourceOnline) ||
  876. (resourceState == ClusterResourceInitializing) );
  877. if ( resourceState == ClusterResourceInitializing ) {
  878. //
  879. // Normalize this state to offline state
  880. //
  881. resourceState = ClusterResourceOffline;
  882. }
  883. if ( firstResourceState == ClusterResourceInitializing ) {
  884. //
  885. // Normalize this state to offline state
  886. //
  887. firstResourceState = ClusterResourceOffline;
  888. }
  889. if ( firstResourceState != resourceState ) {
  890. IsPartialOnline = TRUE;
  891. }
  892. }
  893. }
  894. if ( IsPending == TRUE ) {
  895. state = ClusterGroupPending;
  896. //
  897. // This state has the next highest precedence after
  898. // ClusterGroupFailed state
  899. //
  900. goto FnExit;
  901. }
  902. if ( IsPartialOnline == TRUE ) {
  903. state = ClusterGroupPartialOnline;
  904. //
  905. // This state has the next highest precedence after
  906. // ClusterGroupFailed and ClusterGroupPending states
  907. //
  908. goto FnExit;
  909. }
  910. if ( firstResourceState == ClusterResourceOnline ) {
  911. state = ClusterGroupOnline;
  912. //
  913. // If the first resource is in an online state,
  914. // then the group state should be online.
  915. //
  916. goto FnExit;
  917. }
  918. if ( firstResourceState == ClusterResourceOffline ) {
  919. state = ClusterGroupOffline;
  920. //
  921. // If the first resource is in an offline state,
  922. // then the group state should be offline.
  923. //
  924. goto FnExit;
  925. }
  926. }
  927. //
  928. // The control gets here only if IsNormalized is TRUE
  929. //
  930. if ( (firstResourceState == ClusterResourceOnline) ||
  931. (firstResourceState == ClusterResourceOnlinePending) ) {
  932. state = ClusterGroupOnline;
  933. firstResourceState = ClusterResourceOnline;
  934. } else {
  935. CL_ASSERT( (firstResourceState == ClusterResourceOffline) ||
  936. (firstResourceState == ClusterResourceFailed) ||
  937. (firstResourceState == ClusterResourceOfflinePending) ||
  938. (firstResourceState == ClusterResourceInitializing) );
  939. state = ClusterGroupOffline;
  940. firstResourceState = ClusterResourceOffline;
  941. }
  942. //
  943. // Now check each resource to see if they match the first.
  944. //
  945. for (listEntry = Group->Contains.Flink;
  946. listEntry != &(Group->Contains);
  947. listEntry = listEntry->Flink ) {
  948. resource = CONTAINING_RECORD(listEntry,
  949. FM_RESOURCE,
  950. ContainsLinkage);
  951. resourceState = resource->State;
  952. //
  953. // Normalize pending states to their final state, and Failed and Initializing
  954. // to Offline.
  955. //
  956. if ( resourceState == ClusterResourceOnlinePending ) {
  957. resourceState = ClusterResourceOnline;
  958. } else if ( (resourceState == ClusterResourceOfflinePending) ||
  959. (resourceState == ClusterResourceFailed) ||
  960. (resourceState == ClusterResourceInitializing) ) {
  961. resourceState = ClusterResourceOffline;
  962. }
  963. //
  964. // We only need 1 resource that is not the same as the first resource
  965. // to be in a partially online state.
  966. //
  967. if ( firstResourceState != resourceState ) {
  968. state = ClusterGroupPartialOnline;
  969. break;
  970. }
  971. }
  972. } else {
  973. //
  974. // The group is empty, so I guess it must be offline.
  975. //
  976. state = Group->PersistentState;
  977. }
  978. FnExit:
  979. FmpReleaseLocalGroupLock( Group );
  980. return(state);
  981. } // FmpGetGroupState
  982. DWORD
  983. FmpPropagateGroupState(
  984. IN PFM_GROUP Group
  985. )
  986. /*++
  987. Routine Description:
  988. Set and propagate the state of the group to other components on the
  989. local system and to other systems in the cluster.
  990. Arguments:
  991. Group - The Group to propagate the state.
  992. Return:
  993. ERROR_SUCCESS if successful.
  994. A Win32 error code on failure.
  995. Notes:
  996. We will use the first resource's state to determine what should be the
  997. state for the whole group. If all resources match the state of the first
  998. resource, then that is the state of the Group. If any resource disagrees
  999. with the first resource, then the state is PartialOnline.
  1000. --*/
  1001. {
  1002. GUM_GROUP_STATE groupState;
  1003. LPCWSTR groupId;
  1004. DWORD groupIdSize;
  1005. DWORD status;
  1006. PLIST_ENTRY listEntry;
  1007. CLUSTER_RESOURCE_STATE firstResourceState;
  1008. CLUSTER_GROUP_STATE state;
  1009. FmpAcquireLocalGroupLock( Group );
  1010. //
  1011. // If we no longer own the Group, then just return now.
  1012. //
  1013. // This can happen when a resource goes offline (via a terminate), but
  1014. // the group ownership has already migrated to another system.
  1015. // We will assume that returning success is okay in this case.
  1016. //
  1017. if ( Group->OwnerNode != NmLocalNode ) {
  1018. FmpReleaseLocalGroupLock( Group );
  1019. return(ERROR_SUCCESS);
  1020. }
  1021. //
  1022. // Chittur Subbaraman (chitturs) - 6/28/99
  1023. //
  1024. // If the group is marked for deletion, then don't do anything.
  1025. //
  1026. if ( !IS_VALID_FM_GROUP( Group ) ) {
  1027. FmpReleaseLocalGroupLock( Group );
  1028. return(ERROR_SUCCESS);
  1029. }
  1030. state = FmpGetGroupState( Group, TRUE );
  1031. //
  1032. // If the state has changed, then update the local system.
  1033. //
  1034. ++Group->StateSequence;
  1035. if ( state != Group->State ) {
  1036. Group->State = state;
  1037. switch ( state ) {
  1038. case ClusterGroupOnline:
  1039. case ClusterGroupPartialOnline:
  1040. ClusterEvent(CLUSTER_EVENT_GROUP_ONLINE, Group);
  1041. break;
  1042. case ClusterGroupOffline:
  1043. case ClusterGroupFailed:
  1044. ClusterEvent(CLUSTER_EVENT_GROUP_OFFLINE, Group);
  1045. break;
  1046. default:
  1047. break;
  1048. }
  1049. //
  1050. // Prepare to notify the other systems.
  1051. //
  1052. groupId = OmObjectId( Group );
  1053. groupIdSize = (lstrlenW( groupId ) + 1) * sizeof(WCHAR);
  1054. //
  1055. // Set Group state
  1056. //
  1057. groupState.State = state;
  1058. groupState.PersistentState = Group->PersistentState;
  1059. groupState.StateSequence = Group->StateSequence;
  1060. status = GumSendUpdateEx(GumUpdateFailoverManager,
  1061. FmUpdateGroupState,
  1062. 3,
  1063. groupIdSize,
  1064. groupId,
  1065. (lstrlenW(OmObjectId(NmLocalNode))+1)*sizeof(WCHAR),
  1066. OmObjectId(NmLocalNode),
  1067. sizeof(groupState),
  1068. &groupState);
  1069. ClRtlLogPrint(LOG_NOISE,
  1070. "[FM] FmpPropagateGroupState: Group %1!ws! state = %2!u!, persistent state = %3!u!\n",
  1071. OmObjectId(Group),
  1072. groupState.State,
  1073. groupState.PersistentState);
  1074. } else {
  1075. //
  1076. // Assume that the state didn't change, but the owning node did.
  1077. //
  1078. //
  1079. // Prepare to notify the other systems.
  1080. //
  1081. groupId = OmObjectId( Group );
  1082. groupIdSize = (lstrlenW( groupId ) + 1) * sizeof(WCHAR);
  1083. status = GumSendUpdateEx(GumUpdateFailoverManager,
  1084. FmUpdateGroupNode,
  1085. 2,
  1086. groupIdSize,
  1087. groupId,
  1088. (lstrlenW(OmObjectId(NmLocalNode))+1)*sizeof(WCHAR),
  1089. OmObjectId(NmLocalNode));
  1090. }
  1091. FmpReleaseLocalGroupLock( Group );
  1092. return(status);
  1093. } // FmpPropagateGroupState
  1094. DWORD
  1095. FmpPropagateFailureCount(
  1096. IN PFM_GROUP Group,
  1097. IN BOOL NewTime
  1098. )
  1099. /*++
  1100. Routine Description:
  1101. Propagate NumberOfFailures for the group to other systems in the cluster.
  1102. Arguments:
  1103. Group - The Group to propagate the state.
  1104. NewTime - TRUE if last failure time should be reset also. FALSE otherwise.
  1105. Return:
  1106. ERROR_SUCCESS if successful.
  1107. A Win32 error code on failure.
  1108. Notes:
  1109. The Local Group lock must be held.
  1110. --*/
  1111. {
  1112. PGUM_FAILURE_COUNT failureCount;
  1113. DWORD failureCountSize;
  1114. LPCWSTR groupId;
  1115. DWORD status;
  1116. //
  1117. // Prepare to notify the other systems.
  1118. //
  1119. groupId = OmObjectId( Group );
  1120. failureCountSize = sizeof(GUM_FAILURE_COUNT) - 1 +
  1121. ((lstrlenW(groupId) + 1) * sizeof(WCHAR));
  1122. failureCount = LocalAlloc(LMEM_FIXED, failureCountSize);
  1123. if ( failureCount == NULL ) {
  1124. return(ERROR_NOT_ENOUGH_MEMORY);
  1125. }
  1126. failureCount->Count = Group->NumberOfFailures;
  1127. failureCount->NewTime = (DWORD)NewTime;
  1128. wcscpy(&failureCount->GroupId[0], groupId);
  1129. status = GumSendUpdate( GumUpdateFailoverManager,
  1130. FmUpdateFailureCount,
  1131. failureCountSize,
  1132. failureCount );
  1133. LocalFree( failureCount );
  1134. return(status);
  1135. } // FmpPropagateFailureCount
  1136. PFM_GROUP
  1137. FmpCreateGroup(
  1138. IN LPWSTR GroupId,
  1139. IN BOOL Initialize
  1140. )
  1141. /*++
  1142. Routine Description:
  1143. Creates a new Group object.
  1144. Arguments:
  1145. GroupId - The Id of the new Group.
  1146. Initialize - TRUE if the Group should be initialized, FALSE otherwise.
  1147. Returns:
  1148. A non-NULL pointer to the Group if successful.
  1149. NULL - The Group could not be created.
  1150. Notes:
  1151. 1) Passing Initialize as FALSE allows for creating the group and it
  1152. resources, but complete initialization can happen later.
  1153. 2) The Group List lock must be held.
  1154. 3) If the Group is created, the reference count on the object is 1. If
  1155. the group is not create (i.e., it already exists) then the reference count
  1156. is not incremented and the caller may add a reference as needed.
  1157. --*/
  1158. {
  1159. PFM_GROUP group = NULL;
  1160. DWORD status = ERROR_SUCCESS;
  1161. BOOL Created;
  1162. //
  1163. // Open an existing group or create a new one.
  1164. //
  1165. group = OmCreateObject( ObjectTypeGroup,
  1166. GroupId,
  1167. NULL,
  1168. &Created);
  1169. if (group == NULL) {
  1170. return(NULL);
  1171. }
  1172. if (!Created) {
  1173. ClRtlLogPrint(LOG_NOISE,
  1174. "[FM] Opened existing group %1!ws!\n",
  1175. GroupId);
  1176. //this is the quorum group being recreated again,
  1177. if ((!FmpFMOnline) && (group->RegistryKey == NULL))
  1178. {
  1179. status = FmpInitializeGroup(group, Initialize);
  1180. }
  1181. OmDereferenceObject( group );
  1182. goto FnExit;
  1183. }
  1184. else
  1185. {
  1186. ClRtlLogPrint(LOG_NOISE,
  1187. "[FM] Creating group %1!ws!\n",
  1188. GroupId);
  1189. group->State = ClusterGroupOffline;
  1190. InitializeCriticalSection( &group->Lock );
  1191. group->dwStructState = FM_GROUP_STRUCT_CREATED;
  1192. //
  1193. // Insert the group into its list.
  1194. //
  1195. status = FmpInitializeGroup( group , Initialize);
  1196. if ( status != ERROR_SUCCESS ) {
  1197. goto FnExit;
  1198. }
  1199. //
  1200. // Insert the group into its list.
  1201. //
  1202. status = OmInsertObject( group );
  1203. if ( status != ERROR_SUCCESS ) {
  1204. goto FnExit;
  1205. }
  1206. }
  1207. FnExit:
  1208. if (status != ERROR_SUCCESS)
  1209. {
  1210. FmpAcquireLocalGroupLock( group );
  1211. FmpDestroyGroup( group, FALSE );
  1212. SetLastError(status);
  1213. group = NULL;
  1214. }
  1215. return(group);
  1216. } // FmpCreateGroup
  1217. DWORD FmpInitializeGroup(
  1218. IN PFM_GROUP Group,
  1219. IN BOOL Initialize
  1220. )
  1221. {
  1222. DWORD status;
  1223. //
  1224. // Initialize the Group
  1225. //
  1226. InitializeListHead( &(Group->Contains) );
  1227. InitializeListHead( &(Group->PreferredOwners) );
  1228. InitializeListHead( &(Group->DmRundownList) );
  1229. InitializeListHead( &(Group->WaitQueue) );
  1230. Group->MovingList = NULL;
  1231. //
  1232. // Read the registry information if directed to do so.
  1233. //
  1234. status = FmpQueryGroupInfo( Group, Initialize );
  1235. if ( status != ERROR_SUCCESS ) {
  1236. ClRtlLogPrint(LOG_NOISE,
  1237. "[FM] FmpInitializeGroup: FmpQueryGroupInfo failed, status=%1!u!\n",
  1238. status);
  1239. }
  1240. return(status);
  1241. }
  1242. DWORD
  1243. FmpDestroyGroup(
  1244. IN PFM_GROUP Group,
  1245. IN BOOL bDeleteObjOnly
  1246. )
  1247. /*++
  1248. Routine Description:
  1249. Closes a group.
  1250. First, this routine verifies that all resources contained within
  1251. the Group are closed.
  1252. If the group is online, it is brought offline.
  1253. Note that the group object itself is not dereferenced here. This is
  1254. done so that FmpCleanupGroups can simply enumerate all the groups,
  1255. destroying each one in turn. This approach means a group may be
  1256. destroyed multiple times if there are outstanding references to it, but
  1257. that is not a problem since no work will be done on subsequent calls.
  1258. IF bDeleteObjOnly is TRUE, then the resource monitor is not invoked and
  1259. group state is not touched.
  1260. Arguments:
  1261. FoundGroup - Returns the found group.
  1262. Group - Supplies the current group.
  1263. Name - Supplies the current group's name.
  1264. Return Value:
  1265. TRUE - to continue searching
  1266. FALSE - to stop the search. The matching group is returned in
  1267. *FoundGroup
  1268. Notes:
  1269. The LocalGroupLock MUST be held! This routine will release that lock
  1270. as part of cleanup.
  1271. --*/
  1272. {
  1273. PLIST_ENTRY listEntry;
  1274. PFM_RESOURCE Resource;
  1275. PPREFERRED_ENTRY preferredEntry;
  1276. DWORD status = ERROR_SUCCESS;
  1277. ClRtlLogPrint(LOG_NOISE,
  1278. "[FM] DestroyGroup: destroying %1!ws!\n",
  1279. OmObjectId(Group));
  1280. //
  1281. // Make sure there are no resources in the Group.
  1282. //
  1283. for ( listEntry = Group->Contains.Flink;
  1284. listEntry != &(Group->Contains);
  1285. ) {
  1286. Resource = CONTAINING_RECORD(listEntry, FM_RESOURCE, ContainsLinkage);
  1287. listEntry = listEntry->Flink;
  1288. RemoveEntryList( &Resource->ContainsLinkage );
  1289. //dereference for removing from the contains list
  1290. OmDereferenceObject( Resource );
  1291. FmpAcquireLocalResourceLock( Resource );
  1292. if (!bDeleteObjOnly)
  1293. Resource->QuorumResource = FALSE;
  1294. FmpDestroyResource( Resource, bDeleteObjOnly );
  1295. //the reference count on the group wrt to being
  1296. //referenced by the resource is handled in FmpDestroyResource
  1297. }
  1298. CL_ASSERT(IsListEmpty(&Group->Contains));
  1299. //
  1300. //
  1301. // Make sure the preferred owners list is drained.
  1302. //
  1303. while ( !IsListEmpty( &Group->PreferredOwners ) ) {
  1304. listEntry = RemoveHeadList(&Group->PreferredOwners);
  1305. preferredEntry = CONTAINING_RECORD( listEntry,
  1306. PREFERRED_ENTRY,
  1307. PreferredLinkage );
  1308. OmDereferenceObject( preferredEntry->PreferredNode );
  1309. LocalFree( preferredEntry );
  1310. }
  1311. //
  1312. // Now that there are no remaining resources in this group
  1313. // we're done, so remove it from it's object type list.
  1314. //
  1315. status = OmRemoveObject( Group );
  1316. //
  1317. // Close the Group's registry key.
  1318. //
  1319. DmRundownList( &Group->DmRundownList );
  1320. if ( Group->RegistryKey != NULL ) {
  1321. DmCloseKey( Group->RegistryKey );
  1322. Group->RegistryKey = NULL;
  1323. Group->Initialized = FALSE;
  1324. }
  1325. //
  1326. // We must release the lock prior to the dereference, in case this is
  1327. // the last dereference of the object!
  1328. //
  1329. FmpReleaseLocalGroupLock( Group );
  1330. ClRtlLogPrint(LOG_NOISE,
  1331. "[FM] FmpDestroyGroup: Group %1!ws! destroyed.\n",
  1332. OmObjectId(Group));
  1333. OmDereferenceObject( Group );
  1334. return(status);
  1335. } // FmpDestroyGroup
  1336. ///////////////////////////////////////////////////////////////////////////
  1337. //
  1338. // Initialization/Cleanup Routines
  1339. //
  1340. ///////////////////////////////////////////////////////////////////////////
  1341. DWORD
  1342. FmpInitGroups(
  1343. IN BOOL Initialize
  1344. )
  1345. /*++
  1346. Routine Description:
  1347. Processes the Cluster group list in the registry. For each
  1348. group key found, a cluster group is created.
  1349. Arguments:
  1350. Initialize - TRUE if resources should be initialized. FALSE otherwise.
  1351. Return Value:
  1352. ERROR_SUCCESS if successful.
  1353. A Win32 error code on failure.
  1354. --*/
  1355. {
  1356. DWORD status;
  1357. DWORD keyIndex = 0;
  1358. LPWSTR groupId = NULL;
  1359. DWORD groupIdMaxSize = 0;
  1360. PFM_GROUP ignored;
  1361. ClRtlLogPrint(LOG_NOISE,"[FM] Processing groups list.\n");
  1362. FmpAcquireGroupLock();
  1363. //
  1364. // Enumerate the subkeys. Each subkey name corresponds to a group name.
  1365. //
  1366. for (keyIndex = 0; ; keyIndex++) {
  1367. status = FmpRegEnumerateKey( DmGroupsKey,
  1368. keyIndex,
  1369. &groupId,
  1370. &groupIdMaxSize
  1371. );
  1372. if (status == NO_ERROR) {
  1373. ignored = FmpCreateGroup( groupId,
  1374. Initialize );
  1375. continue;
  1376. }
  1377. if (status == ERROR_NO_MORE_ITEMS) {
  1378. status = NO_ERROR;
  1379. } else {
  1380. ClRtlLogPrint(LOG_NOISE,"[FM] EnumGroup error %1!u!\n", status);
  1381. }
  1382. break;
  1383. }
  1384. FmpReleaseGroupLock();
  1385. ClRtlLogPrint(LOG_NOISE,"[FM] All groups created.\n");
  1386. if (groupId != NULL) {
  1387. LocalFree(groupId);
  1388. }
  1389. return(status);
  1390. } // FmpInitGroups
  1391. DWORD
  1392. FmpCompleteInitGroup(
  1393. IN PFM_GROUP Group
  1394. )
  1395. /*++
  1396. Routine Description:
  1397. Finish initialization of all resources within the group.
  1398. Arguments:
  1399. Group - The group to finish initializing.
  1400. Return Value:
  1401. ERROR_SUCCESS if successful.
  1402. A Win32 error code on failure.
  1403. --*/
  1404. {
  1405. PLIST_ENTRY listEntry;
  1406. PFM_RESOURCE Resource;
  1407. FmpAcquireLocalGroupLock(Group);
  1408. //
  1409. // For each resource in the Group, make sure that it has been fully
  1410. // initialized.
  1411. //
  1412. for ( listEntry = Group->Contains.Flink;
  1413. listEntry != &(Group->Contains);
  1414. listEntry = listEntry->Flink ) {
  1415. Resource = CONTAINING_RECORD(listEntry, FM_RESOURCE, ContainsLinkage);
  1416. FmpInitializeResource( Resource, TRUE );
  1417. }
  1418. FmpReleaseLocalGroupLock(Group);
  1419. return(ERROR_SUCCESS);
  1420. } // FmpCompleteInitGroup
  1421. DWORD
  1422. FmpCleanupGroupsWorker(
  1423. IN PFM_CLEANUP_INFO pFmCleanupInfo
  1424. )
  1425. /*++
  1426. Routine Description:
  1427. This routine walks through an enumerated list of all the groups
  1428. owned by the local node and tries to shut them down cleanly.
  1429. In the first phase it tries to bring
  1430. all resources offline except the quorum one.
  1431. In the second phase it waits for the group to reach stable state
  1432. and then move it. It tries to bring the quorum resource offline as
  1433. well by moving the quorum group.
  1434. Arguments:
  1435. pFmCleanupInfo - ptr to a strucuture containing the groups to be
  1436. offlined/moved and the timelimit in which to do so.
  1437. Returns:
  1438. None.
  1439. Assumptions:
  1440. --*/
  1441. {
  1442. DWORD Status = ERROR_SUCCESS;
  1443. DWORD i;
  1444. PFM_GROUP pGroup;
  1445. PGROUP_ENUM pGroupEnum;
  1446. BOOL bContainsQuorumGroup;
  1447. BOOL bQuorumGroup = FALSE;
  1448. DWORD CleanupStatus = ERROR_SUCCESS;
  1449. ClRtlLogPrint(LOG_NOISE,
  1450. "[FM] FmpCleanupGroupsWorker: Entry\r\n");
  1451. //
  1452. // This is done in two passes. In the first pass, we offline/move all
  1453. // resources except the quorum resource. In the second pass, we offline/move
  1454. // everything and then destroy the group. This allows resources that are
  1455. // being shutdown to write to the registry and have the updates logged to
  1456. // the quorum disk.
  1457. //
  1458. pGroupEnum = pFmCleanupInfo->pGroupEnum;
  1459. bContainsQuorumGroup = pFmCleanupInfo->bContainsQuorumGroup;
  1460. // Now offline all of the non-quorum resources...
  1461. // but don't wait for them to finish. I.E. get as much work done as
  1462. // possible as fast as possible.
  1463. //
  1464. for ( i = 0; i < pGroupEnum->EntryCount; i++ )
  1465. {
  1466. pGroup = OmReferenceObjectById( ObjectTypeGroup,
  1467. pGroupEnum->Entry[i].Id );
  1468. //try and offline all resources except the quorum
  1469. //resource
  1470. Status = FmpCleanupGroupPhase1(pGroup, pFmCleanupInfo->dwTimeOut);
  1471. if ((Status != ERROR_IO_PENDING) && (Status != ERROR_SUCCESS) &&
  1472. (Status != ERROR_QUORUM_RESOURCE))
  1473. CleanupStatus = Status;
  1474. OmDereferenceObject(pGroup);
  1475. }
  1476. //this finishes the second phase of the cleanup on shutdown
  1477. //if the quorum group is in this list, skip it and process it
  1478. //at the end
  1479. if (CleanupStatus == ERROR_SUCCESS)
  1480. {
  1481. for ( i = 0; i < pGroupEnum->EntryCount; i++ )
  1482. {
  1483. pGroup = OmReferenceObjectById( ObjectTypeGroup,
  1484. pGroupEnum->Entry[i].Id );
  1485. if (gpQuoResource->Group == pGroup)
  1486. {
  1487. ClRtlLogPrint(LOG_NOISE,
  1488. "[FM] FmpCleanupGroupsWorker: Quorum group belongs to this node, process phase 2 later\r\n");
  1489. bQuorumGroup = TRUE;
  1490. OmDereferenceObject(pGroup);
  1491. continue;
  1492. }
  1493. //try and offline all groups, including the quorum resource
  1494. //also try and move the resource to other nodes
  1495. Status = FmpCleanupGroupPhase2(pGroup);
  1496. OmDereferenceObject(pGroup);
  1497. }
  1498. if (bQuorumGroup)
  1499. Status = FmpCleanupGroupPhase2(gpQuoResource->Group);
  1500. }
  1501. else
  1502. {
  1503. //phase 1 didnt work for some reason
  1504. //try and offline the quorum resource alone.
  1505. //TODO::Should we also terminate all resources
  1506. // No way to terminate services ???
  1507. if (bContainsQuorumGroup)
  1508. FmpCleanupQuorumResource(gpQuoResource);
  1509. }
  1510. return(Status);
  1511. } // FmpCleanupGroupsWorker
  1512. DWORD
  1513. FmpCleanupGroupPhase1(
  1514. IN PFM_GROUP Group,
  1515. IN DWORD dwTimeOut
  1516. )
  1517. /*++
  1518. Routine Description:
  1519. This routine is the first phase for clean up all groups owned by the node
  1520. on shutdown.
  1521. In this phase, we try and bring all resources offline except the quorum
  1522. resource. In this phase we dont block for the resources to reach a stable
  1523. state
  1524. We give the group the shutdown timeout specified for the cluster
  1525. to reach a stable state before we try to offline it. If it doesnt
  1526. reach a stable state in this period then we shut it down abruptly.
  1527. Arguments:
  1528. Group - The Group to offline.
  1529. Returns:
  1530. ERROR_SUCCESS if successful.
  1531. A Win32 error code on failure.
  1532. --*/
  1533. {
  1534. DWORD Status = ERROR_SUCCESS;
  1535. DWORD dwRetryCount = (2 * dwTimeOut)/1000;//we check after every 1/2 sec
  1536. ClRtlLogPrint(LOG_NOISE,
  1537. "[FM] FmpCleanupGroupsPhase1: Entry, Group = %1!ws!\r\n",
  1538. OmObjectId(Group));
  1539. ChkGroupState:
  1540. FmpAcquireLocalGroupLock( Group );
  1541. //
  1542. // Just offline the group
  1543. //
  1544. if ( Group->OwnerNode == NmLocalNode )
  1545. {
  1546. //
  1547. // Make sure the group is quiet
  1548. //
  1549. if ( !FmpIsGroupQuiet( Group, ClusterGroupOffline ) )
  1550. {
  1551. FmpReleaseLocalGroupLock( Group );
  1552. ClRtlLogPrint(LOG_NOISE,
  1553. "[FM] FmpCleanupGroupsPhase1: Group is not quiet, wait\r\n");
  1554. //we give it a minute to recover totally
  1555. Sleep(500);
  1556. if (dwRetryCount--)
  1557. goto ChkGroupState;
  1558. else
  1559. {
  1560. Status = ERROR_REQUEST_ABORTED;
  1561. goto FnExit;
  1562. }
  1563. }
  1564. //
  1565. // Notify the group's resources that the cluster service is shutting down. This must be
  1566. // done BEFORE any of the resources are brought offline.
  1567. //
  1568. FmpNotifyGroupStateChangeReason( Group, eResourceStateChangeReasonShutdown );
  1569. //
  1570. // Just take the group offline. Don't wait, don't pass go...
  1571. //
  1572. // Dont take the quorum resource offline in phase 1
  1573. // The quorum resource must be the last one to be taken offline
  1574. Status = FmpOfflineGroup(Group, FALSE, FALSE);
  1575. }
  1576. FmpReleaseLocalGroupLock( Group );
  1577. FnExit:
  1578. ClRtlLogPrint(LOG_NOISE,
  1579. "[FM] FmpCleanupGroupsPhase1: Exit, status=%1!u!\r\n",
  1580. Status);
  1581. return(Status);
  1582. } // FmpCleanupGroupsPhase1
  1583. DWORD
  1584. FmpCleanupGroupPhase2(
  1585. IN PFM_GROUP Group
  1586. )
  1587. /*++
  1588. Routine Description:
  1589. This routine is the second phase for clean up all groups owned by the node
  1590. on shutdown.
  1591. In this phase, we try and bring all resources offline including the quorum
  1592. resource. We also try to move the quorum resource
  1593. We give the group 10 seconds to reach a stable state before we try to
  1594. move it.
  1595. Arguments:
  1596. Group - The Group to offline.
  1597. Returns:
  1598. ERROR_SUCCESS if successful.
  1599. A Win32 error code on failure.
  1600. --*/
  1601. {
  1602. DWORD Status = ERROR_SUCCESS;
  1603. DWORD dwRetryCount= 120 * 12;
  1604. ClRtlLogPrint(LOG_NOISE,
  1605. "[FM] FmpCleanupGroupsPhase2: Entry, Group = %1!ws!\r\n",
  1606. OmObjectId(Group));
  1607. FmpAcquireLocalGroupLock( Group );
  1608. //
  1609. // Try to move the Group before destroying it if we own it.
  1610. //
  1611. if ( Group->OwnerNode == NmLocalNode )
  1612. {
  1613. //
  1614. // First make sure the group is really offline.
  1615. // In phase 1 we began the offline process... we need to check it here.
  1616. //
  1617. WaitSomeMore:
  1618. //
  1619. // [GorN] [10/05/1999]
  1620. // We need to wait for the quorum to go offline, otherwise
  1621. // the surviving node will not be able to arbitrate.
  1622. //
  1623. // FmpWaitForGroup keeps issuing RmOffline for the quorum,
  1624. // resrcmon returns ERROR_INVALID_STATE, for the second offline,
  1625. // since offline is already in progress.
  1626. //
  1627. // This causes us to break out of this look while the quorum resource
  1628. // is still being offline.
  1629. //
  1630. // [HACKHACK] The following fix for the problem is a hack.
  1631. // It would be better either to make resmon return IO_PENDING when
  1632. // somebody is trying to offline the resource that is in offline pending
  1633. //
  1634. // Or not to call FmRmOffline the second time in FM.
  1635. //
  1636. Status = FmpOfflineGroup(Group, TRUE, FALSE);
  1637. if (Status == ERROR_IO_PENDING ||
  1638. (Status == ERROR_INVALID_STATE
  1639. && Group == gpQuoResource->Group) )
  1640. {
  1641. //FmpWaitForGroup() will release the lock
  1642. Status = FmpWaitForGroup(Group);
  1643. ClRtlLogPrint(LOG_NOISE,
  1644. "[FM] FmpCleanupGroupsPhase2: Sleep and retry\r\n");
  1645. Sleep(2*1000);
  1646. //Reacquire the group lock and check if the group is offline
  1647. FmpAcquireLocalGroupLock(Group);
  1648. if (dwRetryCount--)
  1649. goto WaitSomeMore;
  1650. }
  1651. else if (Status != ERROR_SUCCESS)
  1652. {
  1653. goto FnExit;
  1654. }
  1655. else
  1656. {
  1657. // The Move routine frees the LocalGroupLock!
  1658. FmpMoveGroup( Group, NULL, TRUE, NULL, TRUE );
  1659. FmpAcquireLocalGroupLock( Group );
  1660. }
  1661. }
  1662. FnExit:
  1663. FmpReleaseLocalGroupLock(Group);
  1664. ClRtlLogPrint(LOG_NOISE,
  1665. "[FM] FmpCleanupGroupsPhase2: Exit\n");
  1666. return(TRUE);
  1667. } // FmpCleanupGroupsPhase2
  1668. BOOL
  1669. FmpEnumNodeState(
  1670. OUT DWORD *pStatus,
  1671. IN PVOID Context2,
  1672. IN PNM_NODE Node,
  1673. IN LPCWSTR Name
  1674. )
  1675. /*++
  1676. Routine Description:
  1677. Node enumeration callback for FM shutdown. Queries the state
  1678. of other nodes to see if any are up.
  1679. Arguments:
  1680. pStatus - Returns TRUE if other node is up.
  1681. Context2 - Not used
  1682. Node - Supplies the node.
  1683. Name - Supplies the node's name.
  1684. Return Value:
  1685. TRUE - to indicate that the enumeration should continue.
  1686. FALSE - to indicate that the enumeration should not continue.
  1687. --*/
  1688. {
  1689. DWORD Status;
  1690. DWORD NodeId;
  1691. PGROUP_ENUM NodeGroups = NULL;
  1692. PRESOURCE_ENUM NodeResources = NULL;
  1693. DWORD i;
  1694. PFM_GROUP Group;
  1695. PFM_RESOURCE Resource;
  1696. if (Node == NmLocalNode) {
  1697. return(TRUE);
  1698. }
  1699. //
  1700. // Enumerate all other node's group states. This includes all nodes
  1701. // that are up, as well as nodes that are paused.
  1702. //
  1703. if ((NmGetNodeState(Node) == ClusterNodeUp) ||
  1704. (NmGetNodeState(Node) == ClusterNodePaused)){
  1705. *pStatus = TRUE;
  1706. return(FALSE);
  1707. }
  1708. return(TRUE);
  1709. } // FmpEnumNodeState
  1710. VOID
  1711. FmpCleanupGroups(
  1712. IN BOOL ClusterShutDownEvent
  1713. )
  1714. /*++
  1715. Routine Description:
  1716. This routine kicks off the cleanup of the FM layer.
  1717. Arguments:
  1718. None.
  1719. Returns:
  1720. None.
  1721. --*/
  1722. {
  1723. DWORD Status;
  1724. DWORD dwTimeOut;
  1725. DWORD dwDefaultTimeOut;
  1726. HANDLE hCleanupThread;
  1727. DWORD otherNodesUp = FALSE;
  1728. DWORD dwThreadId;
  1729. DWORD i,dwTimeOutCount;
  1730. PGROUP_ENUM pGroupEnum;
  1731. BOOL bQuorumGroup = FALSE;
  1732. PFM_CLEANUP_INFO pFmCleanupInfo;
  1733. ClRtlLogPrint(LOG_NOISE,
  1734. "[FM] FmpCleanupGroups: Entry\r\n");
  1735. //
  1736. // If we don't know the quorum resource or we are not online,
  1737. // then leave immediately
  1738. //
  1739. if ( !gpQuoResource ) {
  1740. goto FnExit;
  1741. }
  1742. ACQUIRE_EXCLUSIVE_LOCK(gQuoChangeLock);
  1743. //if this is called when fmformphaseprocessing is going on
  1744. //then the quorum group doesnt exist, other groups dont exist
  1745. //either
  1746. if (FmpFMFormPhaseProcessing)
  1747. FmpCleanupQuorumResource(gpQuoResource);
  1748. else
  1749. CL_ASSERT(gpQuoResource->Group != NULL)
  1750. RELEASE_LOCK(gQuoChangeLock);
  1751. //
  1752. // Find and sort all known groups, hold the group lock while enumerating
  1753. //
  1754. FmpAcquireGroupLock();
  1755. Status = FmpEnumSortGroups(&pGroupEnum, OmObjectId(NmLocalNode), &bQuorumGroup);
  1756. FmpReleaseGroupLock();
  1757. if (Status != ERROR_SUCCESS) {
  1758. goto FnExit;
  1759. }
  1760. //
  1761. // See if any other node in the cluster is up...
  1762. // If so, we will use the default timeout value.
  1763. // Otherwise, we will use what we believe is a more reasonable time.
  1764. //
  1765. OmEnumObjects( ObjectTypeNode,
  1766. FmpEnumNodeState,
  1767. &otherNodesUp,
  1768. NULL );
  1769. dwDefaultTimeOut = CLUSTER_SHUTDOWN_TIMEOUT * 60; // default timeout (secs)
  1770. switch ( CsShutdownRequest ) {
  1771. case CsShutdownTypeShutdown:
  1772. if ( otherNodesUp ) {
  1773. dwTimeOut = 15; // other node will time us out quickly - say 15 secs
  1774. } else {
  1775. dwTimeOut = 30; // otherwise use 30 seconds
  1776. }
  1777. break;
  1778. default:
  1779. // apply default value to registry
  1780. dwDefaultTimeOut = CLUSTER_SHUTDOWN_TIMEOUT; // default timeout (mins)
  1781. Status = DmQueryDword( DmClusterParametersKey,
  1782. CLUSREG_NAME_CLUS_SHUTDOWN_TIMEOUT,
  1783. &dwTimeOut,
  1784. &dwDefaultTimeOut);
  1785. dwTimeOut *= 60; // convert to secs.
  1786. break;
  1787. }
  1788. //convert to msecs
  1789. dwTimeOut *= 1000;
  1790. pFmCleanupInfo = (PFM_CLEANUP_INFO)LocalAlloc(LMEM_FIXED, sizeof(FM_CLEANUP_INFO));
  1791. if (!pFmCleanupInfo)
  1792. {
  1793. Status = ERROR_NOT_ENOUGH_MEMORY;
  1794. goto FnExit;
  1795. }
  1796. pFmCleanupInfo->pGroupEnum = pGroupEnum;
  1797. pFmCleanupInfo->dwTimeOut = dwTimeOut; //in msecs
  1798. pFmCleanupInfo->bContainsQuorumGroup = bQuorumGroup;
  1799. //
  1800. // Start the worker thread to perform cleanup.
  1801. //
  1802. hCleanupThread = CreateThread( NULL,
  1803. 0,
  1804. FmpCleanupGroupsWorker,
  1805. pFmCleanupInfo,
  1806. 0,
  1807. &dwThreadId );
  1808. if ( hCleanupThread == NULL ) {
  1809. //SS: if we own the quorum resource should we cleanup the quorum resource
  1810. //this will avoid corruption
  1811. if (bQuorumGroup)
  1812. FmpCleanupQuorumResource(gpQuoResource);
  1813. goto FnExit;
  1814. }
  1815. // Rohit (rjain): This path is taken when Cluster Service is shutting
  1816. // down. ServiceStatus checkpoint is incremented after every WaitHint
  1817. // units of time. For this the waiting period of dwTimeOut is divided into
  1818. // multiple waiting periods of dwWaitHint units each.
  1819. if((ClusterShutDownEvent==TRUE) && (dwTimeOut > CsServiceStatus.dwWaitHint))
  1820. {
  1821. dwTimeOutCount=dwTimeOut/((CsServiceStatus.dwWaitHint == 0)?1:CsServiceStatus.dwWaitHint);
  1822. ClRtlLogPrint(LOG_NOISE,
  1823. "[FM] FmpCleanupGroups: dwTimeOut=%1!u! dwTimoutCount=%2!u! waithint =%3!u! \r\n",
  1824. dwTimeOut,dwTimeOutCount, CsServiceStatus.dwWaitHint);
  1825. for(i=0;i<dwTimeOutCount;i++){
  1826. Status = WaitForSingleObject(hCleanupThread, CsServiceStatus.dwWaitHint);
  1827. switch(Status) {
  1828. case WAIT_OBJECT_0:
  1829. //everything is fine
  1830. ClRtlLogPrint(LOG_NOISE,
  1831. "[FM] FmpCleanupGroups: Cleanup thread finished in time\r\n");
  1832. break;
  1833. case WAIT_TIMEOUT:
  1834. //should we terminate the thread
  1835. //try and clean up the quorum resource
  1836. //this will avoid corruption on the quorum disk
  1837. //TODO::Should we also terminate all resources
  1838. // No way to terminate services ???
  1839. if(i == (dwTimeOutCount-1)){
  1840. ClRtlLogPrint(LOG_UNUSUAL,
  1841. "[FM] FmpCleanupGroups: Timed out on the CleanupThread\r\n");
  1842. if (bQuorumGroup)
  1843. FmpCleanupQuorumResource(gpQuoResource);
  1844. }
  1845. break;
  1846. case WAIT_FAILED:
  1847. ClRtlLogPrint(LOG_UNUSUAL,
  1848. "[DM] FmpCleanupGroups: wait on CleanupEvent failed 0x%1!08lx!\r\n",
  1849. GetLastError());
  1850. break;
  1851. }
  1852. if(Status== WAIT_OBJECT_0 || Status==WAIT_FAILED)
  1853. break;
  1854. CsServiceStatus.dwCheckPoint++;
  1855. CsAnnounceServiceStatus();
  1856. }
  1857. goto FnExit;
  1858. }
  1859. //
  1860. // Wait for the thread to complete or a timeout.
  1861. //
  1862. Status = WaitForSingleObject(hCleanupThread, dwTimeOut);
  1863. switch(Status) {
  1864. case WAIT_OBJECT_0:
  1865. //everything is fine
  1866. ClRtlLogPrint(LOG_NOISE,
  1867. "[FM] FmpCleanupGroups: Cleanup thread finished in time\r\n");
  1868. break;
  1869. case WAIT_TIMEOUT:
  1870. //should we terminate the thread
  1871. //try and clean up the quorum resource
  1872. //this will avoid corruption on the quorum disk
  1873. //TODO::Should we also terminate all resources
  1874. // No way to terminate services ???
  1875. ClRtlLogPrint(LOG_UNUSUAL,
  1876. "[FM] FmpCleanupGroups: Timed out on the CleanupThread\r\n");
  1877. if (bQuorumGroup)
  1878. FmpCleanupQuorumResource(gpQuoResource);
  1879. break;
  1880. case WAIT_FAILED:
  1881. ClRtlLogPrint(LOG_UNUSUAL,
  1882. "[DM] FmpCleanupGroups: wait on CleanupEvent failed 0x%1!08lx!\r\n",
  1883. GetLastError());
  1884. break;
  1885. }
  1886. FnExit:
  1887. //SS: dont bother cleaning up, we are going to exit after this
  1888. #if 0
  1889. if (pGroupEnum) LocalFree(GroupEnum);
  1890. #endif
  1891. ClRtlLogPrint(LOG_NOISE,
  1892. "[FM] FmpCleanupGroups: Exit\r\n");
  1893. return;
  1894. } // FmpCleanupGroups
  1895. DWORD
  1896. FmpCleanupQuorumResource(
  1897. IN PFM_RESOURCE Resource
  1898. )
  1899. /*++
  1900. Routine Description:
  1901. This routine is for emergency clean up of the quorum resource.
  1902. In this phase, we dont try and acquire any locks. We just try to
  1903. bring the quorum resource offline. Hopefully the api is offline and
  1904. nothing funky is attempted on the quorum group/resource during this
  1905. time. This should only be called during the shutdown of FM.
  1906. Arguments:
  1907. Group - The Group to offline.
  1908. Returns:
  1909. ERROR_SUCCESS if successful.
  1910. A Win32 error code on failure.
  1911. --*/
  1912. {
  1913. DWORD status = ERROR_SUCCESS;
  1914. DWORD state;
  1915. ClRtlLogPrint(LOG_NOISE,
  1916. "[FM] FmpCleanupQuorum: Offline resource <%1!ws!> <%2!ws!>\n",
  1917. OmObjectName(Resource),
  1918. OmObjectId(Resource) );
  1919. //
  1920. // If the resource is already offline, then return immediately.
  1921. //
  1922. // We should not have to check if a resource has been initialized,
  1923. // since if it hasn't, then we will return because the pre-initialized
  1924. // state of a resource is Offline.
  1925. //
  1926. if ( Resource->State == ClusterResourceOffline ) {
  1927. //
  1928. // If this is the quorum resource, make sure any reservation
  1929. // threads are stopped!
  1930. //
  1931. FmpRmTerminateResource( Resource );
  1932. return(ERROR_SUCCESS);
  1933. }
  1934. if (Resource->State > ClusterResourcePending ) {
  1935. ClRtlLogPrint(LOG_NOISE,
  1936. "[FM] FmpCleanupQuorum: Offline resource <%1!ws!> is in pending state\n",
  1937. OmObjectName(Resource) );
  1938. FmpRmTerminateResource( Resource );
  1939. return(ERROR_SUCCESS);
  1940. }
  1941. //make sure the quorum logs can be flushed and closed
  1942. OmNotifyCb(Resource, NOTIFY_RESOURCE_PREOFFLINE);
  1943. //it may not be prudent to call offline without holding any locks
  1944. //just call terminate
  1945. FmpRmTerminateResource( Resource );
  1946. ClRtlLogPrint(LOG_NOISE,
  1947. "[FM] FmpCleanupQuorum: RmOfflineResource returns %1!u!\r\n",
  1948. status);
  1949. return(status);
  1950. }
  1951. DWORD
  1952. FmpMoveGroup(
  1953. IN PFM_GROUP Group,
  1954. IN PNM_NODE DestinationNode OPTIONAL,
  1955. IN BOOL ShutdownHandler,
  1956. OUT PNM_NODE *pChosenDestinationNode OPTIONAL,
  1957. IN BOOL bChooseMostPreferredNode
  1958. )
  1959. /*++
  1960. Routine Description:
  1961. Move the specified Group. This means taking all of the individual
  1962. resources contained within the group offline and requesting the
  1963. DestinationNode to bring the Group Online.
  1964. Arguments:
  1965. Group - Supplies a pointer to the group structure to move.
  1966. DestinationNode - Supplies the node object to move the group to. If not
  1967. present, then move it to 'highest' entry in the preferred list.
  1968. ShutdownHandler - TRUE if the shutdown handler is invoking this function.
  1969. pChosenDestinationNode - Set to the destination node of the move and
  1970. will be passed on to FmpCompleteMoveGroup, if necessary.
  1971. bChooseMostPreferredNode - If the destination node is not supplied,
  1972. indicates whether to choose the most preferred node or not.
  1973. Returns:
  1974. ERROR_SUCCESS if the request was successful.
  1975. A Win32 error code on failure.
  1976. Notes:
  1977. It is assumed that the Group and all contained resources are offline
  1978. from the requesting node when this call returns. The Group may or
  1979. may not be online on the DestinationNode, depending on whether the
  1980. online request succeeded. This means that the status return is merely
  1981. the status return for the Online request for the DestinationNode.
  1982. The LocalGroupLock MUST also be held. The LocalGroupLock is released
  1983. by this routine.
  1984. --*/
  1985. {
  1986. PNM_NODE node, pQuorumTargetNode = NULL;
  1987. DWORD status;
  1988. PFM_RESOURCE resource;
  1989. PLIST_ENTRY listEntry;
  1990. PRESOURCE_ENUM resourceList=NULL;
  1991. DWORD dwMoveStatus = ERROR_SUCCESS;
  1992. BOOL fMoveUserInitiated;
  1993. BOOL fStateChangeReasonNotified;
  1994. ClRtlLogPrint(LOG_NOISE,
  1995. "[FM] FmpMoveGroup: Entry\r\n");
  1996. //
  1997. // Move is user initiated if it doesn't originate from the shutdown handler and it doesn't
  1998. // originate from the failover move call FmpDoMoveGroupOnFailure.
  1999. //
  2000. fMoveUserInitiated = ( ( ShutdownHandler == FALSE ) &&
  2001. ( bChooseMostPreferredNode == TRUE ) );
  2002. //
  2003. // If this move is not user-initiated, then we should NOT have notified the state change reason,
  2004. // else we definitely should have.
  2005. //
  2006. fStateChangeReasonNotified = ( fMoveUserInitiated == TRUE ) ? FALSE:TRUE;
  2007. if ( !ShutdownHandler )
  2008. {
  2009. if ( !FmpFMOnline )
  2010. {
  2011. status = ERROR_CLUSTER_NODE_NOT_READY;
  2012. goto FnExit;
  2013. }
  2014. if ( FmpShutdown )
  2015. {
  2016. status = ERROR_SHUTDOWN_IN_PROGRESS;
  2017. goto FnExit;
  2018. }
  2019. }
  2020. //
  2021. // See which system owns the group in order to control the move request.
  2022. //
  2023. if ( Group->OwnerNode != NmLocalNode )
  2024. {
  2025. if ( Group->OwnerNode == NULL )
  2026. {
  2027. status = ERROR_HOST_NODE_NOT_AVAILABLE;
  2028. goto FnExit;
  2029. }
  2030. //
  2031. // The other system owns the Group ... let them do the work.
  2032. //
  2033. ClRtlLogPrint(LOG_NOISE,
  2034. "[FM] FmpMoveGroup: Request node %1!ws! to move Group %2!ws!\n",
  2035. OmObjectId(Group->OwnerNode),
  2036. OmObjectId(Group));
  2037. // FmcMoveGroupRequest must release the Group lock.
  2038. status = FmcMoveGroupRequest( Group,
  2039. DestinationNode );
  2040. if ( status != ERROR_SUCCESS )
  2041. {
  2042. ClRtlLogPrint(LOG_NOISE,
  2043. "[FM] FmpMoveGroup: Requested system %1!ws! to move group %2!ws! failed with status %3!u!.\n",
  2044. OmObjectId(Group->OwnerNode),
  2045. OmObjectId(Group),
  2046. status);
  2047. }
  2048. FmpAcquireLocalGroupLock( Group );
  2049. goto FnExit;
  2050. }
  2051. else
  2052. {
  2053. //
  2054. // We control the move.
  2055. //
  2056. if ( !FmpIsGroupQuiet(Group, ClusterGroupStateUnknown) )
  2057. {
  2058. //
  2059. // If a move is pending or resources are pending,
  2060. // then return now.
  2061. //
  2062. ClRtlLogPrint(LOG_NOISE,
  2063. "[FM] FmpMoveGroup: Request to move group <%1!ws!> when it is busy.\n",
  2064. OmObjectName(Group) );
  2065. status = ERROR_INVALID_STATE;
  2066. goto FnExit;
  2067. }
  2068. if ( ARGUMENT_PRESENT( DestinationNode ) )
  2069. {
  2070. //
  2071. // Check if we are the destination... if so, we're done.
  2072. //
  2073. if ( NmLocalNode == DestinationNode )
  2074. {
  2075. status = ERROR_SUCCESS;
  2076. goto FnExit;
  2077. }
  2078. node = DestinationNode;
  2079. }
  2080. else
  2081. {
  2082. node = FmpFindAnotherNode( Group, bChooseMostPreferredNode );
  2083. if ( node == NULL )
  2084. {
  2085. status = ERROR_HOST_NODE_NOT_AVAILABLE;
  2086. goto FnExit;
  2087. }
  2088. }
  2089. if ( ARGUMENT_PRESENT ( pChosenDestinationNode ) )
  2090. {
  2091. *pChosenDestinationNode = node;
  2092. }
  2093. ClRtlLogPrint(LOG_NOISE,
  2094. "[FM] FmpMoveGroup: Moving group %1!ws! to node %2!ws! (%3!d!)\n",
  2095. OmObjectId(Group),
  2096. OmObjectId(node),
  2097. NmGetNodeId(node));
  2098. //
  2099. // If the other system is not up, then fail now.
  2100. //
  2101. if ( NmGetExtendedNodeState(node) != ClusterNodeUp )
  2102. {
  2103. status = ERROR_HOST_NODE_NOT_AVAILABLE;
  2104. goto FnExit;
  2105. }
  2106. //
  2107. // If the other system is not in the preferred list, then fail this
  2108. // now.
  2109. //
  2110. if ( !FmpInPreferredList( Group, node, TRUE, NULL) )
  2111. {
  2112. status = ERROR_CLUSTER_NODE_NOT_FOUND;
  2113. goto FnExit;
  2114. }
  2115. //
  2116. // Get the list of resources in the group and their states.
  2117. //
  2118. status = FmpGetResourceList( &resourceList, Group );
  2119. if ( status != ERROR_SUCCESS )
  2120. {
  2121. goto FnExit;
  2122. }
  2123. Group->MovingList = resourceList;
  2124. //SS: log an eventlog to say we are going to offline the group
  2125. FmpLogGroupInfoEvent1( FM_EVENT_GROUP_START_OFFLINE, OmObjectName(Group));
  2126. Group->dwStructState |= FM_GROUP_STRUCT_MARKED_FOR_COMPLETION_EVENT;
  2127. //
  2128. // If this call is made as a part of a user-initiated move, then notify the group's
  2129. // resources of the impending move. Mark the state change reason flag so that you will
  2130. // notify of a failed move correctly.
  2131. //
  2132. if ( fMoveUserInitiated )
  2133. {
  2134. FmpNotifyGroupStateChangeReason( Group, eResourceStateChangeReasonMove );
  2135. fStateChangeReasonNotified = TRUE;
  2136. }
  2137. //
  2138. // At this point the other system should be up!
  2139. //
  2140. status = FmpOfflineResourceList( resourceList, TRUE );
  2141. //SS: avoid the window when the group lock is released
  2142. //and the moving flag is not set true
  2143. //moving will be continued in another thread context if pending is
  2144. //returned
  2145. if ( status != ERROR_SUCCESS )
  2146. {
  2147. goto FnRestore;
  2148. }
  2149. //SS:the group is offline, log an eventlog to mark the completion
  2150. FmpLogGroupInfoEvent1( FM_EVENT_GROUP_COMPLETE_OFFLINE, OmObjectName(Group));
  2151. //unmask the bit so that we dont log an error again on some other failure
  2152. //after this point in time
  2153. Group->dwStructState &= ~FM_GROUP_STRUCT_MARKED_FOR_COMPLETION_EVENT;
  2154. // for now make sure that the group state is propagated here
  2155. // In general it is propagated by the worker thread. Since
  2156. // the ownership is going to change, we want to make sure that the
  2157. // last known state is propagated from this node to others before
  2158. // that.
  2159. FmpPropagateGroupState(Group);
  2160. //
  2161. // Assume the other node is going to take ownership. This is done
  2162. // before, in case the Group state changes. We want to accept the
  2163. // Group/resource state changes from the remote system when they
  2164. // arrive. We've already verified that node is in the preferred list!
  2165. //
  2166. TESTPT(TpFailPreMoveWithNodeDown)
  2167. {
  2168. ClusterEvent( CLUSTER_EVENT_NODE_DOWN, node );
  2169. }
  2170. //
  2171. // Chittur Subbaraman (chitturs) - 5/18/99
  2172. //
  2173. // Modified to handle the move group request of a quorum group in
  2174. // case the destination node could not arbitrate for the quorum
  2175. // resource.
  2176. //
  2177. do
  2178. {
  2179. //
  2180. // Before making the RPC, set the intended owner of the group
  2181. //
  2182. FmpSetIntendedOwnerForGroup( Group, NmGetNodeId( node ) );
  2183. try {
  2184. ClRtlLogPrint(LOG_NOISE,
  2185. "[FM] FmpMoveGroup: Take group %2!ws! request to remote node %1!ws!\n",
  2186. OmObjectId(node),
  2187. OmObjectId(Group));
  2188. dwMoveStatus = status = FmcTakeGroupRequest( node, OmObjectId( Group ), resourceList );
  2189. } except (I_RpcExceptionFilter(RpcExceptionCode())) {
  2190. LPCWSTR pszNodeId;
  2191. LPCWSTR pszGroupId;
  2192. status = GetExceptionCode ();
  2193. ClRtlLogPrint(LOG_NOISE,
  2194. "[FM] FmpMoveGroup: Exception in FmcTakeGroupRequest %2!ws! request to remote node %1!ws!, status=%3!u!\n",
  2195. OmObjectId(node),
  2196. OmObjectId(Group),
  2197. status);
  2198. //
  2199. // An exception from RPC indicates that the other node is either dead
  2200. // or insane. We dont know whether it took ownership or not.
  2201. // So, let the FM node down handler handle the group.
  2202. //
  2203. GumCommFailure( GumUpdateFailoverManager,
  2204. NmGetNodeId(node),
  2205. GetExceptionCode(),
  2206. TRUE );
  2207. //
  2208. // The new owner node that is now dead might have set the intended
  2209. // owner as NULL or it might not have set this. It might have
  2210. // set the owner node to himself or might not have.
  2211. // If it has set the owner node for this group as himself, then
  2212. // the FM node down handler will assume responsibility for this
  2213. // group. If the target node dies before it sets himself as the owner,
  2214. // then again, the FM node down handler will assume responsibility
  2215. // for the group. We wake up when the gum sync handling is over.
  2216. // Right now, the gum update for the owner node may still be in
  2217. // progress so we cant be sure if that update was completed on
  2218. // all nodes.
  2219. //
  2220. //
  2221. // Chittur Subbaraman (chitturs) - 6/7/99
  2222. //
  2223. // Issue a GUM update to handle this group. Using this
  2224. // GUM update prevents any race condition with the
  2225. // node down processing code.
  2226. //
  2227. // TODO: This does not cover the case in which
  2228. // FmpTakeGroupRequest crashes after setting the
  2229. // intended owner to invalid ID. In such a case,
  2230. // the following handler won't take ownership of the
  2231. // group. Also, claim handler will not touch the
  2232. // group.
  2233. //
  2234. pszNodeId = OmObjectId( node );
  2235. pszGroupId = OmObjectId( Group );
  2236. GumSendUpdateEx( GumUpdateFailoverManager,
  2237. FmUpdateCompleteGroupMove,
  2238. 2,
  2239. (lstrlenW(pszNodeId)+1)*sizeof(WCHAR),
  2240. pszNodeId,
  2241. (lstrlenW(pszGroupId)+1)*sizeof(WCHAR),
  2242. pszGroupId);
  2243. status = ERROR_HOST_NODE_NOT_AVAILABLE;
  2244. FM_DEREF_QUORUM_TARGET ( pQuorumTargetNode );
  2245. goto FnExit;
  2246. }
  2247. //
  2248. // If this group is the quorum group, map the error to retry in case the remote node is
  2249. // not ready. This will let this node retry the group move request.
  2250. //
  2251. if ( ( Group == gpQuoResource->Group ) &&
  2252. ( ( status == ERROR_CLUSTER_NODE_SHUTTING_DOWN ) ||
  2253. ( status == ERROR_CLUSTER_NODE_NOT_READY ) ) )
  2254. {
  2255. status = ERROR_RETRY;
  2256. //
  2257. // Give a chance for the remote node to startup or shutdown. Don't murder
  2258. // the same remote node with RPCs.
  2259. //
  2260. Sleep ( 3000 );
  2261. }
  2262. if ( status == ERROR_RETRY )
  2263. {
  2264. //
  2265. // The destination refused to take the quorum group since it
  2266. // did not win the arbitration. So let us see who won the
  2267. // arbitration.
  2268. //
  2269. DWORD dwSelectedQuorumOwnerId;
  2270. CL_ASSERT( Group == gpQuoResource->Group );
  2271. ClRtlLogPrint(LOG_NOISE,
  2272. "[FM] FmpMoveGroup: Remote node asked us to resend take group request for group %1!ws! to another node ...\n",
  2273. OmObjectId( Group ));
  2274. //
  2275. // Get the ID of the node which the MM believes is the best
  2276. // candidate to own the quorum resource. This is a call that
  2277. // blocks while RGP is in progress.
  2278. //
  2279. MMApproxArbitrationWinner( &dwSelectedQuorumOwnerId );
  2280. if ( ( dwSelectedQuorumOwnerId == NmGetNodeId( NmLocalNode ) ) ||
  2281. ( dwSelectedQuorumOwnerId == MM_INVALID_NODE ) )
  2282. {
  2283. //
  2284. // The local node is chosen by MM or no node is chosen by
  2285. // the MM. The latter case will happen if no RGP has
  2286. // occurred at the time this call is made. Let us see if we
  2287. // can arbitrate for the quorum resource.
  2288. //
  2289. status = FmpRmArbitrateResource( gpQuoResource );
  2290. if ( status != ERROR_SUCCESS )
  2291. {
  2292. //
  2293. // Too bad. We will halt and let FmpNodeDown handler
  2294. // handle the quorum group.
  2295. //
  2296. ClRtlLogPrint(LOG_CRITICAL,
  2297. "[FM] FmpMoveGroup: Local node %1!u! cannot arbitrate for quorum, Status = %1!u!...\n",
  2298. dwSelectedQuorumOwnerId,
  2299. status);
  2300. CsInconsistencyHalt( ERROR_QUORUM_RESOURCE_ONLINE_FAILED );
  2301. }
  2302. status = ERROR_RETRY;
  2303. break;
  2304. }
  2305. FM_DEREF_QUORUM_TARGET ( pQuorumTargetNode );
  2306. pQuorumTargetNode = node = NmReferenceNodeById( dwSelectedQuorumOwnerId );
  2307. if ( node == NULL )
  2308. {
  2309. ClRtlLogPrint(LOG_CRITICAL,
  2310. "[FM] FmpMoveGroup: Selected node %1!u! cannot be referenced...\n",
  2311. dwSelectedQuorumOwnerId);
  2312. CsInconsistencyHalt( ERROR_QUORUM_RESOURCE_ONLINE_FAILED );
  2313. }
  2314. } // if
  2315. } while ( status == ERROR_RETRY );
  2316. FM_DEREF_QUORUM_TARGET ( pQuorumTargetNode );
  2317. TESTPT(TpFailPostMoveWithNodeDown)
  2318. {
  2319. ClusterEvent( CLUSTER_EVENT_NODE_DOWN, node );
  2320. }
  2321. CL_ASSERT( status != ERROR_IO_PENDING );
  2322. if ( status != ERROR_SUCCESS )
  2323. {
  2324. ClRtlLogPrint(LOG_NOISE,
  2325. "[FM] FmpMoveGroup: FmcTakeGroupRequest to node %1!ws! to take group %2!ws! failed, status %3!u!.\n",
  2326. OmObjectId(node),
  2327. OmObjectId(Group),
  2328. status );
  2329. goto FnRestore;
  2330. }
  2331. //
  2332. // If the group is empty, then generate a Group state change event.
  2333. //
  2334. if ( IsListEmpty( &Group->Contains ) )
  2335. {
  2336. ClusterWideEvent( CLUSTER_EVENT_GROUP_OFFLINE,
  2337. Group );
  2338. }
  2339. }
  2340. FnRestore:
  2341. if ((status != ERROR_SUCCESS) && (status != ERROR_IO_PENDING))
  2342. {
  2343. //
  2344. //
  2345. // Notify the group's resources that this is a failed move. Note that this should be
  2346. // done BEFORE any future onlines. Also, mark the fStateChangeReasonNotified to be FALSE
  2347. // so that we won't drop the failed move reason down below also.
  2348. //
  2349. FmpNotifyGroupStateChangeReason( Group, eResourceStateChangeReasonFailedMove );
  2350. fStateChangeReasonNotified = FALSE;
  2351. if (Group->dwStructState & FM_GROUP_STRUCT_MARKED_FOR_COMPLETION_EVENT)
  2352. {
  2353. //SS: log an event saying we failed the last offline request
  2354. FmpLogGroupInfoEvent1( FM_EVENT_GROUP_FAILED_ONLINE_OFFLINE, OmObjectName(Group));
  2355. //unmask the bit so that we dont log a non-corresponding event again
  2356. Group->dwStructState &= ~FM_GROUP_STRUCT_MARKED_FOR_COMPLETION_EVENT;
  2357. }
  2358. //
  2359. // Chittur Subbaraman (chitturs) - 3/22/2000
  2360. //
  2361. // Reset the group's intended owner to invalid node ID if the
  2362. // node down handler did not do that.
  2363. //
  2364. if ( dwMoveStatus != ERROR_SUCCESS )
  2365. {
  2366. if ( FmpSetIntendedOwnerForGroup( Group, ClusterInvalidNodeId )
  2367. == ERROR_CLUSTER_INVALID_NODE )
  2368. {
  2369. ClRtlLogPrint(LOG_NOISE,
  2370. "[FM] FmpMoveGroup: Group <%1!ws!> has already been processed by node down handler....\r\n",
  2371. OmObjectName(Group));
  2372. goto FnExit;
  2373. }
  2374. }
  2375. // the move failed
  2376. // In all failure cases we want to bring the resources
  2377. // back online
  2378. // if it is pending, then we let FmpCompleteMoveGroup finish
  2379. // the work
  2380. if (resourceList)
  2381. {
  2382. //
  2383. // Terminate all of the resources in the group.
  2384. //
  2385. FmpTerminateResourceList( resourceList );
  2386. //
  2387. // Chittur Subbaraman (chitturs) - 4/10/2000
  2388. //
  2389. // Make sure to online the quorum group even if this node is
  2390. // shutting down. This is necessary so that other groups
  2391. // can be brought offline during this node's shutdown. Note
  2392. // that FmpOnlineResourceList would only online a group
  2393. // during a shutdown if the group is the quorum group.
  2394. //
  2395. if ( FmpFMGroupsInited )
  2396. FmpOnlineResourceList( resourceList, Group );
  2397. }
  2398. }
  2399. FnExit:
  2400. ClRtlLogPrint(LOG_NOISE,
  2401. "[FM] FmpMoveGroup: Exit group <%1!ws!>, status = %2!u!\r\n",
  2402. OmObjectName(Group),
  2403. status);
  2404. if ( status != ERROR_IO_PENDING )
  2405. {
  2406. if (resourceList)
  2407. {
  2408. FmpDeleteResourceEnum( resourceList );
  2409. Group->MovingList = NULL;
  2410. }
  2411. }
  2412. else
  2413. {
  2414. //if the state is pending mark for completion event
  2415. Group->dwStructState |= FM_GROUP_STRUCT_MARKED_FOR_COMPLETION_EVENT;
  2416. }
  2417. if ( ( status == ERROR_SUCCESS ) || ( status == ERROR_IO_PENDING ) )
  2418. {
  2419. //
  2420. // Chittur Subbaraman (chitturs) - 4/13/99
  2421. //
  2422. // If the FmpDoMoveGroupOnFailure thread is also waiting to do the
  2423. // move, then tell that thread to take its hands off.
  2424. //
  2425. if ( Group->dwStructState & FM_GROUP_STRUCT_MARKED_FOR_MOVE_ON_FAIL )
  2426. {
  2427. Group->dwStructState |= FM_GROUP_STRUCT_MARKED_FOR_REGULAR_MOVE;
  2428. }
  2429. } else if ( fStateChangeReasonNotified == TRUE )
  2430. {
  2431. //
  2432. //
  2433. // Notify the group's resources that this is a failed move.
  2434. //
  2435. FmpNotifyGroupStateChangeReason( Group, eResourceStateChangeReasonFailedMove );
  2436. }
  2437. FmpReleaseLocalGroupLock( Group );
  2438. return(status);
  2439. } // FmpMoveGroup
  2440. DWORD
  2441. FmpCompleteMoveGroup(
  2442. IN PFM_GROUP Group,
  2443. IN PNM_NODE DestinationNode
  2444. )
  2445. /*++
  2446. Routine Description:
  2447. This completes the move of a group by asking the other node to take
  2448. ownership.
  2449. This function is called by FmpMovePendingThread() after all the resources
  2450. are offline.
  2451. Arguments:
  2452. Group - Supplies a pointer to the group structure to move.
  2453. DestinationNode - Supplies the node object to move the group to. If not
  2454. present, then move it to 'highest' entry in the preferred list.
  2455. Returns:
  2456. ERROR_SUCCESS if the request was successful.
  2457. A Win32 error code on failure.
  2458. Notes:
  2459. It is assumed that the Group and all contained resources are offline
  2460. when this is called.
  2461. The LocalGroupLock MUST also be held. The LocalGroupLock is released
  2462. by this routine, especially before requesting a remote system to move
  2463. a group!
  2464. --*/
  2465. {
  2466. PNM_NODE node, pQuorumTargetNode = NULL;
  2467. DWORD status = ERROR_SUCCESS;
  2468. PFM_RESOURCE resource;
  2469. PLIST_ENTRY listEntry;
  2470. PRESOURCE_ENUM resourceList=NULL;
  2471. DWORD dwMoveStatus = ERROR_SUCCESS;
  2472. BOOL fStateChangeReasonNotified = TRUE; // In this function == reason notified already
  2473. resourceList = Group->MovingList;
  2474. if ( resourceList == NULL ) {
  2475. ClRtlLogPrint( LOG_NOISE,
  2476. "[FM] FmpCompleteMoveGroup: No moving list!\n" );
  2477. status = ERROR_SUCCESS;
  2478. goto FnRestore;
  2479. }
  2480. node = DestinationNode;
  2481. CL_ASSERT( node != NULL );
  2482. ClRtlLogPrint(LOG_NOISE,
  2483. "[FM] FmpCompleteMoveGroup: Completing the move for group %1!ws! to node %2!ws! (%3!d!)\n",
  2484. OmObjectName(Group),
  2485. OmObjectId(node),
  2486. NmGetNodeId(node));
  2487. status = FmpOfflineResourceList( resourceList, TRUE );
  2488. if ( status != ERROR_SUCCESS ) {
  2489. //by now the group must be offline!
  2490. //if not, mail the move, the resource that fails to go
  2491. //offline will force the other resources to come online
  2492. //again.
  2493. //how do we handle shutdowns
  2494. goto FnRestore;
  2495. }
  2496. // for now make sure that the group state is propagated here
  2497. // In general it is propagated by the worker thread. Since
  2498. // the ownership is going to change, we want to make sure that the
  2499. // last known state is propagated from this node to others before
  2500. // that.
  2501. FmpPropagateGroupState(Group);
  2502. //
  2503. // Chittur Subbaraman (chitturs) - 10/01/1999
  2504. //
  2505. // If the other system is not up, then fail now. Note that this
  2506. // check must be done only AFTER ensuring that the group state
  2507. // is stable. Otherwise some funny corner cases can result.
  2508. // E.g., If the complete move operation is aborted when one or
  2509. // more resources are in offline pending state since the destination
  2510. // node went down, then you first terminate the resource list and
  2511. // then online the list. As a part of all this, the online pending
  2512. // or the online states of the resources could be propagated
  2513. // synchronously. Now, the offline notification from the previous
  2514. // offline attempt could come in and be processed by the FM worker
  2515. // thread way too late and you could have spurious resource states
  2516. // in FM while the real resource state is different. Another
  2517. // issue here is during the lengthy offline operation here, the
  2518. // destination node could go down and come back up soon after and
  2519. // so aborting the move may not be prudent in such a case.
  2520. //
  2521. // But, don't do this optimization for the quorum group. This is
  2522. // because once the quorum group is made offline, then MM
  2523. // could decide who the group owner is. So, you may not be able to
  2524. // bring the group online necessarily in this node. To avoid such
  2525. // a case, we let FmcTakeGroupRequest fail and then let either the
  2526. // retry loop here move the group somewhere else or let the
  2527. // FM node down handler decide on the group's owner consulting
  2528. // with MM.
  2529. //
  2530. if ( ( NmGetExtendedNodeState(node) != ClusterNodeUp ) &&
  2531. ( Group != gpQuoResource->Group ) )
  2532. {
  2533. status = ERROR_HOST_NODE_NOT_AVAILABLE;
  2534. ClRtlLogPrint(LOG_NOISE,
  2535. "[FM] FmpCompleteMoveGroup: Restoring group <%1!ws!> on local node due to destination node unavailability...\n",
  2536. OmObjectName(Group));
  2537. goto FnRestore;
  2538. }
  2539. // SS::
  2540. // After this point the responsibility of failing the group
  2541. // back due to resource failures is with the destination code.
  2542. // If there is a failure to bring the resources online,
  2543. // the local restart policy on the destination node must kick
  2544. // in.
  2545. //
  2546. // if there is an rpc failure to communicate with the other node
  2547. // I suppose we should bring the resources online here again
  2548. // However, rpc failures can be pretty non descriptive - there is
  2549. // no way to determine from rpc errors if the rpc call actually
  2550. // executed on the remote side
  2551. //
  2552. // but unless we are pretty careful about this and do what gum does
  2553. // on rpc failures(banish the destination node) there is no way to
  2554. // guarantee that both nodes dont retry to restart the group
  2555. // If the destination node begins the process of bringing resources
  2556. // in the group online, FmsTakeGroupRequest must return success(note
  2557. // it should not return ERROR_IO_PENDING), else
  2558. // it returns an error code and this node will bring the group back
  2559. // to its previous state.
  2560. // Assume the other node is going to take ownership. This is done
  2561. // before, in case the Group state changes. We want to accept the
  2562. // Group/resource state changes from the remote system when they
  2563. // arrive. We've already verified that node is in the preferred list!
  2564. //
  2565. //we will reacquire the lock after making the rpc call
  2566. // SS::
  2567. // After this point the responsibility of failing the group
  2568. // back due to resource failures is with the destination code.
  2569. // If there is a failure to bring the resources online,
  2570. // the local restart policy on the destination node must kick
  2571. // in.
  2572. //
  2573. // if there is an rpc failure to communicate with the other node
  2574. // I suppose we should bring the resources online here again
  2575. // However, rpc failures can be pretty non descriptive - there is
  2576. // no way to determine from rpc errors if the rpc call actually
  2577. // executed on the remote side
  2578. //
  2579. // but unless we are pretty careful about this and do what gum does
  2580. // on rpc failures(banish the destination node) there is no way to
  2581. // guarantee that both nodes dont retry to restart the group
  2582. // If the destination node begins the process of bringing resources
  2583. // in the group online, FmsTakeGroupRequest must return success(note
  2584. // it should not return ERROR_IO_PENDING), else
  2585. // it returns an error code and this node will bring the group back
  2586. // to its previous state.
  2587. // Assume the other node is going to take ownership. This is done
  2588. // before, in case the Group state changes. We want to accept the
  2589. // Group/resource state changes from the remote system when they
  2590. // arrive. We've already verified that node is in the preferred list!
  2591. //
  2592. //
  2593. // Chittur Subbaraman (chitturs) - 5/18/99
  2594. //
  2595. // Modified to handle the move group request of a quorum group in
  2596. // case the destination node could not arbitrate for the quorum
  2597. // resource.
  2598. //
  2599. do
  2600. {
  2601. //
  2602. // Before making the RPC, set the intended owner of the group
  2603. //
  2604. FmpSetIntendedOwnerForGroup( Group, NmGetNodeId( node ) );
  2605. try {
  2606. ClRtlLogPrint(LOG_NOISE,
  2607. "[FM] FmpCompleteMoveGroup: Take group %2!ws! request to remote node %1!ws!\n",
  2608. OmObjectId(node),
  2609. OmObjectId(Group));
  2610. dwMoveStatus = status = FmcTakeGroupRequest( node, OmObjectId( Group ), resourceList );
  2611. } except (I_RpcExceptionFilter(RpcExceptionCode())) {
  2612. LPCWSTR pszNodeId;
  2613. LPCWSTR pszGroupId;
  2614. status = GetExceptionCode ();
  2615. ClRtlLogPrint(LOG_NOISE,
  2616. "[FM] FmpCompleteMoveGroup: Exception in FmcTakeGroupRequest %2!ws! request to remote node %1!ws!, status=%3!u!\n",
  2617. OmObjectId(node),
  2618. OmObjectId(Group),
  2619. status);
  2620. //
  2621. // An exception from RPC indicates that the other node is either dead
  2622. // or insane. We dont know whether it took ownership or not.
  2623. // So, let the FM node down handler handle the group.
  2624. //
  2625. GumCommFailure( GumUpdateFailoverManager,
  2626. NmGetNodeId(node),
  2627. GetExceptionCode(),
  2628. TRUE );
  2629. //
  2630. // The new owner node that is now dead might have set the intended
  2631. // owner as NULL or it might not have set this. It might have
  2632. // set the owner node to himself or might not have.
  2633. // If it has set the owner node for this group as himself, then
  2634. // the FM node down handler will assume responsibility for this
  2635. // group. If the target node dies before it sets himself as the owner,
  2636. // then again, the FM node down handler will assume responsibility
  2637. // for the group. We wake up when the gum sync handling is over.
  2638. // Right now, the gum update for the owner node may still be in
  2639. // progress so we cant be sure if that update was completed on
  2640. // all nodes.
  2641. //
  2642. //
  2643. // Chittur Subbaraman (chitturs) - 6/7/99
  2644. //
  2645. // Issue a GUM update to handle this group. Using this
  2646. // GUM update prevents any race condition with the
  2647. // node down processing code.
  2648. //
  2649. //
  2650. // TODO: This does not cover the case in which
  2651. // FmpTakeGroupRequest crashes after setting the
  2652. // intended owner to invalid ID. In such a case,
  2653. // the following handler won't take ownership of the
  2654. // group. Also, claim handler will not touch the
  2655. // group.
  2656. //
  2657. pszNodeId = OmObjectId( node );
  2658. pszGroupId = OmObjectId( Group );
  2659. GumSendUpdateEx( GumUpdateFailoverManager,
  2660. FmUpdateCompleteGroupMove,
  2661. 2,
  2662. (lstrlenW(pszNodeId)+1)*sizeof(WCHAR),
  2663. pszNodeId,
  2664. (lstrlenW(pszGroupId)+1)*sizeof(WCHAR),
  2665. pszGroupId);
  2666. status = ERROR_HOST_NODE_NOT_AVAILABLE;
  2667. FM_DEREF_QUORUM_TARGET ( pQuorumTargetNode );
  2668. goto FnExit;
  2669. }
  2670. //
  2671. // If this group is the quorum group, map the error to retry in case the remote node is
  2672. // not ready. This will let this node retry the group move request.
  2673. //
  2674. if ( ( Group == gpQuoResource->Group ) &&
  2675. ( ( status == ERROR_CLUSTER_NODE_SHUTTING_DOWN ) ||
  2676. ( status == ERROR_CLUSTER_NODE_NOT_READY ) ) )
  2677. {
  2678. status = ERROR_RETRY;
  2679. //
  2680. // Give a chance for the remote node to startup or shutdown. Don't murder
  2681. // the same remote node with RPCs.
  2682. //
  2683. Sleep ( 3000 );
  2684. }
  2685. if ( status == ERROR_RETRY )
  2686. {
  2687. //
  2688. // The destination refused to take the quorum group since it
  2689. // did not win the arbitration. So let us see who won the
  2690. // arbitration.
  2691. //
  2692. DWORD dwSelectedQuorumOwnerId;
  2693. CL_ASSERT( Group == gpQuoResource->Group );
  2694. ClRtlLogPrint(LOG_NOISE,
  2695. "[FM] FmpCompleteMoveGroup: Remote node asked us to resend take group request for group %1!ws! to another node ...\n",
  2696. OmObjectId( Group ));
  2697. //
  2698. // Get the ID of the node which the MM believes is the best
  2699. // candidate to own the quorum resource. This is a call that
  2700. // blocks while RGP is in progress.
  2701. //
  2702. MMApproxArbitrationWinner( &dwSelectedQuorumOwnerId );
  2703. if ( ( dwSelectedQuorumOwnerId == NmGetNodeId( NmLocalNode ) ) ||
  2704. ( dwSelectedQuorumOwnerId == MM_INVALID_NODE ) )
  2705. {
  2706. //
  2707. // The local node is chosen by MM or no node is chosen by
  2708. // the MM. The latter case will happen if no RGP has
  2709. // occurred at the time this call is made. Let us see if we
  2710. // can arbitrate for the quorum resource.
  2711. //
  2712. status = FmpRmArbitrateResource( gpQuoResource );
  2713. if ( status != ERROR_SUCCESS )
  2714. {
  2715. //
  2716. // Too bad. We will halt and let FmpNodeDown handler
  2717. // handle the quorum group.
  2718. //
  2719. ClRtlLogPrint(LOG_NOISE,
  2720. "[FM] FmpCompleteMoveGroup: Local node %1!u! cannot arbitrate for quorum group %3!ws!, Status = %2!u!...\n",
  2721. dwSelectedQuorumOwnerId,
  2722. status,
  2723. OmObjectId( Group ));
  2724. CsInconsistencyHalt( ERROR_QUORUM_RESOURCE_ONLINE_FAILED );
  2725. }
  2726. status = ERROR_RETRY;
  2727. break;
  2728. }
  2729. FM_DEREF_QUORUM_TARGET ( pQuorumTargetNode );
  2730. pQuorumTargetNode = node = NmReferenceNodeById( dwSelectedQuorumOwnerId );
  2731. if ( node == NULL )
  2732. {
  2733. ClRtlLogPrint(LOG_CRITICAL,
  2734. "[FM] FmpCompleteMoveGroup: Selected node %1!u! cannot be referenced...\n",
  2735. dwSelectedQuorumOwnerId);
  2736. CsInconsistencyHalt( ERROR_QUORUM_RESOURCE_ONLINE_FAILED );
  2737. }
  2738. } // if
  2739. } while ( status == ERROR_RETRY );
  2740. FM_DEREF_QUORUM_TARGET ( pQuorumTargetNode );
  2741. // At this point, the onus of taking care of the group is with the
  2742. // destination node whether it means restarting the group or
  2743. // failing it back
  2744. FnRestore:
  2745. //if there is any failure try and restore the previous states
  2746. if ((status != ERROR_IO_PENDING) && (status != ERROR_SUCCESS))
  2747. {
  2748. //
  2749. //
  2750. // Notify the group's resources that this is a failed move. Note that this should be
  2751. // done BEFORE any future onlines. Also, mark the fStateChangeReasonNotified to be FALSE
  2752. // so that we won't drop the failed move reason down below also.
  2753. //
  2754. FmpNotifyGroupStateChangeReason( Group, eResourceStateChangeReasonFailedMove );
  2755. fStateChangeReasonNotified = FALSE;
  2756. //
  2757. // Chittur Subbaraman (chitturs) - 3/22/2000
  2758. //
  2759. // Reset the group's intended owner to invalid node ID if the
  2760. // node down handler did not do that.
  2761. //
  2762. if ( dwMoveStatus != ERROR_SUCCESS )
  2763. {
  2764. if ( FmpSetIntendedOwnerForGroup( Group, ClusterInvalidNodeId )
  2765. == ERROR_CLUSTER_INVALID_NODE )
  2766. {
  2767. ClRtlLogPrint(LOG_NOISE,
  2768. "[FM] FmpCompleteMoveGroup: Group <%1!ws!> has already been processed by node down handler....\r\n",
  2769. OmObjectName(Group));
  2770. goto FnExit;
  2771. }
  2772. }
  2773. if (resourceList)
  2774. {
  2775. FmpTerminateResourceList( resourceList );
  2776. //
  2777. // Chittur Subbaraman (chitturs) - 4/10/2000
  2778. //
  2779. // Make sure to online the quorum group even if this node is
  2780. // shutting down. This is necessary so that other groups
  2781. // can be brought offline during this node's shutdown. Note
  2782. // that FmpOnlineResourceList would only online a group
  2783. // during a shutdown if the group is the quorum group.
  2784. //
  2785. if ( FmpFMGroupsInited )
  2786. FmpOnlineResourceList( resourceList, Group );
  2787. }
  2788. } else
  2789. {
  2790. //
  2791. // Chittur Subbaraman (chitturs) - 4/19/99
  2792. //
  2793. // If the FmpDoMoveGroupOnFailure thread is also waiting to do the
  2794. // move, then tell that thread to take its hands off.
  2795. //
  2796. if ( Group->dwStructState & FM_GROUP_STRUCT_MARKED_FOR_MOVE_ON_FAIL )
  2797. {
  2798. Group->dwStructState |= FM_GROUP_STRUCT_MARKED_FOR_REGULAR_MOVE;
  2799. }
  2800. }
  2801. FnExit:
  2802. ClRtlLogPrint(LOG_NOISE,
  2803. "[FM] FmpCompleteMoveGroup: Exit, status = %1!u!\r\n",
  2804. status);
  2805. //if the status is success or some other error, clean up the resource list
  2806. if (status != ERROR_IO_PENDING)
  2807. {
  2808. if ( ( status != ERROR_SUCCESS ) && ( fStateChangeReasonNotified == TRUE ) )
  2809. {
  2810. //
  2811. //
  2812. // Notify the group's resources that this is a failed move.
  2813. //
  2814. FmpNotifyGroupStateChangeReason( Group, eResourceStateChangeReasonFailedMove );
  2815. }
  2816. if (resourceList)
  2817. {
  2818. FmpDeleteResourceEnum( resourceList );
  2819. Group->MovingList = NULL;
  2820. }
  2821. }
  2822. FmpReleaseLocalGroupLock( Group );
  2823. return(status);
  2824. } // FmpCompleteMoveGroup
  2825. DWORD
  2826. FmpMovePendingThread(
  2827. IN LPVOID Context
  2828. )
  2829. /*++
  2830. Routine Description:
  2831. Continue trying to move a group if ERROR_IO_PENDING is returned.
  2832. We need to perform this operation, because part way through a move
  2833. request, we could get a pending return status. The processing of the
  2834. request is halted and the pending status is returned. However, the
  2835. remainder of the move operation needs to be performed.
  2836. Arguments:
  2837. Context - Pointer to the MOVE_GROUP structure to move.
  2838. Returns:
  2839. ERROR_SUCCESS.
  2840. --*/
  2841. {
  2842. PMOVE_GROUP moveGroup = (PMOVE_GROUP)Context;
  2843. PFM_GROUP group;
  2844. PNM_NODE node;
  2845. DWORD status;
  2846. DWORD loopCount = 100; // Only try this so many times and then give up
  2847. HANDLE waitArray[2];
  2848. group = moveGroup->Group;
  2849. node = moveGroup->DestinationNode;
  2850. ClRtlLogPrint(LOG_NOISE,
  2851. "[FM] FmpMovePendingThread Entry.\n");
  2852. //
  2853. // We must attempt to finish the move request for this Group.
  2854. //
  2855. // We are waiting for a resource to go offline and it finally goes
  2856. // offline and the Group's pending event is set.
  2857. //
  2858. // Or we are waiting for cluster shutdown (FmpShutdownEvent)
  2859. //
  2860. WaitSomeMore:
  2861. //acquire the lock since fmpwaitforgroup() releases it
  2862. FmpAcquireLocalGroupLock( group );
  2863. status = FmpWaitForGroup(group);
  2864. if (status == ERROR_SHUTDOWN_IN_PROGRESS) {
  2865. //
  2866. // We've been asked to shutdown
  2867. //
  2868. } else if (status == ERROR_SUCCESS) {
  2869. //acquire the group lock before calling FmpCompleteMoveGroup
  2870. FmpAcquireLocalGroupLock( group );
  2871. status = FmpCompleteMoveGroup( group, node );
  2872. if ( status == ERROR_IO_PENDING ) {
  2873. Sleep(500); // [HACKHACK] kludgy, I know, but nice solution might break something else
  2874. goto WaitSomeMore;
  2875. }
  2876. } else {
  2877. ClRtlLogPrint(LOG_UNUSUAL,
  2878. "[FM] FmpMovePendingThread got error %1!d! waiting for group to shutdown.\n",
  2879. status);
  2880. }
  2881. //
  2882. // We're done with the move now.
  2883. //
  2884. if ( status != ERROR_IO_PENDING ) {
  2885. CL_ASSERT( group->MovingList == NULL );
  2886. }
  2887. //
  2888. // Now dereference the Group and node object (if non-NULL) and
  2889. // free our local context.
  2890. //
  2891. OmDereferenceObject( group );
  2892. if ( node != NULL ) {
  2893. OmDereferenceObject( node );
  2894. }
  2895. LocalFree( Context );
  2896. ClRtlLogPrint(LOG_NOISE,
  2897. "[FM] FmpMovePendingThread Exit.\n");
  2898. return(ERROR_SUCCESS);
  2899. } // FmpMovePendingThread
  2900. DWORD
  2901. FmpCreateMovePendingThread(
  2902. IN PFM_GROUP Group,
  2903. IN PNM_NODE DestinationNode
  2904. )
  2905. /*++
  2906. Routine Description:
  2907. Crate a thread that will continue to call the move routine for a given
  2908. Group.
  2909. Arguments:
  2910. Group - A pointer to the Group to move.
  2911. DestinationNode - The destination node for the move request.
  2912. Returns:
  2913. ERROR_IO_PENDING if the thread was created successfully. This assumes
  2914. that this routine was called because of this error return.
  2915. A Win32 error code on failure.
  2916. --*/
  2917. {
  2918. HANDLE threadHandle=NULL;
  2919. DWORD threadId;
  2920. PMOVE_GROUP context=NULL;
  2921. DWORD status=ERROR_IO_PENDING; //assume success
  2922. FmpAcquireLocalGroupLock( Group );
  2923. if ( Group->OwnerNode != NmLocalNode ) {
  2924. status = ERROR_HOST_NODE_NOT_RESOURCE_OWNER;
  2925. goto FnExit;
  2926. }
  2927. //
  2928. // If there is a pending event, then the group is not available for any
  2929. // new requests.
  2930. //
  2931. if ( FmpIsGroupPending(Group) ) {
  2932. status = ERROR_GROUP_NOT_AVAILABLE;
  2933. goto FnExit;
  2934. }
  2935. context = LocalAlloc(LMEM_FIXED, sizeof(MOVE_GROUP));
  2936. if ( context == NULL ) {
  2937. status = ERROR_NOT_ENOUGH_MEMORY;
  2938. goto FnExit;
  2939. }
  2940. //
  2941. // Keep reference on the Group and node object (if present) while we
  2942. // retain pointers.
  2943. //
  2944. OmReferenceObject( Group );
  2945. if ( DestinationNode != NULL ) {
  2946. OmReferenceObject( DestinationNode );
  2947. }
  2948. //
  2949. // Fill in context fields
  2950. //
  2951. context->Group = Group;
  2952. context->DestinationNode = DestinationNode;
  2953. threadHandle = CreateThread( NULL,
  2954. 0,
  2955. FmpMovePendingThread,
  2956. context,
  2957. 0,
  2958. &threadId );
  2959. if ( threadHandle == NULL )
  2960. {
  2961. OmDereferenceObject( Group );
  2962. if ( DestinationNode != NULL ) {
  2963. OmDereferenceObject( DestinationNode );
  2964. }
  2965. status = GetLastError();
  2966. LocalFree(context);
  2967. goto FnExit;
  2968. }
  2969. FnExit:
  2970. if (threadHandle) CloseHandle( threadHandle );
  2971. FmpReleaseLocalGroupLock( Group );
  2972. return(status);
  2973. } // FmpCreateMovePendingThread
  2974. DWORD
  2975. FmpDoMoveGroup(
  2976. IN PFM_GROUP Group,
  2977. IN PNM_NODE DestinationNode,
  2978. IN BOOL bChooseMostPreferredNode
  2979. )
  2980. /*++
  2981. Routine Description:
  2982. This routine performs the action of moving a Group. This requires taking
  2983. a Group offline and then bringing the Group online. The Offline and
  2984. Online requests may pend, so we have to pick up the work in order to
  2985. complete the request. This means handling the offline pending case, since
  2986. the online pending request will eventually complete.
  2987. Arguments:
  2988. Group - The Group to move.
  2989. DestinationNode - The destination node for the move request.
  2990. bChooseMostPreferredNode - If the destination node is not supplied,
  2991. indicates whether to choose the most preferred node or not.
  2992. Returns:
  2993. ERROR_SUCCESS if successful.
  2994. A Win32 error code on failure.
  2995. --*/
  2996. {
  2997. DWORD status;
  2998. PNM_NODE node;
  2999. PNM_NODE ChosenDestinationNode = NULL;
  3000. //
  3001. // We can only support one request on this Group at a time.
  3002. //
  3003. ClRtlLogPrint(LOG_NOISE,
  3004. "[FM] FmpDoMoveGroup: Entry\r\n");
  3005. FmpAcquireLocalGroupLock( Group );
  3006. //if the group has been marked for delete, then fail this call
  3007. if (!IS_VALID_FM_GROUP(Group))
  3008. {
  3009. FmpReleaseLocalGroupLock( Group);
  3010. return (ERROR_GROUP_NOT_AVAILABLE);
  3011. }
  3012. if ( FmpIsGroupPending(Group) ) {
  3013. FmpReleaseLocalGroupLock( Group );
  3014. return(ERROR_GROUP_NOT_AVAILABLE);
  3015. }
  3016. node = Group->OwnerNode;
  3017. // Note: the local group lock is released by the FmpMoveGroup routine.
  3018. status = FmpMoveGroup( Group, DestinationNode, FALSE, &ChosenDestinationNode, bChooseMostPreferredNode );
  3019. //
  3020. // If we were the owner of the group and the request is pending, then
  3021. // start a thread to complete the move request.
  3022. //
  3023. if ( (node == NmLocalNode) &&
  3024. (status == ERROR_IO_PENDING) ) {
  3025. status = FmpCreateMovePendingThread( Group, ChosenDestinationNode );
  3026. }
  3027. //
  3028. // Chittur Subbaraman (chitturs) - 7/31/2000
  3029. //
  3030. // Log an event to the eventlog if the group is moving due to a failure.
  3031. //
  3032. if ( ( bChooseMostPreferredNode == FALSE ) &&
  3033. ( ( status == ERROR_SUCCESS ) || ( status == ERROR_IO_PENDING ) ) )
  3034. {
  3035. CsLogEvent3( LOG_NOISE,
  3036. FM_EVENT_GROUP_FAILOVER,
  3037. OmObjectName(Group),
  3038. OmObjectName(NmLocalNode),
  3039. OmObjectName(ChosenDestinationNode) );
  3040. }
  3041. ClRtlLogPrint(LOG_NOISE,
  3042. "[FM] FmpDoMoveGroup: Exit, status = %1!u!\r\n",
  3043. status);
  3044. return(status);
  3045. } // FmpDoMoveGroup
  3046. DWORD
  3047. FmpTakeGroupRequest(
  3048. IN PFM_GROUP Group,
  3049. IN PRESOURCE_ENUM ResourceList
  3050. )
  3051. /*++
  3052. Routine Description:
  3053. Performs a Take Group Request from (THE) remote system and returns
  3054. status for that request.
  3055. Arguments:
  3056. Group - The Group to take online locally.
  3057. ResourceList - The list of resources and their states.
  3058. Return Value:
  3059. ERROR_SUCCESS if successful.
  3060. A Win32 error code on error.
  3061. --*/
  3062. {
  3063. DWORD status = ERROR_SUCCESS;
  3064. ClRtlLogPrint(LOG_NOISE,
  3065. "[FM] FmpTakeGroupRequest: To take group '%1!ws!'.\n",
  3066. OmObjectId(Group) );
  3067. FmpAcquireLocalGroupLock( Group );
  3068. if ( !FmpFMOnline )
  3069. {
  3070. if (FmpShutdown)
  3071. status = ERROR_CLUSTER_NODE_SHUTTING_DOWN;
  3072. else
  3073. status = ERROR_CLUSTER_NODE_NOT_READY;
  3074. CL_LOGFAILURE(status);
  3075. ClRtlLogPrint(LOG_NOISE,
  3076. "[FM] FmpTakeGroupRequest: Group '%1!ws!' cannot be accepted, status=%2!u!...\n",
  3077. OmObjectId(Group),
  3078. status);
  3079. goto FnExit;
  3080. }
  3081. //every body should be able to host the quorum group
  3082. //so we dont check the prefferred owner list for this group
  3083. if ( ( gpQuoResource->Group != Group) &&
  3084. !FmpInPreferredList( Group, NmLocalNode, FALSE, NULL) )
  3085. {
  3086. //
  3087. // Nobody should ever ask us to take a group that can't run here.
  3088. //
  3089. status = ERROR_CLUSTER_NODE_NOT_FOUND;
  3090. CL_LOGFAILURE( status);
  3091. goto FnExit;
  3092. }
  3093. //
  3094. // Take ownership of the Group.
  3095. //
  3096. if ( Group->OwnerNode == NmLocalNode ) {
  3097. //SS:://We are alreay the owner ?? How did this happen
  3098. status = ERROR_SUCCESS;
  3099. goto FnExit;
  3100. }
  3101. //
  3102. // Chittur Subbaraman (chitturs) - 5/18/99
  3103. //
  3104. // Handle quorum group in a special way. Make sure you can arbitrate
  3105. // for the quorum resource. If not, you could get killed when you
  3106. // try to bring it online and you fail.
  3107. //
  3108. if ( Group == gpQuoResource->Group )
  3109. {
  3110. //call FmpArbitrateResource() instead of FmpRmArbitrateResource() in order to give
  3111. //the chance to initialize incase move is called immediately after an install of a
  3112. //third party quorum resource dll
  3113. status = FmpArbitrateResource( gpQuoResource );
  3114. if ( status != ERROR_SUCCESS )
  3115. {
  3116. ClRtlLogPrint(LOG_NOISE,
  3117. "[FM] FmpTakeGroupRequest: MM did not select local node %1!u! as the arbitration winner, Status %2!u!\n",
  3118. NmLocalNodeId,
  3119. status);
  3120. status = ERROR_RETRY;
  3121. goto FnExit;
  3122. }
  3123. }
  3124. status = FmpSetOwnerForGroup( Group, NmLocalNode );
  3125. if ( status != ERROR_SUCCESS )
  3126. {
  3127. ClRtlLogPrint(LOG_NOISE,
  3128. "[FM] FmpTakeGroupRequest: Set owner GUM update returns %1!u! for group <%2!ws!>...\n\r",
  3129. status,
  3130. OmObjectId(Group));
  3131. if ( status == ERROR_GROUP_NOT_AVAILABLE )
  3132. {
  3133. //
  3134. // If the node down processing GUM handler has claimed ownership
  3135. // of this group, consider everything as being fine.
  3136. //
  3137. status = ERROR_SUCCESS;
  3138. }
  3139. goto FnExit;
  3140. }
  3141. FmpSetIntendedOwnerForGroup(Group, ClusterInvalidNodeId);
  3142. // prepare to bring this group online
  3143. FmpPrepareGroupForOnline( Group );
  3144. //
  3145. // Online what needs to be online.
  3146. //
  3147. // SS: Note that we ignore the error from FmpOnlineResourceList
  3148. // This is because at this point the onus of taking care of the group
  3149. // is with us.
  3150. //
  3151. FmpOnlineResourceList( ResourceList, Group );
  3152. FnExit:
  3153. FmpReleaseLocalGroupLock( Group );
  3154. ClRtlLogPrint(LOG_NOISE,
  3155. "[FM] FmpTakeGroupRequest: Exit for group <%1!ws!>, Status = %2!u!...\n",
  3156. OmObjectId(Group),
  3157. status);
  3158. return(status);
  3159. } // FmpTakeGroupRequest
  3160. DWORD
  3161. FmpUpdateChangeGroupName(
  3162. IN BOOL SourceNode,
  3163. IN LPCWSTR GroupId,
  3164. IN LPCWSTR NewName
  3165. )
  3166. /*++
  3167. Routine Description:
  3168. GUM dispatch routine for changing the friendly name of a group.
  3169. Arguments:
  3170. SourceNode - Supplies whether or not this node initiated the GUM update.
  3171. Not used.
  3172. ResourceId - Supplies the group ID.
  3173. NewName - Supplies the new friendly name.
  3174. Return Value:
  3175. ERROR_SUCCESS if successful.
  3176. Win32 error code otherwise.
  3177. --*/
  3178. {
  3179. PFM_GROUP Group;
  3180. DWORD Status;
  3181. //
  3182. // Chittur Subbaraman (chitturs) - 4/19/98
  3183. //
  3184. // If FM groups are not initialized or FM is shutting down, don't
  3185. // do anything.
  3186. //
  3187. if ( !FmpFMGroupsInited ||
  3188. FmpShutdown ) {
  3189. return(ERROR_SUCCESS);
  3190. }
  3191. Group = OmReferenceObjectById(ObjectTypeGroup, GroupId);
  3192. if (Group == NULL) {
  3193. return(ERROR_GROUP_NOT_FOUND);
  3194. }
  3195. Status = OmSetObjectName( Group, NewName);
  3196. if (Status == ERROR_SUCCESS) {
  3197. ClusterEvent(CLUSTER_EVENT_GROUP_PROPERTY_CHANGE, Group);
  3198. }
  3199. OmDereferenceObject(Group);
  3200. return(Status);
  3201. } // FmpUpdateChangeGroupName
  3202. BOOL
  3203. FmpEnumGroupNodeEvict(
  3204. IN PVOID Context1,
  3205. IN PVOID Context2,
  3206. IN PVOID Object,
  3207. IN LPCWSTR Name
  3208. )
  3209. /*++
  3210. Routine Description:
  3211. Group enumeration callback for removing node references when
  3212. a node is evicted.
  3213. Arguments:
  3214. Context1 - Supplies the node that is being evicted.
  3215. Context2 - not used
  3216. Object - Supplies a pointer to the group object
  3217. Name - Supplies the object name.
  3218. Return Value:
  3219. TRUE to continue enumeration
  3220. --*/
  3221. {
  3222. PFM_GROUP Group = (PFM_GROUP)Object;
  3223. PNM_NODE Node = (PNM_NODE)Context1;
  3224. PLIST_ENTRY listEntry;
  3225. PPREFERRED_ENTRY preferredEntry;
  3226. ClRtlLogPrint(LOG_NOISE,
  3227. "[FM] EnumGroupNodeEvict: Removing references to node %1!ws! from group %2!ws!\n",
  3228. OmObjectId(Node),
  3229. OmObjectId(Group));
  3230. FmpAcquireLocalGroupLock(Group);
  3231. //
  3232. // Walk the list of preferred owners. If this node is in the list, remove it.
  3233. //
  3234. for ( listEntry = Group->PreferredOwners.Flink;
  3235. listEntry != &(Group->PreferredOwners);
  3236. listEntry = listEntry->Flink ) {
  3237. preferredEntry = CONTAINING_RECORD( listEntry,
  3238. PREFERRED_ENTRY,
  3239. PreferredLinkage );
  3240. if ( preferredEntry->PreferredNode == Node ) {
  3241. RemoveEntryList(&preferredEntry->PreferredLinkage);
  3242. OmDereferenceObject(preferredEntry->PreferredNode);
  3243. LocalFree(preferredEntry);
  3244. break;
  3245. }
  3246. }
  3247. FmpReleaseLocalGroupLock(Group);
  3248. ClusterEvent(CLUSTER_EVENT_GROUP_PROPERTY_CHANGE, Group);
  3249. return(TRUE);
  3250. } // FmpEnumGroupNodeEvict
  3251. VOID FmpCheckForGroupCompletionEvent(
  3252. IN PFM_GROUP pGroup)
  3253. {
  3254. CLUSTER_GROUP_STATE GroupState;
  3255. //check the struct state to see if an event log needs to be logged
  3256. if (pGroup->dwStructState & FM_GROUP_STRUCT_MARKED_FOR_COMPLETION_EVENT)
  3257. {
  3258. //find the state of the group and log it
  3259. GroupState = FmpGetGroupState(pGroup, FALSE);
  3260. switch(GroupState)
  3261. {
  3262. case ClusterGroupOnline:
  3263. FmpLogGroupInfoEvent1( FM_EVENT_GROUP_COMPLETE_ONLINE, OmObjectName(pGroup));
  3264. //reset the state
  3265. pGroup->dwStructState &= ~FM_GROUP_STRUCT_MARKED_FOR_COMPLETION_EVENT;
  3266. break;
  3267. case ClusterGroupOffline:
  3268. FmpLogGroupInfoEvent1( FM_EVENT_GROUP_COMPLETE_OFFLINE, OmObjectName(pGroup));
  3269. //reset the state
  3270. pGroup->dwStructState &= ~FM_GROUP_STRUCT_MARKED_FOR_COMPLETION_EVENT;
  3271. break;
  3272. case ClusterGroupPartialOnline:
  3273. //SS: treat partial online as failing to bring a group completely online
  3274. FmpLogGroupInfoEvent1( FM_EVENT_GROUP_FAILED_ONLINE_OFFLINE, OmObjectName(pGroup));
  3275. //reset the state
  3276. pGroup->dwStructState &= ~FM_GROUP_STRUCT_MARKED_FOR_COMPLETION_EVENT;
  3277. break;
  3278. case ClusterGroupFailed:
  3279. FmpLogGroupInfoEvent1( FM_EVENT_GROUP_FAILED_ONLINE_OFFLINE, OmObjectName(pGroup));
  3280. //reset the state
  3281. pGroup->dwStructState &= ~FM_GROUP_STRUCT_MARKED_FOR_COMPLETION_EVENT;
  3282. break;
  3283. case ClusterGroupPending:
  3284. //it is not time to log an event as yet
  3285. //wait for another signal to log the event and reset the bit
  3286. break;
  3287. default:
  3288. ClRtlLogPrint(LOG_CRITICAL,
  3289. "[FM] FmpCheckForGroupCompletionEvent: The state %1!u! for Group %2!ws! is unexpected\r\n",
  3290. GroupState, OmObjectId(pGroup));
  3291. CL_ASSERT(FALSE);
  3292. }
  3293. }
  3294. }
  3295. VOID
  3296. FmpSignalGroupWaiters(
  3297. IN PFM_GROUP Group
  3298. )
  3299. /*++
  3300. Routine Description:
  3301. Wakes up any threads waiting for this group to achieve a
  3302. stable state.
  3303. Arguments:
  3304. Group - Supplies the group.
  3305. Return Value:
  3306. None.
  3307. --*/
  3308. {
  3309. PLIST_ENTRY ListEntry;
  3310. PFM_WAIT_BLOCK WaitBlock;
  3311. FmpAcquireLocalGroupLock( Group );
  3312. while (!IsListEmpty(&Group->WaitQueue)) {
  3313. ListEntry = RemoveHeadList(&Group->WaitQueue);
  3314. WaitBlock = CONTAINING_RECORD(ListEntry,
  3315. FM_WAIT_BLOCK,
  3316. ListEntry);
  3317. WaitBlock->Status = ERROR_SUCCESS;
  3318. SetEvent(WaitBlock->hEvent);
  3319. }
  3320. FmpReleaseLocalGroupLock( Group );
  3321. }
  3322. DWORD
  3323. FmpWaitForGroup(
  3324. IN PFM_GROUP Group
  3325. )
  3326. /*++
  3327. Routine Description:
  3328. Waits for a group to reach a stable state.
  3329. Arguments:
  3330. Group - supplies the group
  3331. Comments - Assumption, is that the group lock is held when this is called.
  3332. This function releases the group lock before the wait
  3333. Return Value:
  3334. ERROR_SUCCESS if successful
  3335. ERROR_SHUTDOWN_IN_PROGRESS if the cluster is being shutdown
  3336. Win32 error code otherwise
  3337. --*/
  3338. {
  3339. FM_WAIT_BLOCK WaitBlock;
  3340. HANDLE WaitArray[2];
  3341. DWORD Status;
  3342. CLUSTER_GROUP_STATE GroupState;
  3343. WaitBlock.hEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
  3344. if (WaitBlock.hEvent == NULL) {
  3345. FmpReleaseLocalGroupLock( Group );
  3346. return(GetLastError());
  3347. }
  3348. //
  3349. // Check to see if it transitioned before we got the lock.
  3350. //
  3351. GroupState = FmpGetGroupState( Group , TRUE );
  3352. if ((GroupState == ClusterGroupOffline) ||
  3353. (GroupState == ClusterGroupOnline) ||
  3354. (GroupState == ClusterGroupFailed)) {
  3355. CloseHandle( WaitBlock.hEvent );
  3356. FmpReleaseLocalGroupLock( Group );
  3357. return(ERROR_SUCCESS);
  3358. }
  3359. //
  3360. // Chittur Subbaraman (chitturs) - 10/31/1999
  3361. //
  3362. // Now before waiting, really make sure one or more resources in the
  3363. // group is in pending state.
  3364. //
  3365. GroupState = FmpGetGroupState( Group, FALSE );
  3366. if ( GroupState != ClusterGroupPending ) {
  3367. CloseHandle( WaitBlock.hEvent );
  3368. FmpReleaseLocalGroupLock( Group );
  3369. ClRtlLogPrint(LOG_NOISE,
  3370. "[FM] FmpWaitForGroup: Group <%1!ws!> state is %2!d!, not waiting for event...\r\n",
  3371. OmObjectName(Group),
  3372. GroupState );
  3373. return( ERROR_SUCCESS );
  3374. }
  3375. //
  3376. // Add this wait block to the queue.
  3377. //
  3378. InsertTailList(&Group->WaitQueue, &WaitBlock.ListEntry);
  3379. FmpReleaseLocalGroupLock( Group );
  3380. //
  3381. // Wait for the group to become stable or for the cluster to shutdown.
  3382. //
  3383. WaitArray[0] = FmpShutdownEvent;
  3384. WaitArray[1] = WaitBlock.hEvent;
  3385. Status = WaitForMultipleObjects(2, WaitArray, FALSE, INFINITE);
  3386. CloseHandle(WaitBlock.hEvent);
  3387. if (Status == 0) {
  3388. return(ERROR_SHUTDOWN_IN_PROGRESS);
  3389. } else {
  3390. return(WaitBlock.Status);
  3391. }
  3392. }
  3393. /****
  3394. @func DWORD | FmpDeleteGroup| This makes the gum call to delete the
  3395. group.
  3396. @parm IN PFM_GROUP | pGroup | The group that must be deleted.
  3397. @comm The group lock must be held when calling this api.
  3398. @rdesc Returns a result code. ERROR_SUCCESS on success.
  3399. ****/
  3400. DWORD
  3401. FmpDeleteGroup(
  3402. IN PFM_GROUP pGroup)
  3403. {
  3404. PCWSTR pszGroupId;
  3405. DWORD dwBufSize;
  3406. DWORD dwGroupLen;
  3407. DWORD dwStatus;
  3408. pszGroupId = OmObjectId( pGroup );
  3409. dwGroupLen = (lstrlenW(pszGroupId)+1) * sizeof(WCHAR);
  3410. //
  3411. // Send message.
  3412. //
  3413. dwStatus = GumSendUpdateEx(GumUpdateFailoverManager,
  3414. FmUpdateDeleteGroup,
  3415. 1,
  3416. dwGroupLen,
  3417. pszGroupId);
  3418. return(dwStatus);
  3419. }
  3420. VOID
  3421. FmpGroupLastReference(
  3422. IN PFM_GROUP pGroup
  3423. )
  3424. /*++
  3425. Routine Description:
  3426. Last dereference to group object processing routine.
  3427. All cleanup for a group should really be done here!
  3428. Arguments:
  3429. Resource - pointer the group being removed.
  3430. Return Value:
  3431. None.
  3432. --*/
  3433. {
  3434. if ( pGroup->OwnerNode != NULL )
  3435. OmDereferenceObject(pGroup->OwnerNode);
  3436. if (pGroup->dwStructState & FM_GROUP_STRUCT_CREATED)
  3437. DeleteCriticalSection(&pGroup->Lock);
  3438. return;
  3439. } // FmpGroupLastReference
  3440. DWORD
  3441. FmpDoMoveGroupOnFailure(
  3442. IN LPVOID pContext
  3443. )
  3444. /*++
  3445. Routine Description:
  3446. Move a group after ensuring that all resources in the group are
  3447. in stable state. This thread is forked from FmpHandleGroupFailure.
  3448. Arguments:
  3449. pContext - Pointer to the MOVE_GROUP structure to move.
  3450. Returns:
  3451. ERROR_SUCCESS.
  3452. --*/
  3453. {
  3454. PMOVE_GROUP pMoveGroup = ( PMOVE_GROUP ) pContext;
  3455. PFM_GROUP pGroup;
  3456. DWORD dwStatus;
  3457. PLIST_ENTRY pListEntry;
  3458. PFM_RESOURCE pResource;
  3459. //
  3460. // Chittur Subbaraman (chitturs) - 4/13/99
  3461. //
  3462. // This thread first waits until all the resources within the
  3463. // failed group are in stable state and then initiates the
  3464. // move.
  3465. //
  3466. pGroup = pMoveGroup->Group;
  3467. ClRtlLogPrint(LOG_NOISE,
  3468. "[FM] FmpDoMoveGroupOnFailure: Entry for Group <%1!ws!>...\n",
  3469. OmObjectId(pGroup));
  3470. TryAgain:
  3471. FmpAcquireLocalGroupLock( pGroup );
  3472. //
  3473. // This thread must yield if someone else takes responsibility for
  3474. // the move.
  3475. //
  3476. // Condition 1: Protects against the case in which someone moves
  3477. // the group to another node and back to you while this thread is
  3478. // sleeping (very rare, I agree).
  3479. //
  3480. // Condition 2: Protects against the common move case.
  3481. //
  3482. // Condition 3: Protects against the case in which the
  3483. // FmpMovePendingThread is waiting in FmpWaitForGroup while
  3484. // this thread got the resource lock and reached here.
  3485. //
  3486. if ( ( pGroup->dwStructState &
  3487. FM_GROUP_STRUCT_MARKED_FOR_REGULAR_MOVE ) ||
  3488. ( pGroup->OwnerNode != NmLocalNode ) ||
  3489. ( pGroup->MovingList != NULL ) )
  3490. {
  3491. ClRtlLogPrint(LOG_NOISE,
  3492. "[FM] FmpDoMoveGroupOnFailure: Group <%1!ws!> move being yielded to someone else who is moving it...\n",
  3493. OmObjectId(pGroup));
  3494. goto FnExit;
  3495. }
  3496. //
  3497. // If FM is shutting down, just exit.
  3498. //
  3499. if ( FmpShutdown )
  3500. {
  3501. ClRtlLogPrint(LOG_NOISE,
  3502. "[FM] FmpDoMoveGroupOnFailure: Giving up Group <%1!ws!> move. FM is shutting down ...\n",
  3503. OmObjectId(pGroup));
  3504. goto FnExit;
  3505. }
  3506. //
  3507. // If the group has been marked for delete, then also exit. This is
  3508. // just an optimization. FmpDoMoveGroup does this check also.
  3509. //
  3510. if ( !IS_VALID_FM_GROUP( pGroup ) )
  3511. {
  3512. ClRtlLogPrint(LOG_NOISE,
  3513. "[FM] FmpDoMoveGroupOnFailure: Group <%1!ws!> marked for delete. Exiting ...\n",
  3514. OmObjectId(pGroup));
  3515. goto FnExit;
  3516. }
  3517. //
  3518. // Wait until all resources within the group become stable.
  3519. //
  3520. for ( pListEntry = pGroup->Contains.Flink;
  3521. pListEntry != &(pGroup->Contains);
  3522. pListEntry = pListEntry->Flink )
  3523. {
  3524. pResource = CONTAINING_RECORD( pListEntry,
  3525. FM_RESOURCE,
  3526. ContainsLinkage );
  3527. if ( pResource->State > ClusterResourcePending )
  3528. {
  3529. FmpReleaseLocalGroupLock( pGroup );
  3530. Sleep ( 200 );
  3531. goto TryAgain;
  3532. }
  3533. }
  3534. //
  3535. // Initiate a move now that the group is quiet.
  3536. //
  3537. dwStatus = FmpDoMoveGroup( pGroup, NULL, FALSE );
  3538. ClRtlLogPrint(LOG_NOISE,
  3539. "[FM] FmpDoMoveGroupOnFailure: FmpDoMoveGroup returns %1!u!\n",
  3540. dwStatus);
  3541. FnExit:
  3542. LocalFree( pContext );
  3543. pGroup->dwStructState &=
  3544. ~( FM_GROUP_STRUCT_MARKED_FOR_MOVE_ON_FAIL | FM_GROUP_STRUCT_MARKED_FOR_REGULAR_MOVE );
  3545. FmpReleaseLocalGroupLock( pGroup );
  3546. OmDereferenceObject( pGroup );
  3547. ClRtlLogPrint(LOG_NOISE,
  3548. "[FM] FmpDoMoveGroupOnFailure Exit.\n");
  3549. return( ERROR_SUCCESS );
  3550. } // FmpDoMoveGroupOnFailure
  3551. /****
  3552. @func DWORD | FmpSetIntendedOwnerForGroup| This makes the gum call
  3553. to set the intended owner for the group before a move.
  3554. @parm IN PFM_GROUP | pGroup | The group whose intended owner
  3555. is to be set.
  3556. @comm The local group lock is held while making this call.
  3557. @rdesc Returns a result code. ERROR_SUCCESS on success.
  3558. ****/
  3559. DWORD FmpSetIntendedOwnerForGroup(
  3560. IN PFM_GROUP pGroup,
  3561. IN DWORD dwNodeId)
  3562. {
  3563. PCWSTR pszGroupId;
  3564. DWORD dwGroupLen;
  3565. DWORD dwStatus;
  3566. pszGroupId = OmObjectId( pGroup );
  3567. dwGroupLen = (lstrlenW(pszGroupId)+1) * sizeof(WCHAR);
  3568. //
  3569. // Send message.
  3570. //
  3571. dwStatus = GumSendUpdateEx(GumUpdateFailoverManager,
  3572. FmUpdateGroupIntendedOwner,
  3573. 2,
  3574. dwGroupLen,
  3575. pszGroupId,
  3576. sizeof(DWORD),
  3577. &dwNodeId
  3578. );
  3579. return(dwStatus);
  3580. }
  3581. /****
  3582. @func DWORD | FmpSetOwnerForGroup | On a move the new owner
  3583. node makes this gum call to inform all nodes that it
  3584. owns this particular group.
  3585. @parm IN PFM_GROUP | pGroup | The group whose owner must be set.
  3586. @parm IN PNM_NODE | pNode | The group's owner node.
  3587. @comm The local group lock is held while making this call.
  3588. @rdesc Returns a result code. ERROR_SUCCESS on success.
  3589. ****/
  3590. DWORD FmpSetOwnerForGroup(
  3591. IN PFM_GROUP pGroup,
  3592. IN PNM_NODE pNode
  3593. )
  3594. {
  3595. PCWSTR pszGroupId;
  3596. PCWSTR pszNodeId;
  3597. DWORD dwGroupLen;
  3598. DWORD dwNodeLen;
  3599. DWORD dwStatus;
  3600. pszGroupId = OmObjectId( pGroup );
  3601. dwGroupLen = (lstrlenW(pszGroupId)+1) * sizeof(WCHAR);
  3602. pszNodeId = OmObjectId(pNode);
  3603. dwNodeLen = (lstrlenW(pszNodeId)+1) * sizeof(WCHAR);
  3604. //
  3605. // Send message.
  3606. //
  3607. dwStatus = GumSendUpdateEx(GumUpdateFailoverManager,
  3608. FmUpdateCheckAndSetGroupOwner,
  3609. 2,
  3610. dwGroupLen,
  3611. pszGroupId,
  3612. dwNodeLen,
  3613. pszNodeId
  3614. );
  3615. return(dwStatus);
  3616. }
  3617. PNM_NODE
  3618. FmpGetNodeNotHostingUndesiredGroups(
  3619. IN PFM_GROUP pGroup,
  3620. IN BOOL fRuleOutLocalNode,
  3621. IN BOOL fChooseMostPreferredNode
  3622. )
  3623. /*++
  3624. Routine Description:
  3625. Find a preferred node that does not host groups with CLUSREG_NAME_GRP_ANTI_AFFINITY_CLASS_NAME
  3626. property set to the same value as the supplied group.
  3627. Arguments:
  3628. pGroup - Pointer to the group object we're checking.
  3629. fRuleOutLocalNode - Should the local node be considered or not.
  3630. fChooseMostPreferredNode - Should the most preferred node be chosen after antiaffinity needs
  3631. are satisfied ?
  3632. Return Value:
  3633. Pointer to node object that satisfies the anti-affinity condition.
  3634. NULL if a node cannot be not found.
  3635. Note:
  3636. The antiaffinity property value is defined as a MULTI_SZ property. However for this implementation
  3637. we ignore all the string values beyond the first value. The MULTI_SZ definition is to allow
  3638. future expansion of the algorithm implemented by this function.
  3639. --*/
  3640. {
  3641. PLIST_ENTRY plistEntry;
  3642. PPREFERRED_ENTRY pPreferredEntry;
  3643. GROUP_AFFINITY_NODE_INFO GroupAffinityNodeInfo;
  3644. PNM_NODE pNode = NULL;
  3645. DWORD dwIndex = 0, i, j;
  3646. DWORD dwClusterHighestVersion;
  3647. BOOL fFoundLocalNode = FALSE;
  3648. DWORD dwIndexStart = 0;
  3649. GroupAffinityNodeInfo.ppNmNodeList = NULL;
  3650. //
  3651. // Chittur Subbaraman (chitturs) - 3/6/2001
  3652. //
  3653. // This function works as follows. First, it makes a list of possible candidate nodes that the
  3654. // group can be hosted on. Next, it enumerates all groups in the cluster and for those
  3655. // groups that have the AntiAffinityClassName property set, it will remove those group's
  3656. // current owner nodes from the list of possible candidate nodes if they are present there.
  3657. // Note that this function will return a node only if the pruning has positively taken place.
  3658. // Else, it will return NULL.
  3659. //
  3660. // IMPORTANT NOTE: This function is called by all nodes from the node down processing FM
  3661. // GUM handler. For all nodes to reach exactly the same decision on the group placement,
  3662. // it is crucial that all nodes call this function for groups in exactly the same order.
  3663. // E.g., if node 1 was hosting groups A, B and C and it died, then all the remaining nodes
  3664. // must call this function first for group A, then for group B and finally for group C.
  3665. // This is because once group A is placed by this function, then group B's placement is
  3666. // influenced by group A's placement and similarly for groups B and C. This order is
  3667. // ensured since all nodes OM will maintain groups in the same order since OM creates this
  3668. // list based on enumerating the group key (under Cluster\Groups) and that must occur in the
  3669. // same order in all nodes.
  3670. //
  3671. //
  3672. // It is too bad that we can't hold any locks while enumerating groups and looking at the
  3673. // property field since that will soon result in a deadlock (since we can't hold group locks
  3674. // from within a GUM and this function is invoked from a GUM).
  3675. //
  3676. //
  3677. // If we are dealing with the mixed mode cluster or if the group does not have the antiaffinity
  3678. // property set, then don't do anything.
  3679. //
  3680. NmGetClusterOperationalVersion( &dwClusterHighestVersion,
  3681. NULL,
  3682. NULL );
  3683. if ( ( CLUSTER_GET_MAJOR_VERSION( dwClusterHighestVersion ) < NT51_MAJOR_VERSION ) ||
  3684. ( pGroup->lpszAntiAffinityClassName == NULL ) )
  3685. {
  3686. goto FnExit;
  3687. }
  3688. //
  3689. // Initialize the node list.
  3690. //
  3691. GroupAffinityNodeInfo.ppNmNodeList = LocalAlloc ( LPTR,
  3692. ClusterDefaultMaxNodes * sizeof ( PNM_NODE ) );
  3693. if ( GroupAffinityNodeInfo.ppNmNodeList == NULL )
  3694. {
  3695. ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpGetNodeNotHostingUndesiredGroups: Failed in alloc, Status %1!d!\n",
  3696. GetLastError());
  3697. goto FnExit;
  3698. }
  3699. //
  3700. // For each entry in the preferred list, find a system that is up and that does not
  3701. // host any groups with an anti-affinity to the supplied group.
  3702. //
  3703. for ( plistEntry = pGroup->PreferredOwners.Flink;
  3704. plistEntry != &(pGroup->PreferredOwners);
  3705. plistEntry = plistEntry->Flink )
  3706. {
  3707. pPreferredEntry = CONTAINING_RECORD( plistEntry,
  3708. PREFERRED_ENTRY,
  3709. PreferredLinkage );
  3710. if ( NmGetNodeState( pPreferredEntry->PreferredNode ) == ClusterNodeUp )
  3711. {
  3712. //
  3713. // If you are not required to choose the most preferred node, note down the
  3714. // index of the node next to the local node so that our search can begin
  3715. // from that index.
  3716. //
  3717. if ( ( fChooseMostPreferredNode == FALSE ) && ( fFoundLocalNode == TRUE ) )
  3718. {
  3719. fFoundLocalNode = FALSE;
  3720. dwIndexStart = dwIndex;
  3721. }
  3722. if ( pPreferredEntry->PreferredNode == NmLocalNode )
  3723. {
  3724. fFoundLocalNode = TRUE;
  3725. if ( fRuleOutLocalNode ) continue;
  3726. }
  3727. GroupAffinityNodeInfo.ppNmNodeList[dwIndex] = pPreferredEntry->PreferredNode;
  3728. dwIndex ++;
  3729. }// if
  3730. } // for
  3731. //
  3732. // Initialize the other fields in the GroupAffinityNodeInfo structure.
  3733. //
  3734. GroupAffinityNodeInfo.pGroup = pGroup;
  3735. GroupAffinityNodeInfo.fDidPruningOccur = FALSE;
  3736. //
  3737. // Enumerate all the groups and rule out nodes that host groups with the supplied
  3738. // anti-affinity property set.
  3739. //
  3740. OmEnumObjects ( ObjectTypeGroup,
  3741. FmpCheckForAntiAffinityProperty,
  3742. pGroup->lpszAntiAffinityClassName,
  3743. &GroupAffinityNodeInfo );
  3744. //
  3745. // No pruning occurred so far. So, don't proceed further and let the caller decide on
  3746. // a best node for the group using some other algorithm.
  3747. //
  3748. if ( GroupAffinityNodeInfo.fDidPruningOccur == FALSE )
  3749. {
  3750. goto FnExit;
  3751. }
  3752. //
  3753. // Now, pick the first node from the list that is a valid node. Note that the start index
  3754. // from which we start a search varies depending on the input parameter to this function.
  3755. //
  3756. j = dwIndexStart;
  3757. for ( i=0; i<ClusterDefaultMaxNodes; i++ )
  3758. {
  3759. if ( GroupAffinityNodeInfo.ppNmNodeList[j] != NULL )
  3760. {
  3761. pNode = GroupAffinityNodeInfo.ppNmNodeList[j];
  3762. ClRtlLogPrint(LOG_NOISE, "[FM] FmpGetNodeNotHostingUndesiredGroups: Choosing node %1!d! for group %2!ws! [%3!ws!]...\n",
  3763. NmGetNodeId(pNode),
  3764. OmObjectId(pGroup),
  3765. OmObjectName(pGroup));
  3766. goto FnExit;
  3767. }
  3768. j = ( j+1 ) % ClusterDefaultMaxNodes;
  3769. } // for
  3770. FnExit:
  3771. LocalFree( GroupAffinityNodeInfo.ppNmNodeList );
  3772. return( pNode );
  3773. } // FmpGetNodeNotHostingUndesiredGroups
  3774. BOOL
  3775. FmpCheckForAntiAffinityProperty(
  3776. IN LPCWSTR lpszAntiAffinityClassName,
  3777. IN PGROUP_AFFINITY_NODE_INFO pGroupAffinityNodeInfo,
  3778. IN PFM_GROUP pGroup,
  3779. IN LPCWSTR lpszGroupName
  3780. )
  3781. /*++
  3782. Routine Description:
  3783. Remove a node from the supplied node list if it hosts the supplied group with the supplied
  3784. anti-affinity property set.
  3785. Arguments:
  3786. lpszAntiAffinityClassName - The name property to check for.
  3787. pGroupAffinityNodeInfo - Structure containing a list of nodes that is to be pruned possibly.
  3788. pGroup - Supplies the group.
  3789. lpszGroupName - Supplies the group's name.
  3790. Return Value:
  3791. TRUE - to indicate that the enumeration should continue.
  3792. FALSE - to indicate that the enumeration should not continue.
  3793. --*/
  3794. {
  3795. DWORD i;
  3796. //
  3797. // If the supplied group has the anti-affinity property not set or if it has the
  3798. // property set but is not same as the one we are checking against or if it is same
  3799. // as the group we are interested in placing, then just return specifying that the
  3800. // enum should continue.
  3801. //
  3802. if ( ( pGroup->lpszAntiAffinityClassName == NULL ) ||
  3803. ( pGroup == pGroupAffinityNodeInfo->pGroup ) ||
  3804. ( lstrcmp ( lpszAntiAffinityClassName, pGroup->lpszAntiAffinityClassName ) != 0 ) )
  3805. {
  3806. goto FnExit;
  3807. }
  3808. //
  3809. // If you reached here, this means that the supplied group has the anti-affinity property
  3810. // set and is same as the property we are checking against. So, prune the node list.
  3811. //
  3812. for ( i=0; i<ClusterDefaultMaxNodes; i++ )
  3813. {
  3814. if ( ( pGroupAffinityNodeInfo->ppNmNodeList[i] != NULL ) &&
  3815. ( pGroup->OwnerNode == pGroupAffinityNodeInfo->ppNmNodeList[i] ) )
  3816. {
  3817. ClRtlLogPrint(LOG_NOISE, "[FM] FmpCheckForAntiAffinityProperty: Pruning node %1!d! for group %2!ws! due to "
  3818. "group %3!ws!, AntiAffinityClassName=%4!ws!...\n",
  3819. NmGetNodeId(pGroupAffinityNodeInfo->ppNmNodeList[i]),
  3820. OmObjectId(pGroupAffinityNodeInfo->pGroup),
  3821. OmObjectId(pGroup),
  3822. lpszAntiAffinityClassName);
  3823. pGroupAffinityNodeInfo->ppNmNodeList[i] = NULL;
  3824. //
  3825. // Mark that pruning was attempted.
  3826. //
  3827. pGroupAffinityNodeInfo->fDidPruningOccur = TRUE;
  3828. goto FnExit;
  3829. } // if
  3830. } // for
  3831. FnExit:
  3832. return( TRUE );
  3833. } // FmpCheckForAntiAffinityProperty
  3834. PNM_NODE
  3835. FmpPickNodeFromPreferredListAtRandom(
  3836. IN PFM_GROUP pGroup,
  3837. IN PNM_NODE pSuggestedPreferredNode OPTIONAL,
  3838. IN BOOL fRuleOutLocalNode,
  3839. IN BOOL fCheckForDisablingRandomization
  3840. )
  3841. /*++
  3842. Routine Description:
  3843. Find a preferred node for the group that is UP in a random fashion.
  3844. Arguments:
  3845. pGroup - Pointer to the group object we're interested in.
  3846. pSuggestedPreferredNode - Suggested fallback option in case this random result is undesired. OPTIONAL
  3847. fRuleOutLocalNode - Should the local node be ruled out from consideration.
  3848. fCheckForDisablingRandomization - Check whether randomization should be disabled.
  3849. Return Value:
  3850. The preferred node that is picked.
  3851. NULL if a node cannot be not found.
  3852. Comments:
  3853. This function is called from both FmpMoveGroup as well as from FmpNodeDown. In the former case,
  3854. we will have a non-NULL suggested preferred node, rule out local node option, check
  3855. for property setting disabling randomization and check for mixed mode clusters to disable
  3856. randomization. In the latter case, these parameters are the opposite.
  3857. --*/
  3858. {
  3859. UUID uuId;
  3860. USHORT usHashValue;
  3861. PNM_NODE pNode = NULL, pSelectedNode = pSuggestedPreferredNode;
  3862. DWORD dwNodeId;
  3863. DWORD dwRetry = 0;
  3864. DWORD dwStatus;
  3865. DWORD dwDisabled = 0;
  3866. DWORD dwClusterHighestVersion;
  3867. //
  3868. // Chittur Subbaraman (chitturs) - 4/18/2001
  3869. //
  3870. if ( fCheckForDisablingRandomization )
  3871. {
  3872. //
  3873. // If you are here, this means you are coming as a part of a user-initiated move.
  3874. // Check whether the randomization applies.
  3875. //
  3876. //
  3877. // First, check if are operating in a mixed version cluster. If so, don't randomize.
  3878. //
  3879. NmGetClusterOperationalVersion( &dwClusterHighestVersion,
  3880. NULL,
  3881. NULL );
  3882. if ( CLUSTER_GET_MAJOR_VERSION( dwClusterHighestVersion ) <
  3883. NT51_MAJOR_VERSION )
  3884. {
  3885. return ( pSelectedNode );
  3886. }
  3887. //
  3888. // Next check if the user has turned off the randomization algorithm by setting
  3889. // HKLM\Cluster\DisableGroupPreferredOwnersRandomization DWORD to 1.
  3890. //
  3891. dwStatus = DmQueryDword( DmClusterParametersKey,
  3892. CLUSREG_NAME_DISABLE_GROUP_PREFERRED_OWNER_RANDOMIZATION,
  3893. &dwDisabled,
  3894. NULL );
  3895. if ( ( dwStatus == ERROR_SUCCESS ) &&
  3896. ( dwDisabled == 1 ) )
  3897. {
  3898. return ( pSelectedNode );
  3899. }
  3900. }
  3901. //
  3902. // This function will attempt to pick a node at random from the group's preferred owners list
  3903. // in case the caller does not suggest a preferred node which is set by the user. So, first
  3904. // this function checks this case and bails out if the condition is met. Otherwise, it
  3905. // will generate a random number between 1 and NmMaxNodeId and see if (a) that node is in
  3906. // the group's preferred list, and (b) that node is UP. If so, it picks up the node. Note
  3907. // that the function will try 10 times to pick a node and then gives up. If no
  3908. // node is found, this function will return the suggested node which in some cases could be
  3909. // NULL.
  3910. //
  3911. ClRtlLogPrint(LOG_NOISE, "[FM] FmpPickNodeFromPreferredListAtRandom: Picking node for group %1!ws! [%2!ws!], suggested node %3!u!...\n",
  3912. OmObjectId(pGroup),
  3913. OmObjectName(pGroup),
  3914. (pSuggestedPreferredNode == NULL) ? 0:NmGetNodeId(pSuggestedPreferredNode));
  3915. if ( ( pSuggestedPreferredNode != NULL ) &&
  3916. ( FmpIsNodeUserPreferred ( pGroup, pSuggestedPreferredNode ) ) )
  3917. {
  3918. ClRtlLogPrint(LOG_NOISE, "[FM] FmpPickNodeFromPreferredListAtRandom: Node %2!u! for group %1!ws! is user preferred...\n",
  3919. OmObjectId(pGroup),
  3920. NmGetNodeId(pSuggestedPreferredNode));
  3921. goto FnExit;
  3922. }
  3923. if ( pGroup->lpszAntiAffinityClassName != NULL )
  3924. {
  3925. ClRtlLogPrint(LOG_NOISE, "[FM] FmpPickNodeFromPreferredListAtRandom: Group %1!ws! has antiaffinity property set...\n",
  3926. OmObjectId(pGroup));
  3927. goto FnExit;
  3928. }
  3929. //
  3930. // Retry 25 times so that we can have a good chance of getting a valid node. Note that we
  3931. // supply NmMaxNodeId to the srand() function and its value is equal to the node limit of
  3932. // 16. So, to get a valid node in a smaller size cluster, we have to have the retry count
  3933. // to be reasonable.
  3934. //
  3935. while ( dwRetry++ < 25 )
  3936. {
  3937. dwStatus = UuidFromString( ( LPWSTR ) OmObjectId(pGroup), &uuId );
  3938. if ( dwStatus != RPC_S_OK )
  3939. {
  3940. ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpPickNodeFromPreferredListAtRandom: Unable to get UUID from string %1!ws!, Status %2!u!...\n",
  3941. OmObjectId(pGroup),
  3942. dwStatus);
  3943. goto FnExit;
  3944. }
  3945. usHashValue = UuidHash( &uuId, &dwStatus );
  3946. if ( dwStatus != RPC_S_OK )
  3947. {
  3948. ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpPickNodeFromPreferredListAtRandom: Unable to get hash value for UUID %1!ws!, Status %2!u!...\n",
  3949. OmObjectId(pGroup),
  3950. dwStatus);
  3951. goto FnExit;
  3952. }
  3953. //
  3954. // Seed the random number generate with a value that is as random as it gets.
  3955. //
  3956. srand( GetTickCount() * usHashValue * ( dwRetry + 1 ) );
  3957. //
  3958. // Find the node ID that is between ClusterMinNodeId and NmMaxNodeId. We use NmMaxNodeId
  3959. // here since there is no simple way to get the count of configured nodes. Note that we
  3960. // have to ensure that the node ID falls within this range, otherwise assertion trips
  3961. // in NmReferenceNodeById.
  3962. //
  3963. dwNodeId = ( DWORD ) ( ( double ) rand() / ( double ) ( RAND_MAX ) * NmMaxNodeId ) + 1;
  3964. if ( dwNodeId > NmMaxNodeId ) dwNodeId = NmMaxNodeId;
  3965. if ( dwNodeId < ClusterMinNodeId ) dwNodeId = ClusterMinNodeId;
  3966. //
  3967. // In case the caller asks you to rule out local node, do so.
  3968. //
  3969. if ( ( fRuleOutLocalNode ) && ( dwNodeId == NmLocalNodeId ) ) continue;
  3970. //
  3971. // Reference and dereference the node objects. Note that we are only interested in
  3972. // getting a pointer to the node object and we use the fact that the node in the preferred
  3973. // list must be referenced.
  3974. //
  3975. pNode = NmReferenceNodeById ( dwNodeId );
  3976. if ( pNode == NULL ) continue;
  3977. if ( ( FmpInPreferredList( pGroup, pNode, FALSE, NULL ) ) &&
  3978. ( NmGetExtendedNodeState( pNode ) == ClusterNodeUp ) )
  3979. {
  3980. pSelectedNode = pNode;
  3981. break;
  3982. }
  3983. OmDereferenceObject ( pNode );
  3984. pNode = NULL;
  3985. }// while
  3986. FnExit:
  3987. if ( pNode != NULL ) OmDereferenceObject ( pNode );
  3988. ClRtlLogPrint(LOG_NOISE, "[FM] FmpPickNodeFromPreferredListAtRandom: Selected node %2!u! for group %1!ws!...\n",
  3989. OmObjectId(pGroup),
  3990. (pSelectedNode == NULL) ? 0:NmGetNodeId(pSelectedNode));
  3991. return ( pSelectedNode );
  3992. }// FmpPickNodeFromPreferredNodeAtRandom
  3993. BOOL
  3994. FmpIsNodeUserPreferred(
  3995. IN PFM_GROUP pGroup,
  3996. IN PNM_NODE pPreferredNode
  3997. )
  3998. /*++
  3999. Routine Description:
  4000. Check whether the supplied node is set as a preferred node by the user.
  4001. Arguments:
  4002. pGroup - Pointer to the group object we're interested in.
  4003. pPreferredNode - Preferred node to check for.
  4004. Return Value:
  4005. TRUE - The supplied preferred node is user set.
  4006. FALSE otherwise
  4007. --*/
  4008. {
  4009. DWORD dwStatus;
  4010. BOOL fPreferredByUser = FALSE;
  4011. LPWSTR lpmszPreferredNodeList = NULL;
  4012. LPCWSTR lpszPreferredNode;
  4013. DWORD cbPreferredNodeList = 0;
  4014. DWORD cbBuffer = 0;
  4015. DWORD dwIndex;
  4016. PNM_NODE pNode;
  4017. //
  4018. // Look for any preferred owners set by the user
  4019. //
  4020. dwStatus = DmQueryMultiSz( pGroup->RegistryKey,
  4021. CLUSREG_NAME_GRP_PREFERRED_OWNERS,
  4022. &lpmszPreferredNodeList,
  4023. &cbBuffer,
  4024. &cbPreferredNodeList );
  4025. if ( dwStatus != ERROR_SUCCESS )
  4026. {
  4027. goto FnExit;
  4028. }
  4029. //
  4030. // Parse the multisz and check whether the supplied node exists in the list
  4031. //
  4032. for ( dwIndex = 0; ; dwIndex++ )
  4033. {
  4034. lpszPreferredNode = ClRtlMultiSzEnum( lpmszPreferredNodeList,
  4035. cbPreferredNodeList/sizeof(WCHAR),
  4036. dwIndex );
  4037. if ( lpszPreferredNode == NULL )
  4038. {
  4039. break;
  4040. }
  4041. pNode = OmReferenceObjectById( ObjectTypeNode,
  4042. lpszPreferredNode );
  4043. if ( pNode == NULL )
  4044. {
  4045. ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpIsNodeUserPreferred: Unable to reference node %1!ws!, Status %2!u!...\n",
  4046. lpszPreferredNode,
  4047. dwStatus);
  4048. continue;
  4049. }
  4050. if ( pNode == pPreferredNode )
  4051. {
  4052. fPreferredByUser = TRUE;
  4053. OmDereferenceObject ( pNode );
  4054. break;
  4055. }
  4056. OmDereferenceObject ( pNode );
  4057. } // for
  4058. FnExit:
  4059. LocalFree ( lpmszPreferredNodeList );
  4060. return ( fPreferredByUser );
  4061. }// FmpIsNodeUserPreferred
  4062. DWORD
  4063. FmpPrepareGroupNodeList(
  4064. OUT PFM_GROUP_NODE_LIST *ppGroupNodeList
  4065. )
  4066. /*++
  4067. Routine Description:
  4068. Prepares a buffer containing the group ID and preferred owner node ID of all groups.
  4069. Arguments:
  4070. ppGroupNodeList - Pointer to a buffer containing group IDs and preferred nodes.
  4071. Return Value:
  4072. ERROR_SUCCESS on success
  4073. Win32 error code otherwise
  4074. --*/
  4075. {
  4076. DWORD cbBuffer = 512; // Let us try a 512 byte buffer to start with.
  4077. DWORD dwStatus;
  4078. DWORD dwDisabled = 0;
  4079. //
  4080. // First check if the user has turned off the randomization algorithm by setting
  4081. // HKLM\Cluster\DisableGroupPreferredOwnersRandomization DWORD to 1.
  4082. //
  4083. dwStatus = DmQueryDword( DmClusterParametersKey,
  4084. CLUSREG_NAME_DISABLE_GROUP_PREFERRED_OWNER_RANDOMIZATION,
  4085. &dwDisabled,
  4086. NULL );
  4087. if ( ( dwStatus == ERROR_SUCCESS ) &&
  4088. ( dwDisabled == 1 ) )
  4089. {
  4090. dwStatus = ERROR_CLUSTER_INVALID_REQUEST;
  4091. return ( dwStatus );
  4092. }
  4093. //
  4094. // This function allocates contiguous memory for a list so that the entire buffer
  4095. // can be passed on to GUM.
  4096. //
  4097. *ppGroupNodeList = LocalAlloc( LPTR, cbBuffer );
  4098. if ( *ppGroupNodeList == NULL )
  4099. {
  4100. dwStatus = GetLastError();
  4101. ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpPrepareGroupNodeList: Memory alloc failed, Status %1!u!...\n",
  4102. dwStatus);
  4103. return ( dwStatus );
  4104. }
  4105. //
  4106. // Initialize the size of the list to the size of the header minus first element.
  4107. //
  4108. ( *ppGroupNodeList )->cbGroupNodeList = sizeof ( FM_GROUP_NODE_LIST ) -
  4109. sizeof ( FM_GROUP_NODE_LIST_ENTRY );
  4110. //
  4111. // Enumerate all the groups, find a possibly random preferred owner for each group and
  4112. // return all the info in the buffer.
  4113. //
  4114. return OmEnumObjects ( ObjectTypeGroup,
  4115. FmpAddGroupNodeToList,
  4116. ppGroupNodeList,
  4117. &cbBuffer );
  4118. }// FmpPrepareGroupNodeList
  4119. DWORD
  4120. FmpAddGroupNodeToList(
  4121. IN PFM_GROUP_NODE_LIST *ppGroupNodeList,
  4122. IN LPDWORD pcbBuffer,
  4123. IN PFM_GROUP pGroup,
  4124. IN LPCWSTR lpszGroupId
  4125. )
  4126. /*++
  4127. Routine Description:
  4128. Find a random preferred owner for the given group and add the info to a buffer.
  4129. Arguments:
  4130. ppGroupNodeList - Pointer to a buffer containing group IDs and preferred nodes.
  4131. pcbBuffer - Size of the buffer.
  4132. pGroup - Group whose preferred node is to be found.
  4133. lpszGroupId - ID of the group.
  4134. Return Value:
  4135. ERROR_SUCCESS on success
  4136. Win32 error code otherwise
  4137. --*/
  4138. {
  4139. PNM_NODE pNode;
  4140. PFM_GROUP_NODE_LIST_ENTRY pGroupNodeListEntry;
  4141. PFM_GROUP_NODE_LIST pBuffer;
  4142. PLIST_ENTRY pListEntry;
  4143. DWORD dwStatus;
  4144. BOOL fLocked;
  4145. DWORD dwRetryCount = 10; // 5 secs retry
  4146. //
  4147. // Try to get the group lock since you access group lists here. Note that you won't be
  4148. // able to get the group lock in case some resource is stuck waiting for the quorum resource
  4149. // to come online and this thread called as a part of FmpNodeDown is responsible for sending
  4150. // the node down GUM which in turn will bring the quorum group online. In such a case, do not
  4151. // add this group to the list. The FM node down GUM handler will handle cases in which one
  4152. // or more groups is not in the supplied list and will fallback to the static preferred
  4153. // owners list for deciding on a group owner.
  4154. //
  4155. try_acquire_lock:
  4156. FmpTryAcquireLocalGroupLock( pGroup, fLocked );
  4157. if ( fLocked == FALSE )
  4158. {
  4159. if ( dwRetryCount == 0 )
  4160. {
  4161. ClRtlLogPrint(LOG_UNUSUAL,
  4162. "[FM] FmpAddGroupNodeToList: Can't get lock for group %1!ws! [%2!ws!], skip including the group in list...\n",
  4163. OmObjectId(pGroup),
  4164. OmObjectName(pGroup));
  4165. return ( TRUE );
  4166. }
  4167. dwRetryCount --;
  4168. Sleep( 500 );
  4169. goto try_acquire_lock;
  4170. }
  4171. CL_ASSERT( fLocked == TRUE );
  4172. //
  4173. // Skip the quorum group since we cannot randomize its preferred owners list since MM has a
  4174. // choke hold on the placement of quorum group.
  4175. //
  4176. if ( pGroup == gpQuoResource->Group ) goto FnExit;
  4177. //
  4178. // Try to pick a preferred node list for the group at random.
  4179. //
  4180. pNode = FmpPickNodeFromPreferredListAtRandom( pGroup,
  4181. NULL, // No suggested preferred owner
  4182. FALSE, // Can choose local node
  4183. FALSE ); // Check whether randomization should be
  4184. // disabled
  4185. //
  4186. // If no node could be picked, bail out
  4187. //
  4188. if ( pNode == NULL ) goto FnExit;
  4189. //
  4190. // Check whether the allocated buffer is big enough to hold the new entry. Note that the
  4191. // RHS of the equality need not contain the NULL char size since we allocate 1 WCHAR for it in
  4192. // the FM_GROUP_NODE_LIST_ENTRY structure. Also, note that we have to see if the current
  4193. // buffer size is big enough to hold the padding for DWORD alignment.
  4194. //
  4195. if ( *pcbBuffer < ( ( *ppGroupNodeList )->cbGroupNodeList +
  4196. ( sizeof ( FM_GROUP_NODE_LIST_ENTRY ) +
  4197. lstrlenW ( lpszGroupId ) * sizeof ( WCHAR ) +
  4198. sizeof ( DWORD ) - 1
  4199. ) & ~( sizeof ( DWORD ) - 1 )
  4200. ) )
  4201. {
  4202. //
  4203. // Reallocate a bigger buffer
  4204. //
  4205. pBuffer = LocalAlloc( LPTR, 2 * ( *pcbBuffer ) );
  4206. if ( pBuffer == NULL )
  4207. {
  4208. dwStatus = GetLastError();
  4209. ClRtlLogPrint(LOG_CRITICAL, "[FM] FmpAddGroupNodeToList: Memory alloc failed, Status %1!u!...\n",
  4210. dwStatus);
  4211. goto FnExit;
  4212. }
  4213. ( *pcbBuffer ) *= 2;
  4214. //
  4215. // Copy the contents of the old list to the new list.
  4216. //
  4217. CopyMemory( pBuffer, *ppGroupNodeList, ( *ppGroupNodeList )->cbGroupNodeList );
  4218. LocalFree ( *ppGroupNodeList );
  4219. *ppGroupNodeList = pBuffer;
  4220. }
  4221. //
  4222. // Find the pointer to the beginning of the new list entry
  4223. //
  4224. pGroupNodeListEntry = ( PFM_GROUP_NODE_LIST_ENTRY )
  4225. ( ( LPBYTE ) ( *ppGroupNodeList ) +
  4226. ( *ppGroupNodeList )->cbGroupNodeList );
  4227. //
  4228. // Adjust the size of the list. As above, size of NULL char is excluded. Align the length
  4229. // to a multiple of DWORD since we want the PFM_GROUP_NODE_LIST_ENTRY structure to be
  4230. // DWORD aligned since the structure starts with a DWORD.
  4231. //
  4232. ( *ppGroupNodeList )->cbGroupNodeList += ( sizeof ( FM_GROUP_NODE_LIST_ENTRY ) +
  4233. lstrlenW ( lpszGroupId ) * sizeof ( WCHAR ) +
  4234. sizeof ( DWORD ) - 1 ) & ~( sizeof ( DWORD ) - 1 );
  4235. //
  4236. // Set the contents of the list entry
  4237. //
  4238. pGroupNodeListEntry->dwPreferredNodeId = NmGetNodeId ( pNode );
  4239. lstrcpy( pGroupNodeListEntry->szGroupId, lpszGroupId );
  4240. FnExit:
  4241. FmpReleaseLocalGroupLock( pGroup );
  4242. return ( TRUE );
  4243. }// FmpPrepareGroupNodeList
  4244. PNM_NODE
  4245. FmpParseGroupNodeListForPreferredOwner(
  4246. IN PFM_GROUP pGroup,
  4247. IN PFM_GROUP_NODE_LIST pGroupNodeList,
  4248. IN PNM_NODE pSuggestedPreferredNode
  4249. )
  4250. /*++
  4251. Routine Description:
  4252. Parse the supplied group node list looking for a preferred node for the supplied group.
Arguments:

    pGroup - The group whose preferred node must be found.

    pGroupNodeList - The list contains preferred nodes of the group.

    pSuggestedPreferredNode - Suggested preferred node fallback option.

Return Value:

    The preferred node for the group.

--*/
{
    PNM_NODE pSelectedNode = pSuggestedPreferredNode;   // default: fall back to the suggestion
    PFM_GROUP_NODE_LIST_ENTRY pGroupNodeListEntry;
    BOOL fFoundGroup = FALSE;
    PNM_NODE pNode = NULL;                              // referenced node, released at FnExit
    DWORD dwStatus;
    DWORD cbGroupNodeList;

    //
    // If the suggested node is user preferred or if it has an anti-affinity class name
    // property set, don't do anything else. Just return the suggested owner.
    //
    if ( ( FmpIsNodeUserPreferred ( pGroup, pSuggestedPreferredNode ) ) ||
         ( pGroup->lpszAntiAffinityClassName != NULL ) )
    {
        ClRtlLogPrint(LOG_NOISE, "[FM] FmpParseGroupNodeListForPreferredOwner: Node %2!u! for group %1!ws! is user preferred/antiaffinity property set...\n",
                      OmObjectId(pGroup),
                      NmGetNodeId(pSuggestedPreferredNode));
        goto FnExit;
    }

    //
    // Start the byte cursor just past the fixed list header; the
    // variable-length entries follow it.
    //
    cbGroupNodeList = sizeof ( FM_GROUP_NODE_LIST ) -
                      sizeof ( FM_GROUP_NODE_LIST_ENTRY );

    //
    // Walk the supplied list looking for the group entry.
    //
    while ( cbGroupNodeList < pGroupNodeList->cbGroupNodeList )
    {
        pGroupNodeListEntry = ( PFM_GROUP_NODE_LIST_ENTRY ) ( ( LPBYTE ) pGroupNodeList +
                                                              cbGroupNodeList );

        if ( lstrcmp( pGroupNodeListEntry->szGroupId, OmObjectId( pGroup ) ) == 0 )
        {
            fFoundGroup = TRUE;
            break;
        }

        //
        // Advance past this entry: fixed entry size plus the group id
        // characters, rounded up to DWORD alignment.
        //
        cbGroupNodeList += ( sizeof ( FM_GROUP_NODE_LIST_ENTRY ) +
                             lstrlenW ( pGroupNodeListEntry->szGroupId ) * sizeof ( WCHAR ) +
                             sizeof ( DWORD ) - 1 ) & ~( sizeof ( DWORD ) - 1 );
    } // while

    //
    // Fallback to the suggested option if:
    // (1) You did not find the group in the list
    // (2) The preferred node for the group is invalid in the list
    // (3) The preferred node for the group is down
    //
    if ( fFoundGroup == FALSE )
    {
        ClRtlLogPrint(LOG_NOISE, "[FM] FmpParseGroupNodeListForPreferredOwner: Did not find group %1!ws! in supplied list...\n",
                      OmObjectId(pGroup));
        goto FnExit;
    }

    if ( ( pGroupNodeListEntry->dwPreferredNodeId == 0 ) ||
         ( pGroupNodeListEntry->dwPreferredNodeId > NmMaxNodeId ) )
    {
        //
        // NOTE(review): format string reuses insert %1 for both the node id
        // (!u!) and the group id (!ws!); the second %1!ws! will format the
        // node id as a wide string -- likely should read %2!ws! with the
        // group id as the second insert.
        //
        ClRtlLogPrint(LOG_NOISE, "[FM] FmpParseGroupNodeListForPreferredOwner: Invalid node %1!u! for group %1!ws! in supplied list...\n",
                      pGroupNodeListEntry->dwPreferredNodeId,
                      OmObjectId(pGroup));
        goto FnExit;
    }

    pNode = NmReferenceNodeById( pGroupNodeListEntry->dwPreferredNodeId );

    if ( pNode == NULL )
    {
        dwStatus = GetLastError();
        //
        // NOTE(review): insert %2 is never referenced and %1 is used with
        // both !u! and !ws! -- the group id insert is likely meant to be
        // %2!ws! here as well.
        //
        ClRtlLogPrint(LOG_UNUSUAL, "[FM] FmpParseGroupNodeListForPreferredOwner: Unable to reference node %1!u! for group %1!ws!, Status %3!u!...\n",
                      pGroupNodeListEntry->dwPreferredNodeId,
                      OmObjectId(pGroup),
                      dwStatus);
        goto FnExit;
    }

    if ( NmGetNodeState( pNode ) != ClusterNodeUp )
    {
        //
        // NOTE(review): same %1 reuse as above -- group id insert likely
        // should be %2!ws!.
        //
        ClRtlLogPrint(LOG_UNUSUAL, "[FM] FmpParseGroupNodeListForPreferredOwner: Preferred node %1!u! for group %1!ws! is not UP...\n",
                      pGroupNodeListEntry->dwPreferredNodeId,
                      OmObjectId(pGroup));
        goto FnExit;
    }

    //
    // The list supplied a valid, up node -- prefer it over the suggestion.
    //
    pSelectedNode = pNode;

    ClRtlLogPrint(LOG_NOISE, "[FM] FmpParseGroupNodeListForPreferredOwner: Selected node %1!u! for group %2!ws! from supplied randomized list...\n",
                  pGroupNodeListEntry->dwPreferredNodeId,
                  OmObjectId(pGroup));

FnExit:
    //
    // Dereference the node object since we depend on the original reference added to the
    // group's preferred owner when it was added to the group structure.
    //
    if ( pNode != NULL ) OmDereferenceObject( pNode );

    return ( pSelectedNode );
}// FmpParseGroupNodeListForPreferredOwner
  4346. VOID
  4347. FmpNotifyGroupStateChangeReason(
  4348. IN PFM_GROUP pGroup,
  4349. IN CLUSTER_RESOURCE_STATE_CHANGE_REASON eReason
  4350. )
  4351. /*++
  4352. Routine Description:
  4353. Notify a resource DLL about the reason for a state change.
  4354. Arguments:
  4355. pGroup - The group whose resources must be notified of the state change reason.
  4356. eReason - The reason for the state change.
  4357. Returns:
  4358. None.
  4359. Comments:
  4360. This function MUST be called with local group lock held.
  4361. --*/
  4362. {
  4363. PLIST_ENTRY pListEntry;
  4364. PFM_RESOURCE pResource;
  4365. ClRtlLogPrint(LOG_NOISE, "[FM] FmpNotifyGroupStateChangeReason: Notifying group %1!ws! [%2!ws!] of state change reason %3!u!...\n",
  4366. OmObjectName(pGroup),
  4367. OmObjectId(pGroup),
  4368. eReason);
  4369. //
  4370. // Walk the group contains list and attempt to notify each resource of the state change reason.
  4371. //
  4372. for ( pListEntry = pGroup->Contains.Flink;
  4373. pListEntry != &(pGroup->Contains );
  4374. pListEntry = pListEntry->Flink )
  4375. {
  4376. pResource = CONTAINING_RECORD( pListEntry,
  4377. FM_RESOURCE,
  4378. ContainsLinkage );
  4379. FmpNotifyResourceStateChangeReason( pResource, eReason );
  4380. } // for
  4381. }// FmpNotifyGroupStateChangeReason