Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1377 lines
35 KiB

  1. /*++
  2. Copyright (c) 1996 Microsoft Corporation
  3. Module Name:
  4. join.c
  5. Abstract:
  6. This module handles the initialization path where a newly booted
  7. node joins an existing cluster.
  8. Author:
  9. John Vert (jvert) 6/6/1996
  10. Revision History:
  11. --*/
  12. #include "initp.h"
  13. #include "lmcons.h"
  14. #include "lmremutl.h"
  15. #include "lmapibuf.h"
  16. #include <clusverp.h>
  17. //
  18. // Local types
  19. //
//
// Pairs two strings describing a join sponsor candidate. This type is not
// referenced by the routines visible in this part of the file.
// NOTE(review): presumably Name identifies the sponsor and NetworkId the
// network used to reach it -- confirm against the code that consumes it.
//
typedef struct {
    LPWSTR Name;
    LPWSTR NetworkId;
} JOIN_SPONSOR_CONTEXT, *PJOIN_SPONSOR_CONTEXT;
  24. //
  25. // Local data
  26. //
// Guards CsJoinThreadCount and the sponsor selection below. Initialized in
// ClusterJoin and deleted by whichever thread drops CsJoinThreadCount to zero.
CRITICAL_SECTION    CsJoinLock;

// Manual-reset event created in ClusterJoin. Signaled when a connect thread
// selects a sponsor or when the thread count drops to one.
HANDLE              CsJoinEvent = NULL;

// Number of threads still participating in the join: one for the ClusterJoin
// caller plus one per connect thread spawned by JoinpConnectToSponsor.
DWORD               CsJoinThreadCount = 0;

// Set to ERROR_CLUSTER_INCOMPATIBLE_VERSIONS by VerifyJoinVersionData when a
// sponsor was reached but the versions did not match.
DWORD               CsJoinStatus=ERROR_SUCCESS;

// Binding and name of the first sponsor a connect thread reached
// successfully; consumed and released by ClusterJoin.
RPC_BINDING_HANDLE  CsJoinSponsorBinding = NULL;
LPWSTR              CsJoinSponsorName = NULL;
  33. //
  34. // Local function prototypes
  35. //
  36. VOID
  37. JoinpEnumNodesAndJoinByAddress(
  38. IN HDMKEY Key,
  39. IN PWSTR NodeId,
  40. IN PVOID Context
  41. );
  42. VOID
  43. JoinpEnumNodesAndJoinByHostName(
  44. IN HDMKEY Key,
  45. IN PWSTR NodeId,
  46. IN PVOID Context
  47. );
  48. VOID
  49. JoinpConnectToSponsor(
  50. IN PWSTR SponsorName
  51. );
  52. DWORD WINAPI
  53. JoinpConnectThread(
  54. LPVOID Parameter
  55. );
  56. DWORD
  57. JoinpAttemptJoin(
  58. LPWSTR SponsorName,
  59. RPC_BINDING_HANDLE JoinMasterBinding
  60. );
  61. BOOL
  62. JoinpAddNodeCallback(
  63. IN PVOID Context1,
  64. IN PVOID Context2,
  65. IN PVOID Object,
  66. IN LPCWSTR Name
  67. );
  68. BOOL
  69. JoinpEnumNetworksToSetPriority(
  70. IN PVOID Context1,
  71. IN PVOID Context2,
  72. IN PVOID Object,
  73. IN LPCWSTR Name
  74. );
  75. DWORD
  76. ClusterJoin(
  77. VOID
  78. )
  79. /*++
  80. Routine Description:
  81. Called to attempt to join a cluster that already exists.
  82. Arguments:
  83. None
  84. Return Value:
  85. ERROR_SUCCESS if successful
  86. Win32 error code otherwise.
  87. --*/
  88. {
  89. DWORD Status;
  90. LPWSTR ClusterIpAddress = NULL;
  91. LPWSTR ClusIpAddrResource = NULL;
  92. LPWSTR ClusterNameId = NULL;
  93. DWORD idMaxSize = 0;
  94. DWORD idSize = 0;
  95. HDMKEY hClusNameResKey = NULL;
  96. HDMKEY hClusIPAddrResKey = NULL;
  97. //
  98. // Try connecting using the cluster IP address first. get the cluster
  99. // name resource, looking up its dependency for the cluster IP addr
  100. //
  101. Status = DmQuerySz(DmClusterParametersKey,
  102. CLUSREG_NAME_CLUS_CLUSTER_NAME_RES,
  103. &ClusterNameId,
  104. &idMaxSize,
  105. &idSize);
  106. if (Status != ERROR_SUCCESS) {
  107. ClRtlLogPrint(LOG_CRITICAL,
  108. "[JOIN] failed to get cluster name resource, error %1!u!.\n",
  109. Status);
  110. goto error_exit;
  111. }
  112. //
  113. // open name resource key and read its DependsOn key
  114. //
  115. hClusNameResKey = DmOpenKey( DmResourcesKey, ClusterNameId, KEY_READ );
  116. if ( hClusNameResKey == NULL ) {
  117. Status = GetLastError();
  118. ClRtlLogPrint(LOG_CRITICAL,
  119. "[JOIN] failed to open Cluster Name resource key, error %1!u!.\n",
  120. Status);
  121. goto error_exit;
  122. }
  123. //
  124. // allocate enough space for the GUID and the Parameters string
  125. //
  126. idMaxSize = ( CS_NETWORK_ID_LENGTH + sizeof( CLUSREG_KEYNAME_PARAMETERS ) + 2)
  127. * sizeof(WCHAR);
  128. ClusIpAddrResource = LocalAlloc( LMEM_FIXED, idMaxSize );
  129. if ( ClusIpAddrResource == NULL ) {
  130. Status = ERROR_NOT_ENOUGH_MEMORY;
  131. ClRtlLogPrint(LOG_CRITICAL,
  132. "[JOIN] no memory for Cluster Ip address resource ID!\n");
  133. goto error_exit;
  134. }
  135. Status = DmQueryMultiSz(hClusNameResKey,
  136. CLUSREG_NAME_RES_DEPENDS_ON,
  137. &ClusIpAddrResource,
  138. &idMaxSize,
  139. &idSize);
  140. if ( Status != ERROR_SUCCESS ) {
  141. ClRtlLogPrint(LOG_CRITICAL,
  142. "[JOIN] failed to get Cluster Ip address resource ID, error %1!u!.\n",
  143. Status);
  144. goto error_exit;
  145. }
  146. lstrcatW( ClusIpAddrResource, L"\\" );
  147. lstrcatW( ClusIpAddrResource, CLUSREG_KEYNAME_PARAMETERS );
  148. hClusIPAddrResKey = DmOpenKey( DmResourcesKey, ClusIpAddrResource, KEY_READ );
  149. if ( hClusIPAddrResKey == NULL ) {
  150. Status = GetLastError();
  151. ClRtlLogPrint(LOG_CRITICAL,
  152. "[JOIN] failed to open Cluster IP Address resource key, error %1!u!.\n",
  153. Status);
  154. goto error_exit;
  155. }
  156. //
  157. // get the IP Address; note that these value names are not defined
  158. // in a global way. if they are changed, this code will break
  159. //
  160. idMaxSize = idSize = 0;
  161. Status = DmQuerySz(hClusIPAddrResKey,
  162. L"Address",
  163. &ClusterIpAddress,
  164. &idMaxSize,
  165. &idSize);
  166. if ( Status != ERROR_SUCCESS ) {
  167. ClRtlLogPrint(LOG_CRITICAL,
  168. "[JOIN] failed to get Cluster Ip address, error %1!u!.\n",
  169. Status);
  170. goto error_exit;
  171. }
  172. //
  173. // Spawn threads to find a sponsor. We will try the make connections using
  174. // the cluster IP address, the IP address of each node on each network, and
  175. // the name of each node in the cluster. The connects will proceed in
  176. // parallel. We'll use the first one that succeeds.
  177. //
  178. CsJoinEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
  179. if (CsJoinEvent == NULL) {
  180. Status = GetLastError();
  181. ClRtlLogPrint(LOG_CRITICAL,
  182. "[JOIN] failed to create join event, error %1!u!.\n",
  183. Status
  184. );
  185. goto error_exit;
  186. }
  187. CsJoinThreadCount = 1;
  188. InitializeCriticalSection(&CsJoinLock);
  189. EnterCriticalSection(&CsJoinLock);
  190. DmEnumKeys(DmNetInterfacesKey, JoinpEnumNodesAndJoinByAddress, NULL);
  191. DmEnumKeys(DmNodesKey, JoinpEnumNodesAndJoinByHostName, NULL);
  192. //
  193. // give the other threads a chance to start since using the cluster IP
  194. // address to join with is problematic when the resource moves in the
  195. // middle of a join
  196. //
  197. Sleep( 1000 );
  198. JoinpConnectToSponsor(ClusterIpAddress);
  199. //update status for scm
  200. CsServiceStatus.dwCheckPoint++;
  201. CsAnnounceServiceStatus();
  202. if(CsJoinThreadCount == 1)
  203. SetEvent(CsJoinEvent);
  204. LeaveCriticalSection(&CsJoinLock);
  205. Status = WaitForSingleObject(CsJoinEvent, INFINITE);
  206. CL_ASSERT(Status == WAIT_OBJECT_0);
  207. EnterCriticalSection(&CsJoinLock);
  208. ClRtlLogPrint(LOG_NOISE,
  209. "[JOIN] Got out of the join wait, CsJoinThreadCount = %1!u!.\n",
  210. CsJoinThreadCount
  211. );
  212. if(--CsJoinThreadCount == 0) {
  213. CloseHandle(CsJoinEvent);
  214. DeleteCriticalSection(&CsJoinLock);
  215. }
  216. else
  217. LeaveCriticalSection(&CsJoinLock);
  218. //
  219. // All of the threads have failed or one of them made a connection,
  220. // use it to join.
  221. //
  222. if (CsJoinSponsorBinding != NULL) {
  223. CL_ASSERT(CsJoinSponsorName != NULL);
  224. ClRtlLogPrint(LOG_UNUSUAL,
  225. "[JOIN] Attempting join with sponsor %1!ws!.\n",
  226. CsJoinSponsorName
  227. );
  228. //
  229. // Chittur Subbaraman (chitturs) - 10/27/98
  230. //
  231. // If the database restore operation is requested, then
  232. // refuse to join the cluster and return an error code.
  233. //
  234. if ( CsDatabaseRestore == TRUE ) {
  235. Status = ERROR_CLUSTER_NODE_UP;
  236. LocalFree(CsJoinSponsorName);
  237. goto error_exit;
  238. }
  239. Status = JoinpAttemptJoin(CsJoinSponsorName, CsJoinSponsorBinding);
  240. RpcBindingFree(&CsJoinSponsorBinding);
  241. LocalFree(CsJoinSponsorName);
  242. }
  243. else {
  244. Status = ERROR_BAD_NETPATH;
  245. ClRtlLogPrint(LOG_CRITICAL,
  246. "[JOIN] Unable to connect to any sponsor node.\n"
  247. );
  248. //
  249. // rajdas: If the join did not suceed due to version mismatch we shouldn't try to form a cluster.
  250. // Bug ID: 152229
  251. //
  252. if(CsJoinStatus == ERROR_CLUSTER_INCOMPATIBLE_VERSIONS)
  253. bFormCluster = FALSE;
  254. }
  255. error_exit:
  256. if ( ClusterNameId ) {
  257. LocalFree( ClusterNameId );
  258. }
  259. if ( ClusterIpAddress ) {
  260. LocalFree( ClusterIpAddress );
  261. }
  262. if ( ClusIpAddrResource ) {
  263. LocalFree( ClusIpAddrResource );
  264. }
  265. if ( hClusNameResKey ) {
  266. DmCloseKey( hClusNameResKey );
  267. }
  268. if ( hClusIPAddrResKey ) {
  269. DmCloseKey( hClusIPAddrResKey );
  270. }
  271. return(Status);
  272. }
  273. VOID
  274. JoinpEnumNodesAndJoinByAddress(
  275. IN HDMKEY Key,
  276. IN PWSTR NetInterfaceId,
  277. IN PVOID Context
  278. )
  279. /*++
  280. Routine Description:
  281. Attempts to establish an RPC connection to a specified
  282. node using its IP address
  283. Arguments:
  284. Key - pointer to the node key handle
  285. NetInterfaceId - pointer to string representing net IF ID (guid)
  286. Context - pointer to a location to return the final status
  287. Return Value:
  288. None
  289. --*/
  290. {
  291. DWORD status;
  292. LPWSTR NetIFNodeID = NULL;
  293. LPWSTR NetIFIpAddress = NULL;
  294. DWORD idMaxSize = 0;
  295. DWORD idSize = 0;
  296. //
  297. // get the NodeId Value from the NetIF key and if it's us,
  298. // skip this netIF
  299. //
  300. status = DmQuerySz(Key,
  301. CLUSREG_NAME_NETIFACE_NODE,
  302. &NetIFNodeID,
  303. &idMaxSize,
  304. &idSize);
  305. if ( status == ERROR_SUCCESS ) {
  306. if (lstrcmpiW(NetIFNodeID, NmLocalNodeIdString) != 0) {
  307. //
  308. // it's not us so get the address and try it...
  309. //
  310. idMaxSize = idSize = 0;
  311. status = DmQuerySz(Key,
  312. CLUSREG_NAME_NETIFACE_ADDRESS,
  313. &NetIFIpAddress,
  314. &idMaxSize,
  315. &idSize);
  316. if ( status != ERROR_SUCCESS ) {
  317. ClRtlLogPrint(LOG_CRITICAL,
  318. "[JOIN] failed to get NetInterface Address, error %1!u!.\n",
  319. status);
  320. goto error_exit;
  321. }
  322. //
  323. // attempt the join with this address
  324. //
  325. JoinpConnectToSponsor(NetIFIpAddress);
  326. }
  327. }
  328. else {
  329. ClRtlLogPrint(LOG_CRITICAL,
  330. "[JOIN] failed to get NetInterface Node ID, error %1!u!.\n",
  331. status);
  332. }
  333. error_exit:
  334. DmCloseKey(Key);
  335. if ( NetIFNodeID ) {
  336. LocalFree( NetIFNodeID );
  337. }
  338. if ( NetIFIpAddress ) {
  339. LocalFree( NetIFIpAddress );
  340. }
  341. return;
  342. }
  343. VOID
  344. JoinpEnumNodesAndJoinByHostName(
  345. IN HDMKEY Key,
  346. IN PWSTR NodeId,
  347. IN PVOID Context
  348. )
  349. /*++
  350. Routine Description:
  351. Attempts to establish an RPC connection to a specified node using
  352. its host name
  353. Arguments:
  354. Key - pointer to the node key handle
  355. NodeId - pointer to string representing node ID (number)
  356. Context - pointer to a location to return the final status
  357. Return Value:
  358. None
  359. --*/
  360. {
  361. DWORD status;
  362. LPWSTR nodeName=NULL;
  363. DWORD nodeNameLen=0;
  364. DWORD nodeNameSize=0;
  365. //
  366. // Try to connect if this is not us
  367. //
  368. if (lstrcmpiW(NodeId, NmLocalNodeIdString) != 0) {
  369. status = DmQuerySz(Key,
  370. CLUSREG_NAME_NODE_NAME,
  371. &nodeName,
  372. &nodeNameLen,
  373. &nodeNameSize);
  374. if (status == ERROR_SUCCESS) {
  375. JoinpConnectToSponsor(nodeName);
  376. LocalFree(nodeName);
  377. }
  378. }
  379. DmCloseKey(Key);
  380. return;
  381. }
  382. VOID
  383. JoinpConnectToSponsor(
  384. IN PWSTR SponsorName
  385. )
  386. /*++
  387. Routine Description:
  388. Attempts to establish an RPC connection to a specified node.
  389. Arguments:
  390. SponsorName - The name (or IP address) of the target sponsor.
  391. Return Value:
  392. ERROR_SUCCESS if an RPC connection is successfully made to the sponsor.
  393. An RPC error code otherwise.
  394. --*/
  395. {
  396. HANDLE threadHandle;
  397. DWORD status = ERROR_SUCCESS;
  398. DWORD threadId;
  399. LPWSTR name;
  400. BOOL setEvent = FALSE;
  401. ClRtlLogPrint(LOG_UNUSUAL,
  402. "[JOIN] Spawning thread to connect to sponsor %1!ws!\n",
  403. SponsorName
  404. );
  405. name = LocalAlloc( LMEM_FIXED, (lstrlenW(SponsorName) + 1 ) * sizeof(WCHAR) );
  406. if (name != NULL) {
  407. lstrcpyW(name, SponsorName);
  408. CsJoinThreadCount++;
  409. threadHandle = CreateThread(
  410. NULL,
  411. 0,
  412. JoinpConnectThread,
  413. name,
  414. 0,
  415. &threadId
  416. );
  417. if (threadHandle != NULL) {
  418. CloseHandle(threadHandle);
  419. }
  420. else {
  421. status = GetLastError();
  422. ClRtlLogPrint(LOG_CRITICAL,
  423. "[JOIN] Failed to spawn connect thread, error %1!u!.\n",
  424. status
  425. );
  426. --CsJoinThreadCount;
  427. }
  428. }
  429. else {
  430. ClRtlLogPrint(LOG_CRITICAL,
  431. "[JOIN] Failed to allocate memory.\n"
  432. );
  433. }
  434. return;
  435. } // JoinpConnectToSponsor
  436. DWORD WINAPI
  437. VerifyJoinVersionData(
  438. LPWSTR sponsorName
  439. )
  440. /*++
  441. Routine Description:
  442. Verify that the sponsor and the joiner are compatible
  443. Arguments:
  444. sponsorName - pointer to text string of sponsor to use
  445. Return Value:
  446. ERROR_SUCCESS - if ok to continue join
  447. --*/
  448. {
  449. DWORD status;
  450. LPWSTR bindingString = NULL;
  451. RPC_BINDING_HANDLE bindingHandle = NULL;
  452. DWORD SponsorNodeId;
  453. DWORD ClusterHighestVersion;
  454. DWORD ClusterLowestVersion;
  455. DWORD JoinStatus;
  456. DWORD packageIndex;
  457. //
  458. // Attempt to connect to the sponsor's JoinVersion RPC interface.
  459. //
  460. status = RpcStringBindingComposeW(
  461. L"6e17aaa0-1a47-11d1-98bd-0000f875292e",
  462. L"ncadg_ip_udp",
  463. sponsorName,
  464. NULL,
  465. NULL,
  466. &bindingString);
  467. if (status != RPC_S_OK) {
  468. ClRtlLogPrint(LOG_UNUSUAL,
  469. "[JOIN] Unable to compose JoinVersion string binding for sponsor %1!ws!, status %2!u!.\n",
  470. sponsorName,
  471. status
  472. );
  473. goto error_exit;
  474. }
  475. status = RpcBindingFromStringBindingW(bindingString, &bindingHandle);
  476. RpcStringFreeW(&bindingString);
  477. if (status != RPC_S_OK) {
  478. ClRtlLogPrint(LOG_UNUSUAL,
  479. "[JOIN] Unable to build JoinVersion binding for sponsor %1!ws!, status %2!u!.\n",
  480. sponsorName,
  481. status
  482. );
  483. goto error_exit;
  484. }
  485. //
  486. // under load, the sponsor might take a while to respond back to the
  487. // joiner. The default timeout is at 30 secs and this seems to work
  488. // ok. Note that this means the sponsor has 30 secs to reply to either
  489. // the RPC request or ping. As long it makes any reply, then the joiner's
  490. // RPC will continue to wait and not time out the sponsor.
  491. //
  492. status = RpcMgmtSetComTimeout( bindingHandle, CLUSTER_JOINVERSION_RPC_COM_TIMEOUT );
  493. if (status != RPC_S_OK) {
  494. ClRtlLogPrint(LOG_UNUSUAL,
  495. "[JOIN] Unable to set JoinVersion com timeout for sponsor %1!ws!, status %2!u!.\n",
  496. sponsorName,
  497. status
  498. );
  499. }
  500. status = RpcEpResolveBinding(bindingHandle, JoinVersion_v2_0_c_ifspec);
  501. if (status != RPC_S_OK) {
  502. if ( (status == RPC_S_SERVER_UNAVAILABLE) ||
  503. (status == RPC_S_NOT_LISTENING) ||
  504. (status == EPT_S_NOT_REGISTERED)
  505. )
  506. {
  507. ClRtlLogPrint(LOG_NOISE,
  508. "[JOIN] Sponsor %1!ws! is not available (JoinVersion), status=%2!u!.\n",
  509. sponsorName,
  510. status
  511. );
  512. }
  513. else {
  514. ClRtlLogPrint(LOG_UNUSUAL,
  515. "[JOIN] Unable to resolve JoinVersion endpoint for sponsor %1!ws!, status %2!u!.\n",
  516. sponsorName,
  517. status
  518. );
  519. }
  520. goto error_exit;
  521. }
  522. if ( CsUseAuthenticatedRPC ) {
  523. //
  524. // run through the list of RPC security packages, trying to establish
  525. // a security context with this binding.
  526. //
  527. for (packageIndex = 0;
  528. packageIndex < CsNumberOfRPCSecurityPackages;
  529. ++packageIndex )
  530. {
  531. status = RpcBindingSetAuthInfoW(bindingHandle,
  532. CsServiceDomainAccount,
  533. RPC_C_AUTHN_LEVEL_CONNECT,
  534. CsRPCSecurityPackage[ packageIndex ],
  535. NULL,
  536. RPC_C_AUTHZ_NAME);
  537. if (status != RPC_S_OK) {
  538. ClRtlLogPrint(LOG_UNUSUAL,
  539. "[JOIN] Unable to set JoinVersion AuthInfo using %1!ws! package, status %2!u!.\n",
  540. CsRPCSecurityPackageName[packageIndex],
  541. status);
  542. continue;
  543. }
  544. status = CsRpcGetJoinVersionData(bindingHandle,
  545. NmLocalNodeId,
  546. CsMyHighestVersion,
  547. CsMyLowestVersion,
  548. &SponsorNodeId,
  549. &ClusterHighestVersion,
  550. &ClusterLowestVersion,
  551. &JoinStatus);
  552. if ( status == RPC_S_OK ) {
  553. break;
  554. } else {
  555. ClRtlLogPrint(LOG_UNUSUAL,
  556. "[JOIN] Unable to get join version data from sponsor %1!ws! using %2!ws! package, status %3!u!.\n",
  557. sponsorName,
  558. CsRPCSecurityPackageName[packageIndex],
  559. status);
  560. }
  561. }
  562. } else {
  563. //
  564. // get the version data from the sponsor and determine if we
  565. // should continue to join
  566. //
  567. status = CsRpcGetJoinVersionData(bindingHandle,
  568. NmLocalNodeId,
  569. CsMyHighestVersion,
  570. CsMyLowestVersion,
  571. &SponsorNodeId,
  572. &ClusterHighestVersion,
  573. &ClusterLowestVersion,
  574. &JoinStatus);
  575. if ( status != RPC_S_OK ) {
  576. ClRtlLogPrint(LOG_UNUSUAL,
  577. "[JOIN] Unable to get join version data from sponsor %1!ws!, status %2!u!.\n",
  578. sponsorName,
  579. status);
  580. }
  581. }
  582. //
  583. // jump out now if nothing work (as in the case of a form)
  584. //
  585. if ( status != ERROR_SUCCESS ) {
  586. goto error_exit;
  587. }
  588. //
  589. // use the join lock to set the RPC package index
  590. //
  591. EnterCriticalSection( &CsJoinLock );
  592. if ( CsRPCSecurityPackageIndex < 0 ) {
  593. CsRPCSecurityPackageIndex = packageIndex;
  594. }
  595. LeaveCriticalSection( &CsJoinLock );
  596. //
  597. // check the sponsor was in agreement with the join
  598. //
  599. if ( JoinStatus != ERROR_SUCCESS ) {
  600. ClRtlLogPrint(LOG_UNUSUAL,
  601. "[JOIN] Sponsor %1!ws! has discontinued join, status %2!u!.\n",
  602. sponsorName,
  603. JoinStatus);
  604. if (JoinStatus == ERROR_CLUSTER_INCOMPATIBLE_VERSIONS)
  605. {
  606. ClRtlLogPrint(LOG_CRITICAL,
  607. "[JOIN] Join version data from sponsor %1!ws! doesn't match: JH: 0x%2!08X! JL: 0x%3!08X! SH: 0x%4!08X! SL: 0x%5!08X!.\n",
  608. sponsorName,
  609. CsMyHighestVersion,
  610. CsMyLowestVersion,
  611. ClusterHighestVersion,
  612. ClusterLowestVersion);
  613. //
  614. // rajdas: In this case I have managed to contact a sponsor, but there is a version mismatch. If all the join
  615. // threads meet the same fate, clussvc should not try to form a cluster.
  616. // BUG ID: 152229
  617. //
  618. CsJoinStatus = ERROR_CLUSTER_INCOMPATIBLE_VERSIONS;
  619. }
  620. goto error_exit;
  621. }
  622. // SS: we will leave this check because win2K clusters didnt do the
  623. // server side check, so the client must continue to do it
  624. //
  625. // now check that it is ok to join. We want this node to run
  626. // at the highest level of compatibility possible. One of the
  627. // following conditions must be true:
  628. //
  629. // 1) the High versions match exactly (major and build number)
  630. // 2) our Highest matches the sponsor's Lowest exactly, downgrading
  631. // the sponsor to our level of compatibility
  632. // 3) our Lowest matches the sponsor's Highest, downgrading ourselves
  633. // to the sponsor's level of compatibility
  634. //
  635. // note that the minor (build) version must match as well. The previous
  636. // version numbers are "well known" and shouldn't change when a newer
  637. // version is available/implemented.
  638. //
  639. if ( CsMyHighestVersion == ClusterHighestVersion ||
  640. CsMyHighestVersion == ClusterLowestVersion ||
  641. CsMyLowestVersion == ClusterHighestVersion
  642. #if 1 // CLUSTER_BETA
  643. || CsNoVersionCheck
  644. #endif
  645. )
  646. {
  647. status = ERROR_SUCCESS;
  648. } else {
  649. ClRtlLogPrint(LOG_CRITICAL,
  650. "[JOIN] Join version data from sponsor %1!ws! doesn't match: JH: 0x%2!08X! JL: 0x%3!08X! SH: 0x%4!08X! SL: 0x%5!08X!.\n",
  651. sponsorName,
  652. CsMyHighestVersion,
  653. CsMyLowestVersion,
  654. ClusterHighestVersion,
  655. ClusterLowestVersion);
  656. status = ERROR_CLUSTER_INCOMPATIBLE_VERSIONS;
  657. //
  658. // rajdas: In this case I have managed to contact a sponsor, but there is a version mismatch. If all the join
  659. // threads meet the same fate, clussvc should not try to form a cluster.
  660. // BUG ID: 152229
  661. //
  662. CsJoinStatus = ERROR_CLUSTER_INCOMPATIBLE_VERSIONS;
  663. }
  664. error_exit:
  665. if (bindingHandle != NULL) {
  666. RpcBindingFree(&bindingHandle);
  667. }
  668. return status;
  669. }
  670. DWORD WINAPI
  671. JoinpConnectThread(
  672. LPVOID Parameter
  673. )
  674. {
  675. LPWSTR sponsorName = Parameter;
  676. DWORD status;
  677. LPWSTR bindingString = NULL;
  678. RPC_BINDING_HANDLE bindingHandle = NULL;
  679. BOOL setEvent = FALSE;
  680. //
  681. // Try to connect to the specified node.
  682. //
  683. ClRtlLogPrint(LOG_UNUSUAL,
  684. "[JOIN] Asking %1!ws! to sponsor us.\n",
  685. sponsorName
  686. );
  687. //
  688. // connect to the JoinVersion interface first to see if we should progress
  689. // any further. since this is the first RPC call to the other node, we can
  690. // determine which security package should be used for the other interfaces.
  691. //
  692. status = VerifyJoinVersionData( sponsorName );
  693. if (status != ERROR_SUCCESS) {
  694. ClRtlLogPrint(LOG_UNUSUAL,
  695. "[JOIN] JoinVersion data for sponsor %1!ws! is invalid, status %2!u!.\n",
  696. sponsorName,
  697. status
  698. );
  699. goto error_exit;
  700. }
  701. //
  702. // Attempt to connect to the sponsor's extrocluster (join) RPC interface.
  703. //
  704. status = RpcStringBindingComposeW(
  705. L"ffe561b8-bf15-11cf-8c5e-08002bb49649",
  706. L"ncadg_ip_udp",
  707. sponsorName,
  708. NULL,
  709. NULL,
  710. &bindingString);
  711. if (status != RPC_S_OK) {
  712. ClRtlLogPrint(LOG_UNUSUAL,
  713. "[JOIN] Unable to compose ExtroCluster string binding for sponsor %1!ws!, status %2!u!.\n",
  714. sponsorName,
  715. status
  716. );
  717. goto error_exit;
  718. }
  719. status = RpcBindingFromStringBindingW(bindingString, &bindingHandle);
  720. RpcStringFreeW(&bindingString);
  721. if (status != RPC_S_OK) {
  722. ClRtlLogPrint(LOG_UNUSUAL,
  723. "[JOIN] Unable to build ExtroCluster binding for sponsor %1!ws!, status %2!u!.\n",
  724. sponsorName,
  725. status
  726. );
  727. goto error_exit;
  728. }
  729. //
  730. // under load, the sponsor might take a while to respond back to the
  731. // joiner. The default timeout is at 30 secs and this seems to work
  732. // ok. Note that this means the sponsor has 30 secs to reply to either
  733. // the RPC request or ping. As long it makes any reply, then the joiner's
  734. // RPC will continue to wait and not time out the sponsor.
  735. //
  736. status = RpcMgmtSetComTimeout( bindingHandle, CLUSTER_EXTROCLUSTER_RPC_COM_TIMEOUT );
  737. if (status != RPC_S_OK) {
  738. ClRtlLogPrint(LOG_UNUSUAL,
  739. "[JOIN] Unable to set ExtroCluster com timeout for sponsor %1!ws!, status %2!u!.\n",
  740. sponsorName,
  741. status
  742. );
  743. }
  744. status = RpcEpResolveBinding(bindingHandle, ExtroCluster_v2_0_c_ifspec);
  745. if (status != RPC_S_OK) {
  746. if ( (status == RPC_S_SERVER_UNAVAILABLE) ||
  747. (status == RPC_S_NOT_LISTENING) ||
  748. (status == EPT_S_NOT_REGISTERED)
  749. )
  750. {
  751. ClRtlLogPrint(LOG_NOISE,
  752. "[JOIN] Sponsor %1!ws! is not available (ExtroCluster), status=%2!u!.\n",
  753. sponsorName,
  754. status
  755. );
  756. }
  757. else {
  758. ClRtlLogPrint(LOG_UNUSUAL,
  759. "[JOIN] Unable to resolve ExtroCluster endpoint for sponsor %1!ws!, status %2!u!.\n",
  760. sponsorName,
  761. status
  762. );
  763. }
  764. goto error_exit;
  765. }
  766. if ( CsUseAuthenticatedRPC ) {
  767. //
  768. // establish a security context with this binding.
  769. //
  770. status = RpcBindingSetAuthInfoW(bindingHandle,
  771. CsServiceDomainAccount,
  772. RPC_C_AUTHN_LEVEL_CONNECT,
  773. CsRPCSecurityPackage[ CsRPCSecurityPackageIndex ],
  774. NULL,
  775. RPC_C_AUTHZ_NAME);
  776. if (status != RPC_S_OK) {
  777. ClRtlLogPrint(LOG_UNUSUAL,
  778. "[JOIN] Unable to set ExtroCluster AuthInfo using %1!ws! package, status %2!u!.\n",
  779. CsRPCSecurityPackageName[ CsRPCSecurityPackageIndex ],
  780. status);
  781. goto error_exit;
  782. }
  783. }
  784. error_exit:
  785. EnterCriticalSection(&CsJoinLock);
  786. if (status == RPC_S_OK) {
  787. if (CsJoinSponsorBinding == NULL) {
  788. //
  789. // This is the first successful connection.
  790. //
  791. ClRtlLogPrint(LOG_UNUSUAL,
  792. "[JOIN] Selecting %1!ws! as join sponsor.\n",
  793. sponsorName
  794. );
  795. CsJoinSponsorBinding = bindingHandle;
  796. bindingHandle = NULL;
  797. CsJoinSponsorName = sponsorName;
  798. sponsorName = NULL;
  799. SetEvent(CsJoinEvent);
  800. }
  801. else {
  802. ClRtlLogPrint(LOG_NOISE,
  803. "[JOIN] Closing connection to sponsor %1!ws!.\n",
  804. sponsorName
  805. );
  806. }
  807. }
  808. if (--CsJoinThreadCount == 0) {
  809. CloseHandle(CsJoinEvent);
  810. DeleteCriticalSection(&CsJoinLock);
  811. }
  812. else if (CsJoinThreadCount == 1) {
  813. SetEvent(CsJoinEvent);
  814. LeaveCriticalSection(&CsJoinLock);
  815. }
  816. else
  817. LeaveCriticalSection(&CsJoinLock);
  818. if (bindingHandle != NULL) {
  819. RpcBindingFree(&bindingHandle);
  820. }
  821. if (sponsorName != NULL) {
  822. LocalFree(sponsorName);
  823. }
  824. return(status);
  825. } // JoinpConnectThread
  826. DWORD
  827. JoinpAttemptJoin(
  828. LPWSTR SponsorName,
  829. RPC_BINDING_HANDLE JoinMasterBinding
  830. )
  831. /*++
  832. Routine Description:
  833. Called to attempt to join a cluster that already exists.
  834. Arguments:
  835. SponsorName - The name (or IP address) of the target sponsor.
  836. JoinMasterBinding - RPC binding to use to perform join.
  837. Return Value:
  838. ERROR_SUCCESS if successful
  839. Win32 error code otherwise.
  840. --*/
  841. {
  842. DWORD Status;
  843. NET_API_STATUS netStatus;
  844. LPTIME_OF_DAY_INFO tod = NULL;
  845. SYSTEMTIME systemTime;
  846. PNM_NETWORK network;
  847. DWORD startseq, endseq;
  848. #ifdef CLUSTER_TESTPOINT
  849. TESTPT(TpFailNmJoinCluster) {
  850. Status = 999999;
  851. goto error_exit;
  852. }
  853. #endif
  854. Status = NmJoinCluster(JoinMasterBinding);
  855. if (Status != ERROR_SUCCESS) {
  856. ClRtlLogPrint(LOG_UNUSUAL,
  857. "[JOIN] NmJoinCluster failed, status %1!u!.\n",
  858. Status
  859. );
  860. goto error_exit;
  861. }
  862. //
  863. // Synchronize the registry database
  864. //
  865. #ifdef CLUSTER_TESTPOINT
  866. TESTPT(TpFailDmJoin) {
  867. Status = 999999;
  868. goto error_exit;
  869. }
  870. #endif
  871. Status = DmJoin(JoinMasterBinding, &startseq);
  872. if (Status != ERROR_SUCCESS) {
  873. ClRtlLogPrint(LOG_CRITICAL,
  874. "[JOIN] DmJoin failed, error %1!d!\n",
  875. Status);
  876. goto error_exit;
  877. }
  878. //
  879. // Initialize the event handler, needs to register with gum for cluster wide
  880. //events.
  881. Status = EpInitPhase1();
  882. if ( Status != ERROR_SUCCESS) {
  883. ClRtlLogPrint(LOG_CRITICAL,
  884. "[JOIN] EpInitPhase1 failed, Status = %1!u!\n",
  885. Status);
  886. return(Status);
  887. }
  888. #ifdef CLUSTER_TESTPOINT
  889. TESTPT(TpFailApiInitPhase1) {
  890. Status = 999999;
  891. goto error_exit;
  892. }
  893. #endif
  894. //
  895. // Bring the API online in read-only mode. There is no join phase for
  896. // the API. The API is required by FmOnline, which starts the
  897. // resource monitor.
  898. //
  899. Status = ApiOnlineReadOnly();
  900. if ( Status != ERROR_SUCCESS) {
  901. ClRtlLogPrint(LOG_CRITICAL,
  902. "[JOIN] ApiOnlineReadOnly failed, error = %1!u!\n",
  903. Status);
  904. goto error_exit;
  905. }
  906. #ifdef CLUSTER_TESTPOINT
  907. TESTPT(TpFailFmJoinPhase1) {
  908. Status = 999999;
  909. goto error_exit;
  910. }
  911. #endif
  912. //update status for scm
  913. CsServiceStatus.dwCheckPoint++;
  914. CsAnnounceServiceStatus();
  915. //
  916. // Resynchronize the FM. We cannot enable the Groups until after the
  917. // the API is fully operational. See below.
  918. //
  919. Status = FmJoinPhase1();
  920. if (Status != ERROR_SUCCESS) {
  921. ClRtlLogPrint(LOG_CRITICAL,
  922. "[JOIN] FmJoinPhase1 failed, error %1!d!\n",
  923. Status);
  924. goto error_exit;
  925. }
  926. #ifdef CLUSTER_TESTPOINT
  927. TESTPT(TpFailDmUpdateJoinCluster) {
  928. Status = 999999;
  929. goto error_exit;
  930. }
  931. #endif
  932. // Call the DM to hook the notifications for quorum resource and
  933. //event handler
  934. Status = DmUpdateJoinCluster();
  935. if (Status != ERROR_SUCCESS)
  936. {
  937. ClRtlLogPrint(LOG_CRITICAL,
  938. "[JOIN] DmUpdateJoin failed, error = %1!u!\n",
  939. Status);
  940. goto error_exit;
  941. }
  942. #ifdef CLUSTER_TESTPOINT
  943. TESTPT(TpFailNmJoinComplete) {
  944. Status = 999999;
  945. goto error_exit;
  946. }
  947. #endif
  948. //
  949. // We are now fully online, call NM to globally change our state.
  950. //
  951. Status = NmJoinComplete(&endseq);
  952. if (Status != ERROR_SUCCESS) {
  953. ClRtlLogPrint(LOG_CRITICAL,
  954. "[JOIN] NmJoinComplete failed, error %1!d!\n",
  955. Status);
  956. goto error_exit;
  957. }
  958. #if 0
  959. //
  960. // This check is flawed. Network state updates can occur during
  961. // the join process, causing this check to fail unnecessarily.
  962. //
  963. if (startseq + GUM_UPDATE_JOINSEQUENCE != endseq) {
  964. ClRtlLogPrint(LOG_CRITICAL,
  965. "[JOIN] Sequence mismatch, start %1!d! end %2!d!\n",
  966. startseq, endseq);
  967. Status = ERROR_CLUSTER_DATABASE_SEQMISMATCH;
  968. goto error_exit;
  969. }
  970. #endif // 0
  971. //perform the fixup for the AdminExt value on both Nt4 and Nt5 nodes.
  972. Status=FmFixupAdminExt();
  973. if (Status != ERROR_SUCCESS) {
  974. ClRtlLogPrint(LOG_CRITICAL,
  975. "[JOIN] FmFixupAdminExt failed, error %1!d!\n",
  976. Status);
  977. goto error_exit;
  978. }
  979. //perform the fixups after the registry is downloaded
  980. //walk the list of fixups
  981. Status = NmPerformFixups(NM_JOIN_FIXUP);
  982. if (Status != ERROR_SUCCESS) {
  983. ClRtlLogPrint(LOG_CRITICAL,
  984. "[JOIN] NmPerformFixups failed, error %1!d!\n",
  985. Status);
  986. goto error_exit;
  987. }
  988. #ifdef CLUSTER_TESTPOINT
  989. TESTPT(TpFailApiInitPhase2) {
  990. Status = 999999;
  991. goto error_exit;
  992. }
  993. #endif
  994. //
  995. // Finally enable the full API.
  996. //
  997. Status = ApiOnline();
  998. if ( Status != ERROR_SUCCESS) {
  999. ClRtlLogPrint(LOG_CRITICAL,
  1000. "[JOIN] ApiOnline failed, error = %1!u!\n",
  1001. Status);
  1002. goto error_exit;
  1003. }
  1004. #ifdef CLUSTER_TESTPOINT
  1005. TESTPT(TpFailFmJoinPhase2) {
  1006. Status = 999999;
  1007. goto error_exit;
  1008. }
  1009. #endif
  1010. //update status for scm
  1011. CsServiceStatus.dwCheckPoint++;
  1012. CsAnnounceServiceStatus();
  1013. //
  1014. // Call back the Failover Manager to enable and move groups.
  1015. // The full registry is now available, so all groups/resources/resource
  1016. // types can be created (since they use the registry calls).
  1017. //
  1018. Status = FmJoinPhase2();
  1019. if (Status != ERROR_SUCCESS) {
  1020. ClRtlLogPrint(LOG_CRITICAL,
  1021. "[JOIN] FmJoinPhase2 failed, status %1!d!.\n",
  1022. Status);
  1023. goto error_exit;
  1024. }
  1025. #ifdef CLUSTER_TESTPOINT
  1026. TESTPT(TpFailEvInitialize) {
  1027. Status = 999999;
  1028. goto error_exit;
  1029. }
  1030. #endif
  1031. //
  1032. // Finish initializing the cluster wide event logging
  1033. //
  1034. // ASSUMPTION: this is called after the NM has established cluster
  1035. // membership.
  1036. //
  1037. if (!CsNoRepEvtLogging)
  1038. {
  1039. Status = EvOnline();
  1040. //if this fails, we still start the cluster service
  1041. if ( Status != ERROR_SUCCESS ) {
  1042. ClRtlLogPrint(LOG_CRITICAL,
  1043. "[JOIN] Error calling EvOnline, Status = %1!u!\n",
  1044. Status);
  1045. }
  1046. }
  1047. return(ERROR_SUCCESS);
  1048. error_exit:
  1049. ClRtlLogPrint(LOG_NOISE, "[INIT] Cleaning up failed join attempt.\n");
  1050. ClusterLeave();
  1051. return(Status);
  1052. }
  1053. BOOL
  1054. JoinpAddNodeCallback(
  1055. IN PVOID Context1,
  1056. IN PVOID Context2,
  1057. IN PVOID Object,
  1058. IN LPCWSTR Name
  1059. )
  1060. /*++
  1061. Routine Description:
  1062. Callback enumeration routine for adding a new node. This callback
  1063. figures out what node IDs are available.
  1064. Arguments:
  1065. Context1 - Supplies a pointer to an array of BOOLs. The node ID for
  1066. the enumerated node is set to FALSE.
  1067. Context2 - Not used.
  1068. Object - A pointer to the node object.
  1069. Name - The node name.
  1070. Return Value:
  1071. TRUE
  1072. --*/
  1073. {
  1074. PBOOL Avail;
  1075. DWORD Id;
  1076. Id = NmGetNodeId(Object);
  1077. CL_ASSERT(NmIsValidNodeId(Id));
  1078. Avail = (PBOOL)Context1;
  1079. Avail[Id] = FALSE;
  1080. return(TRUE);
  1081. }