Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1447 lines
39 KiB

  1. /*++
  2. Copyright (c) 1996 Microsoft Corporation
  3. Module Name:
  4. join.c
  5. Abstract:
  6. This module handles the initialization path where a newly booted
  7. node joins an existing cluster.
  8. Author:
  9. John Vert (jvert) 6/6/1996
  10. Revision History:
  11. --*/
  12. #include "initp.h"
  13. #include "lmcons.h"
  14. #include "lmremutl.h"
  15. #include "lmapibuf.h"
  16. #include <clusverp.h>
  17. #define JOIN_CLIENT_NO_DELAY 0 // delay for high-prio networks
  18. #define JOIN_CLIENT_NETWORK_DELAY 1000 // delay for low-prio networks
  19. #define JOIN_CLIENT_RESOURCE_DELAY 2000 // delay for cluster IP/netname
  20. #define JOIN_CLIENT_GET_NETWORK_DELAY(_NetworkPrio) \
  21. (((_NetworkPrio) == 1) ? JOIN_CLIENT_NO_DELAY : JOIN_CLIENT_NETWORK_DELAY)
  22. //
  23. // Local types
  24. //
  25. typedef struct {
  26. DWORD Delay;
  27. LPWSTR Name;
  28. } JOIN_SPONSOR_CONTEXT, *PJOIN_SPONSOR_CONTEXT;
  29. //
  30. // Local data
  31. //
  32. CRITICAL_SECTION CsJoinLock;
  33. HANDLE CsJoinEvent = NULL;
  34. DWORD CsJoinThreadCount = 0;
  35. DWORD CsJoinStatus=ERROR_SUCCESS;
  36. RPC_BINDING_HANDLE CsJoinSponsorBinding = NULL;
  37. LPWSTR CsJoinSponsorName = NULL;
  38. // While another node is joining, we will keep track of any DM or FM updates.
  39. BOOL CsDmOrFmHasChanged = FALSE;
  40. //
  41. // Local function prototypes
  42. //
  43. VOID
  44. JoinpEnumNodesAndJoinByAddress(
  45. IN HDMKEY Key,
  46. IN PWSTR NodeId,
  47. IN PVOID Context
  48. );
  49. VOID
  50. JoinpEnumNodesAndJoinByHostName(
  51. IN HDMKEY Key,
  52. IN PWSTR NodeId,
  53. IN PVOID Context
  54. );
  55. VOID
  56. JoinpConnectToSponsor(
  57. IN PWSTR SponsorName,
  58. IN DWORD Delay
  59. );
  60. DWORD WINAPI
  61. JoinpConnectThread(
  62. LPVOID Parameter
  63. );
  64. DWORD
  65. JoinpAttemptJoin(
  66. LPWSTR SponsorName,
  67. RPC_BINDING_HANDLE JoinMasterBinding
  68. );
  69. BOOL
  70. JoinpAddNodeCallback(
  71. IN PVOID Context1,
  72. IN PVOID Context2,
  73. IN PVOID Object,
  74. IN LPCWSTR Name
  75. );
  76. BOOL
  77. JoinpEnumNetworksToSetPriority(
  78. IN PVOID Context1,
  79. IN PVOID Context2,
  80. IN PVOID Object,
  81. IN LPCWSTR Name
  82. );
  83. DWORD
  84. ClusterJoin(
  85. VOID
  86. )
  87. /*++
  88. Routine Description:
  89. Called to attempt to join a cluster that already exists.
  90. Arguments:
  91. None
  92. Return Value:
  93. ERROR_SUCCESS if successful
  94. Win32 error code otherwise.
  95. --*/
  96. {
  97. DWORD Status;
  98. LPWSTR ClusterIpAddress = NULL;
  99. LPWSTR ClusIpAddrResource = NULL;
  100. LPWSTR ClusterNameId = NULL;
  101. DWORD idMaxSize = 0;
  102. DWORD idSize = 0;
  103. HDMKEY hClusNameResKey = NULL;
  104. HDMKEY hClusIPAddrResKey = NULL;
  105. //
  106. // Try connecting using the cluster IP address first. get the cluster
  107. // name resource, looking up its dependency for the cluster IP addr
  108. //
  109. Status = DmQuerySz(DmClusterParametersKey,
  110. CLUSREG_NAME_CLUS_CLUSTER_NAME_RES,
  111. &ClusterNameId,
  112. &idMaxSize,
  113. &idSize);
  114. if (Status != ERROR_SUCCESS) {
  115. ClRtlLogPrint(LOG_CRITICAL,
  116. "[JOIN] failed to get cluster name resource, error %1!u!.\n",
  117. Status);
  118. goto error_exit;
  119. }
  120. //
  121. // open name resource key and read its DependsOn key
  122. //
  123. hClusNameResKey = DmOpenKey( DmResourcesKey, ClusterNameId, KEY_READ );
  124. if ( hClusNameResKey == NULL ) {
  125. Status = GetLastError();
  126. ClRtlLogPrint(LOG_CRITICAL,
  127. "[JOIN] failed to open Cluster Name resource key, error %1!u!.\n",
  128. Status);
  129. goto error_exit;
  130. }
  131. //
  132. // allocate enough space for the GUID and the Parameters string
  133. //
  134. idMaxSize = ( CS_NETWORK_ID_LENGTH + sizeof( CLUSREG_KEYNAME_PARAMETERS ) + 2)
  135. * sizeof(WCHAR);
  136. ClusIpAddrResource = LocalAlloc( LMEM_FIXED, idMaxSize );
  137. if ( ClusIpAddrResource == NULL ) {
  138. Status = ERROR_NOT_ENOUGH_MEMORY;
  139. ClRtlLogPrint(LOG_CRITICAL,
  140. "[JOIN] no memory for Cluster Ip address resource ID!\n");
  141. goto error_exit;
  142. }
  143. Status = DmQueryMultiSz(hClusNameResKey,
  144. CLUSREG_NAME_RES_DEPENDS_ON,
  145. &ClusIpAddrResource,
  146. &idMaxSize,
  147. &idSize);
  148. if ( Status != ERROR_SUCCESS ) {
  149. ClRtlLogPrint(LOG_CRITICAL,
  150. "[JOIN] failed to get Cluster Ip address resource ID, error %1!u!.\n",
  151. Status);
  152. goto error_exit;
  153. }
  154. lstrcatW( ClusIpAddrResource, L"\\" );
  155. lstrcatW( ClusIpAddrResource, CLUSREG_KEYNAME_PARAMETERS );
  156. hClusIPAddrResKey = DmOpenKey( DmResourcesKey, ClusIpAddrResource, KEY_READ );
  157. if ( hClusIPAddrResKey == NULL ) {
  158. Status = GetLastError();
  159. ClRtlLogPrint(LOG_CRITICAL,
  160. "[JOIN] failed to open Cluster IP Address resource key, error %1!u!.\n",
  161. Status);
  162. goto error_exit;
  163. }
  164. //
  165. // get the IP Address; note that these value names are not defined
  166. // in a global way. if they are changed, this code will break
  167. //
  168. idMaxSize = idSize = 0;
  169. Status = DmQuerySz(hClusIPAddrResKey,
  170. L"Address",
  171. &ClusterIpAddress,
  172. &idMaxSize,
  173. &idSize);
  174. if ( Status != ERROR_SUCCESS ) {
  175. ClRtlLogPrint(LOG_CRITICAL,
  176. "[JOIN] failed to get Cluster Ip address, error %1!u!.\n",
  177. Status);
  178. goto error_exit;
  179. }
  180. //
  181. // Spawn threads to find a sponsor. We will try the make connections using
  182. // the cluster IP address, the IP address of each node on each network, and
  183. // the name of each node in the cluster. The connects will proceed in
  184. // parallel. We'll use the first one that succeeds.
  185. //
  186. CsJoinEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
  187. if (CsJoinEvent == NULL) {
  188. Status = GetLastError();
  189. ClRtlLogPrint(LOG_CRITICAL,
  190. "[JOIN] failed to create join event, error %1!u!.\n",
  191. Status
  192. );
  193. goto error_exit;
  194. }
  195. CsJoinThreadCount = 1;
  196. InitializeCriticalSection(&CsJoinLock);
  197. EnterCriticalSection(&CsJoinLock);
  198. DmEnumKeys(DmNetInterfacesKey, JoinpEnumNodesAndJoinByAddress, NULL);
  199. DmEnumKeys(DmNodesKey, JoinpEnumNodesAndJoinByHostName, NULL);
  200. //
  201. // give the other threads a chance to start since using the cluster IP
  202. // address to join with is problematic when the resource moves in the
  203. // middle of a join
  204. //
  205. JoinpConnectToSponsor(ClusterIpAddress, JOIN_CLIENT_RESOURCE_DELAY);
  206. //update status for scm
  207. CsServiceStatus.dwCheckPoint++;
  208. CsAnnounceServiceStatus();
  209. if(CsJoinThreadCount == 1)
  210. SetEvent(CsJoinEvent);
  211. LeaveCriticalSection(&CsJoinLock);
  212. Status = WaitForSingleObject(CsJoinEvent, INFINITE);
  213. CL_ASSERT(Status == WAIT_OBJECT_0);
  214. EnterCriticalSection(&CsJoinLock);
  215. ClRtlLogPrint(LOG_NOISE,
  216. "[JOIN] Got out of the join wait, CsJoinThreadCount = %1!u!.\n",
  217. CsJoinThreadCount
  218. );
  219. if(--CsJoinThreadCount == 0) {
  220. CloseHandle(CsJoinEvent);
  221. DeleteCriticalSection(&CsJoinLock);
  222. }
  223. else
  224. LeaveCriticalSection(&CsJoinLock);
  225. //
  226. // All of the threads have failed or one of them made a connection,
  227. // use it to join.
  228. //
  229. if (CsJoinSponsorBinding != NULL) {
  230. CL_ASSERT(CsJoinSponsorName != NULL);
  231. ClRtlLogPrint(LOG_UNUSUAL,
  232. "[JOIN] Attempting join with sponsor %1!ws!.\n",
  233. CsJoinSponsorName
  234. );
  235. //
  236. // Chittur Subbaraman (chitturs) - 10/27/98
  237. //
  238. // If the database restore operation is requested, then
  239. // refuse to join the cluster and return an error code.
  240. //
  241. if ( CsDatabaseRestore == TRUE ) {
  242. Status = ERROR_CLUSTER_NODE_UP;
  243. LocalFree(CsJoinSponsorName);
  244. goto error_exit;
  245. }
  246. Status = JoinpAttemptJoin(CsJoinSponsorName, CsJoinSponsorBinding);
  247. RpcBindingFree(&CsJoinSponsorBinding);
  248. LocalFree(CsJoinSponsorName);
  249. }
  250. else {
  251. //we couldnt create a binding to the sponsorer
  252. if(CsJoinStatus == ERROR_SUCCESS) {
  253. //we did the version check in joinpconnectthread but for some reason
  254. //couldnt produce a binding
  255. Status = ERROR_BAD_NETPATH;
  256. ClRtlLogPrint(LOG_CRITICAL,
  257. "[JOIN] Unable to connect to any sponsor node.\n");
  258. }
  259. else {
  260. Status = CsJoinStatus;
  261. }
  262. // rajdas: If the join did not suceed due to version mismatch we shouldn't try to form a cluster.
  263. // Bug ID: 152229
  264. //
  265. if(CsJoinStatus == ERROR_CLUSTER_INCOMPATIBLE_VERSIONS)
  266. bFormCluster = FALSE;
  267. }
  268. error_exit:
  269. if ( ClusterNameId ) {
  270. LocalFree( ClusterNameId );
  271. }
  272. if ( ClusterIpAddress ) {
  273. LocalFree( ClusterIpAddress );
  274. }
  275. if ( ClusIpAddrResource ) {
  276. LocalFree( ClusIpAddrResource );
  277. }
  278. if ( hClusNameResKey ) {
  279. DmCloseKey( hClusNameResKey );
  280. }
  281. if ( hClusIPAddrResKey ) {
  282. DmCloseKey( hClusIPAddrResKey );
  283. }
  284. return(Status);
  285. }
  286. VOID
  287. JoinpEnumNodesAndJoinByAddress(
  288. IN HDMKEY Key,
  289. IN PWSTR NetInterfaceId,
  290. IN PVOID Context
  291. )
  292. /*++
  293. Routine Description:
  294. Attempts to establish an RPC connection to a specified
  295. node using its IP address
  296. Arguments:
  297. Key - pointer to the node key handle
  298. NetInterfaceId - pointer to string representing net IF ID (guid)
  299. Context - pointer to a location to return the final status
  300. Return Value:
  301. None
  302. --*/
  303. {
  304. DWORD status;
  305. LPWSTR NetIFNodeID = NULL;
  306. LPWSTR NetIFIpAddress = NULL;
  307. LPWSTR NetIFNetwork = NULL;
  308. HDMKEY NetIFNetworkKey = NULL;
  309. DWORD NetIFNetworkPriority;
  310. DWORD idMaxSize = 0;
  311. DWORD idSize = 0;
  312. //
  313. // get the NodeId Value from the NetIF key and if it's us,
  314. // skip this netIF
  315. //
  316. status = DmQuerySz(Key,
  317. CLUSREG_NAME_NETIFACE_NODE,
  318. &NetIFNodeID,
  319. &idMaxSize,
  320. &idSize);
  321. if ( status == ERROR_SUCCESS ) {
  322. if (lstrcmpiW(NetIFNodeID, NmLocalNodeIdString) != 0) {
  323. //
  324. // it's not us so get the address and try it...
  325. //
  326. idMaxSize = idSize = 0;
  327. status = DmQuerySz(Key,
  328. CLUSREG_NAME_NETIFACE_ADDRESS,
  329. &NetIFIpAddress,
  330. &idMaxSize,
  331. &idSize);
  332. if ( status != ERROR_SUCCESS ) {
  333. ClRtlLogPrint(LOG_CRITICAL,
  334. "[JOIN] failed to get NetInterface Address, error %1!u!.\n",
  335. status);
  336. goto error_exit;
  337. }
  338. //
  339. // Determine the delay based on the network priority. If we
  340. // cannot find it in the cluster database, we still try to
  341. // connect to the sponsor assuming the lowest priority.
  342. //
  343. NetIFNetworkPriority = 0xFFFFFFFF;
  344. idMaxSize = idSize = 0;
  345. status = DmQuerySz(Key,
  346. CLUSREG_NAME_NETIFACE_NETWORK,
  347. &NetIFNetwork,
  348. &idMaxSize,
  349. &idSize);
  350. if ( status != ERROR_SUCCESS ) {
  351. ClRtlLogPrint(LOG_CRITICAL,
  352. "[JOIN] Failed to get NetInterface Network, error %1!u!.\n",
  353. status);
  354. goto ConnectToSponsor;
  355. }
  356. NetIFNetworkKey = DmOpenKey(DmNetworksKey,
  357. NetIFNetwork,
  358. KEY_READ);
  359. if ( NetIFNetworkKey == NULL ) {
  360. ClRtlLogPrint(LOG_CRITICAL,
  361. "[JOIN] Failed to open key for network %1!ws!, error %2!u!.\n",
  362. NetIFNetwork, status
  363. );
  364. goto ConnectToSponsor;
  365. }
  366. status = DmQueryDword(NetIFNetworkKey,
  367. CLUSREG_NAME_NET_PRIORITY,
  368. &NetIFNetworkPriority,
  369. 0);
  370. if ( status != ERROR_SUCCESS ) {
  371. ClRtlLogPrint(LOG_CRITICAL,
  372. "[JOIN] Failed to get NetInterface network priority, error %1!u!.\n",
  373. status);
  374. }
  375. ConnectToSponsor:
  376. //
  377. // attempt the join with this address
  378. //
  379. JoinpConnectToSponsor(NetIFIpAddress,
  380. JOIN_CLIENT_GET_NETWORK_DELAY(NetIFNetworkPriority));
  381. }
  382. }
  383. else {
  384. ClRtlLogPrint(LOG_CRITICAL,
  385. "[JOIN] failed to get NetInterface Node ID, error %1!u!.\n",
  386. status);
  387. }
  388. error_exit:
  389. DmCloseKey(Key);
  390. if ( NetIFNodeID ) {
  391. LocalFree( NetIFNodeID );
  392. }
  393. if ( NetIFIpAddress ) {
  394. LocalFree( NetIFIpAddress );
  395. }
  396. return;
  397. }
  398. VOID
  399. JoinpEnumNodesAndJoinByHostName(
  400. IN HDMKEY Key,
  401. IN PWSTR NodeId,
  402. IN PVOID Context
  403. )
  404. /*++
  405. Routine Description:
  406. Attempts to establish an RPC connection to a specified node using
  407. its host name
  408. Arguments:
  409. Key - pointer to the node key handle
  410. NodeId - pointer to string representing node ID (number)
  411. Context - pointer to a location to return the final status
  412. Return Value:
  413. None
  414. --*/
  415. {
  416. DWORD status;
  417. LPWSTR nodeName=NULL;
  418. DWORD nodeNameLen=0;
  419. DWORD nodeNameSize=0;
  420. //
  421. // Try to connect if this is not us
  422. //
  423. if (lstrcmpiW(NodeId, NmLocalNodeIdString) != 0) {
  424. status = DmQuerySz(Key,
  425. CLUSREG_NAME_NODE_NAME,
  426. &nodeName,
  427. &nodeNameLen,
  428. &nodeNameSize);
  429. if (status == ERROR_SUCCESS) {
  430. JoinpConnectToSponsor(nodeName, JOIN_CLIENT_NETWORK_DELAY);
  431. LocalFree(nodeName);
  432. }
  433. }
  434. DmCloseKey(Key);
  435. return;
  436. }
  437. VOID
  438. JoinpConnectToSponsor(
  439. IN PWSTR SponsorName,
  440. IN DWORD Delay
  441. )
  442. /*++
  443. Routine Description:
  444. Attempts to establish an RPC connection to a specified node.
  445. Arguments:
  446. SponsorName - The name (or IP address) of the target sponsor.
  447. Delay - Milliseconds to wait before sending request
  448. Return Value:
  449. ERROR_SUCCESS if an RPC connection is successfully made to the sponsor.
  450. An RPC error code otherwise.
  451. --*/
  452. {
  453. HANDLE threadHandle;
  454. DWORD status = ERROR_SUCCESS;
  455. DWORD threadId;
  456. PJOIN_SPONSOR_CONTEXT context;
  457. BOOL setEvent = FALSE;
  458. ClRtlLogPrint(LOG_UNUSUAL,
  459. "[JOIN] Spawning thread to connect to sponsor %1!ws!\n",
  460. SponsorName
  461. );
  462. //
  463. // Allocate the context and sponsor name buffer separately. If this
  464. // thread "wins" sponsorship, the name buffer will be reused.
  465. //
  466. context = LocalAlloc( LMEM_FIXED | LMEM_ZEROINIT, sizeof(JOIN_SPONSOR_CONTEXT) );
  467. if (context != NULL) {
  468. context->Name = LocalAlloc( LMEM_FIXED | LMEM_ZEROINIT,
  469. (lstrlenW(SponsorName) + 1 ) * sizeof(WCHAR) );
  470. if (context->Name != NULL) {
  471. lstrcpyW(context->Name, SponsorName);
  472. context->Delay = Delay;
  473. CsJoinThreadCount++;
  474. threadHandle = CreateThread(
  475. NULL,
  476. 0,
  477. JoinpConnectThread,
  478. context,
  479. 0,
  480. &threadId
  481. );
  482. if (threadHandle != NULL) {
  483. CloseHandle(threadHandle);
  484. }
  485. else {
  486. status = GetLastError();
  487. ClRtlLogPrint(LOG_CRITICAL,
  488. "[JOIN] Failed to spawn connect thread, error %1!u!.\n",
  489. status
  490. );
  491. --CsJoinThreadCount;
  492. LocalFree(context->Name);
  493. LocalFree(context);
  494. }
  495. }
  496. else {
  497. LocalFree(context);
  498. ClRtlLogPrint(LOG_CRITICAL,
  499. "[JOIN] Failed to allocate memory for sponsor name.\n"
  500. );
  501. }
  502. }
  503. else {
  504. ClRtlLogPrint(LOG_CRITICAL,
  505. "[JOIN] Failed to allocate memory.\n"
  506. );
  507. }
  508. return;
  509. } // JoinpConnectToSponsor
  510. DWORD WINAPI
  511. VerifyJoinVersionData(
  512. LPWSTR sponsorName
  513. )
  514. /*++
  515. Routine Description:
  516. Verify that the sponsor and the joiner are compatible
  517. Arguments:
  518. sponsorName - pointer to text string of sponsor to use
  519. Return Value:
  520. ERROR_SUCCESS - if ok to continue join
  521. --*/
  522. {
  523. DWORD status;
  524. LPWSTR bindingString = NULL;
  525. RPC_BINDING_HANDLE bindingHandle = NULL;
  526. DWORD SponsorNodeId;
  527. DWORD ClusterHighestVersion;
  528. DWORD ClusterLowestVersion;
  529. DWORD JoinStatus = ERROR_SUCCESS;
  530. DWORD packageIndex;
  531. //
  532. // Attempt to connect to the sponsor's JoinVersion RPC interface.
  533. //
  534. status = RpcStringBindingComposeW(
  535. L"6e17aaa0-1a47-11d1-98bd-0000f875292e",
  536. L"ncadg_ip_udp",
  537. sponsorName,
  538. NULL,
  539. NULL,
  540. &bindingString);
  541. if (status != RPC_S_OK) {
  542. ClRtlLogPrint(LOG_UNUSUAL,
  543. "[JOIN] Unable to compose JoinVersion string binding for sponsor %1!ws!, status %2!u!.\n",
  544. sponsorName,
  545. status
  546. );
  547. goto error_exit;
  548. }
  549. status = RpcBindingFromStringBindingW(bindingString, &bindingHandle);
  550. RpcStringFreeW(&bindingString);
  551. if (status != RPC_S_OK) {
  552. ClRtlLogPrint(LOG_UNUSUAL,
  553. "[JOIN] Unable to build JoinVersion binding for sponsor %1!ws!, status %2!u!.\n",
  554. sponsorName,
  555. status
  556. );
  557. goto error_exit;
  558. }
  559. //
  560. // under load, the sponsor might take a while to respond back to the
  561. // joiner. The default timeout is at 30 secs and this seems to work
  562. // ok. Note that this means the sponsor has 30 secs to reply to either
  563. // the RPC request or ping. As long it makes any reply, then the joiner's
  564. // RPC will continue to wait and not time out the sponsor.
  565. //
  566. status = RpcMgmtSetComTimeout( bindingHandle, CLUSTER_JOINVERSION_RPC_COM_TIMEOUT );
  567. if (status != RPC_S_OK) {
  568. ClRtlLogPrint(LOG_UNUSUAL,
  569. "[JOIN] Unable to set JoinVersion com timeout for sponsor %1!ws!, status %2!u!.\n",
  570. sponsorName,
  571. status
  572. );
  573. }
  574. status = RpcEpResolveBinding(bindingHandle, JoinVersion_v2_0_c_ifspec);
  575. if (status != RPC_S_OK) {
  576. if ( (status == RPC_S_SERVER_UNAVAILABLE) ||
  577. (status == RPC_S_NOT_LISTENING) ||
  578. (status == EPT_S_NOT_REGISTERED)
  579. )
  580. {
  581. ClRtlLogPrint(LOG_NOISE,
  582. "[JOIN] Sponsor %1!ws! is not available (JoinVersion), status=%2!u!.\n",
  583. sponsorName,
  584. status
  585. );
  586. }
  587. else {
  588. ClRtlLogPrint(LOG_UNUSUAL,
  589. "[JOIN] Unable to resolve JoinVersion endpoint for sponsor %1!ws!, status %2!u!.\n",
  590. sponsorName,
  591. status
  592. );
  593. }
  594. goto error_exit;
  595. }
  596. //
  597. // run through the list of RPC security packages, trying to establish a
  598. // security context with this binding.
  599. //
  600. for (packageIndex = 0;
  601. packageIndex < CsNumberOfRPCSecurityPackages;
  602. ++packageIndex )
  603. {
  604. status = RpcBindingSetAuthInfoW(bindingHandle,
  605. CsServiceDomainAccount,
  606. RPC_C_AUTHN_LEVEL_CONNECT,
  607. CsRPCSecurityPackage[ packageIndex ],
  608. NULL,
  609. RPC_C_AUTHZ_NAME);
  610. if (status != RPC_S_OK) {
  611. ClRtlLogPrint(LOG_UNUSUAL,
  612. "[JOIN] Unable to set JoinVersion AuthInfo using %1!ws! package, status %2!u!.\n",
  613. CsRPCSecurityPackageName[packageIndex],
  614. status);
  615. continue;
  616. }
  617. status = CsRpcGetJoinVersionData(bindingHandle,
  618. NmLocalNodeId,
  619. CsMyHighestVersion,
  620. CsMyLowestVersion,
  621. &SponsorNodeId,
  622. &ClusterHighestVersion,
  623. &ClusterLowestVersion,
  624. &JoinStatus);
  625. if ( status == RPC_S_OK ) {
  626. break;
  627. } else {
  628. ClRtlLogPrint(LOG_UNUSUAL,
  629. "[JOIN] Unable to get join version data from sponsor %1!ws! using "
  630. "%2!ws! package, status %3!u!.\n",
  631. sponsorName,
  632. CsRPCSecurityPackageName[packageIndex],
  633. status);
  634. }
  635. }
  636. //
  637. // jump out now if nothing work (as in the case of a form)
  638. //
  639. if ( status != ERROR_SUCCESS ) {
  640. goto error_exit;
  641. }
  642. //
  643. // use the join lock to set the RPC package index
  644. //
  645. EnterCriticalSection( &CsJoinLock );
  646. if ( CsRPCSecurityPackageIndex < 0 ) {
  647. CsRPCSecurityPackageIndex = packageIndex;
  648. }
  649. LeaveCriticalSection( &CsJoinLock );
  650. //
  651. // check the sponsor was in agreement with the join
  652. //
  653. if ( JoinStatus != ERROR_SUCCESS ) {
  654. ClRtlLogPrint(LOG_UNUSUAL,
  655. "[JOIN] Sponsor %1!ws! has discontinued join, status %2!u!.\n",
  656. sponsorName,
  657. JoinStatus);
  658. if (JoinStatus == ERROR_CLUSTER_INCOMPATIBLE_VERSIONS)
  659. {
  660. ClRtlLogPrint(LOG_CRITICAL,
  661. "[JOIN] Join version data from sponsor %1!ws! doesn't match: JH: 0x%2!08X! JL: 0x%3!08X! SH: 0x%4!08X! SL: 0x%5!08X!.\n",
  662. sponsorName,
  663. CsMyHighestVersion,
  664. CsMyLowestVersion,
  665. ClusterHighestVersion,
  666. ClusterLowestVersion);
  667. //
  668. // rajdas: In this case I have managed to contact a sponsor, but there is a version mismatch. If all the join
  669. // threads meet the same fate, clussvc should not try to form a cluster.
  670. // BUG ID: 152229
  671. //
  672. CsJoinStatus = ERROR_CLUSTER_INCOMPATIBLE_VERSIONS;
  673. }
  674. goto error_exit;
  675. }
  676. // SS: we will leave this check because win2K clusters didnt do the
  677. // server side check, so the client must continue to do it
  678. //
  679. // now check that it is ok to join. We want this node to run
  680. // at the highest level of compatibility possible. One of the
  681. // following conditions must be true:
  682. //
  683. // 1) the High versions match exactly (major and build number)
  684. // 2) our Highest matches the sponsor's Lowest exactly, downgrading
  685. // the sponsor to our level of compatibility
  686. // 3) our Lowest matches the sponsor's Highest, downgrading ourselves
  687. // to the sponsor's level of compatibility
  688. //
  689. // note that the minor (build) version must match as well. The previous
  690. // version numbers are "well known" and shouldn't change when a newer
  691. // version is available/implemented.
  692. //
  693. if ( CsMyHighestVersion == ClusterHighestVersion ||
  694. CsMyHighestVersion == ClusterLowestVersion ||
  695. CsMyLowestVersion == ClusterHighestVersion
  696. #if 1 // CLUSTER_BETA
  697. || CsNoVersionCheck
  698. #endif
  699. )
  700. {
  701. status = ERROR_SUCCESS;
  702. } else {
  703. ClRtlLogPrint(LOG_CRITICAL,
  704. "[JOIN] Join version data from sponsor %1!ws! doesn't match: JH: 0x%2!08X! JL: 0x%3!08X! SH: 0x%4!08X! SL: 0x%5!08X!.\n",
  705. sponsorName,
  706. CsMyHighestVersion,
  707. CsMyLowestVersion,
  708. ClusterHighestVersion,
  709. ClusterLowestVersion);
  710. status = ERROR_CLUSTER_INCOMPATIBLE_VERSIONS;
  711. //
  712. // rajdas: In this case I have managed to contact a sponsor, but there is a version mismatch. If all the join
  713. // threads meet the same fate, clussvc should not try to form a cluster.
  714. // BUG ID: 152229
  715. //
  716. CsJoinStatus = ERROR_CLUSTER_INCOMPATIBLE_VERSIONS;
  717. }
  718. error_exit:
  719. if (bindingHandle != NULL) {
  720. RpcBindingFree(&bindingHandle);
  721. }
  722. return status;
  723. }
  724. DWORD WINAPI
  725. JoinpConnectThread(
  726. LPVOID Parameter
  727. )
  728. {
  729. PJOIN_SPONSOR_CONTEXT context = (PJOIN_SPONSOR_CONTEXT) Parameter;
  730. LPWSTR sponsorName = context->Name;
  731. DWORD status;
  732. LPWSTR bindingString = NULL;
  733. RPC_BINDING_HANDLE bindingHandle = NULL;
  734. BOOL setEvent = FALSE;
  735. //
  736. // Sleep for the specified delay.
  737. //
  738. if (context->Delay > 0) {
  739. Sleep(context->Delay);
  740. }
  741. //
  742. // No need to send a sponsorship request if a sponsor has
  743. // already been chosen.
  744. //
  745. if (CsJoinSponsorBinding != NULL) {
  746. ClRtlLogPrint(LOG_UNUSUAL,
  747. "[JOIN] No need to ask %1!ws! to sponsor us after delay of %2!u! milliseconds.\n",
  748. sponsorName, context->Delay
  749. );
  750. status = RPC_S_CALL_FAILED_DNE;
  751. goto error_exit;
  752. }
  753. //
  754. // Try to connect to the specified node.
  755. //
  756. ClRtlLogPrint(LOG_UNUSUAL,
  757. "[JOIN] Asking %1!ws! to sponsor us after delay of %2!u! milliseconds.\n",
  758. sponsorName, context->Delay
  759. );
  760. //
  761. // connect to the JoinVersion interface first to see if we should progress
  762. // any further. since this is the first RPC call to the other node, we can
  763. // determine which security package should be used for the other interfaces.
  764. //
  765. status = VerifyJoinVersionData( sponsorName );
  766. if (status != ERROR_SUCCESS) {
  767. ClRtlLogPrint(LOG_UNUSUAL,
  768. "[JOIN] JoinVersion data for sponsor %1!ws! is invalid, status %2!u!.\n",
  769. sponsorName,
  770. status
  771. );
  772. goto error_exit;
  773. }
  774. //
  775. // Attempt to connect to the sponsor's extrocluster (join) RPC interface.
  776. //
  777. status = RpcStringBindingComposeW(
  778. L"ffe561b8-bf15-11cf-8c5e-08002bb49649",
  779. L"ncadg_ip_udp",
  780. sponsorName,
  781. NULL,
  782. NULL,
  783. &bindingString);
  784. if (status != RPC_S_OK) {
  785. ClRtlLogPrint(LOG_UNUSUAL,
  786. "[JOIN] Unable to compose ExtroCluster string binding for sponsor %1!ws!, status %2!u!.\n",
  787. sponsorName,
  788. status
  789. );
  790. goto error_exit;
  791. }
  792. status = RpcBindingFromStringBindingW(bindingString, &bindingHandle);
  793. RpcStringFreeW(&bindingString);
  794. if (status != RPC_S_OK) {
  795. ClRtlLogPrint(LOG_UNUSUAL,
  796. "[JOIN] Unable to build ExtroCluster binding for sponsor %1!ws!, status %2!u!.\n",
  797. sponsorName,
  798. status
  799. );
  800. goto error_exit;
  801. }
  802. //
  803. // under load, the sponsor might take a while to respond back to the
  804. // joiner. The default timeout is at 30 secs and this seems to work
  805. // ok. Note that this means the sponsor has 30 secs to reply to either
  806. // the RPC request or ping. As long it makes any reply, then the joiner's
  807. // RPC will continue to wait and not time out the sponsor.
  808. //
  809. status = RpcMgmtSetComTimeout( bindingHandle, CLUSTER_EXTROCLUSTER_RPC_COM_TIMEOUT );
  810. if (status != RPC_S_OK) {
  811. ClRtlLogPrint(LOG_UNUSUAL,
  812. "[JOIN] Unable to set ExtroCluster com timeout for sponsor %1!ws!, status %2!u!.\n",
  813. sponsorName,
  814. status
  815. );
  816. }
  817. status = RpcEpResolveBinding(bindingHandle, ExtroCluster_v2_0_c_ifspec);
  818. if (status != RPC_S_OK) {
  819. if ( (status == RPC_S_SERVER_UNAVAILABLE) ||
  820. (status == RPC_S_NOT_LISTENING) ||
  821. (status == EPT_S_NOT_REGISTERED)
  822. )
  823. {
  824. ClRtlLogPrint(LOG_NOISE,
  825. "[JOIN] Sponsor %1!ws! is not available (ExtroCluster), status=%2!u!.\n",
  826. sponsorName,
  827. status
  828. );
  829. }
  830. else {
  831. ClRtlLogPrint(LOG_UNUSUAL,
  832. "[JOIN] Unable to resolve ExtroCluster endpoint for sponsor %1!ws!, status %2!u!.\n",
  833. sponsorName,
  834. status
  835. );
  836. }
  837. goto error_exit;
  838. }
  839. //
  840. // establish a security context with this binding.
  841. //
  842. status = RpcBindingSetAuthInfoW(bindingHandle,
  843. CsServiceDomainAccount,
  844. RPC_C_AUTHN_LEVEL_CONNECT,
  845. CsRPCSecurityPackage[ CsRPCSecurityPackageIndex ],
  846. NULL,
  847. RPC_C_AUTHZ_NAME);
  848. if (status != RPC_S_OK) {
  849. ClRtlLogPrint(LOG_UNUSUAL,
  850. "[JOIN] Unable to set ExtroCluster AuthInfo using %1!ws! package, status %2!u!.\n",
  851. CsRPCSecurityPackageName[ CsRPCSecurityPackageIndex ],
  852. status);
  853. goto error_exit;
  854. }
  855. error_exit:
  856. EnterCriticalSection(&CsJoinLock);
  857. if (status == RPC_S_OK) {
  858. if (CsJoinSponsorBinding == NULL) {
  859. //
  860. // This is the first successful connection.
  861. //
  862. ClRtlLogPrint(LOG_UNUSUAL,
  863. "[JOIN] Selecting %1!ws! as join sponsor.\n",
  864. sponsorName
  865. );
  866. CsJoinSponsorBinding = bindingHandle;
  867. bindingHandle = NULL;
  868. CsJoinSponsorName = sponsorName;
  869. sponsorName = NULL;
  870. SetEvent(CsJoinEvent);
  871. }
  872. else {
  873. ClRtlLogPrint(LOG_NOISE,
  874. "[JOIN] Closing connection to sponsor %1!ws!.\n",
  875. sponsorName
  876. );
  877. }
  878. }
  879. if (--CsJoinThreadCount == 0) {
  880. CloseHandle(CsJoinEvent);
  881. DeleteCriticalSection(&CsJoinLock);
  882. }
  883. else if (CsJoinThreadCount == 1) {
  884. SetEvent(CsJoinEvent);
  885. LeaveCriticalSection(&CsJoinLock);
  886. }
  887. else
  888. LeaveCriticalSection(&CsJoinLock);
  889. if (bindingHandle != NULL) {
  890. RpcBindingFree(&bindingHandle);
  891. }
  892. if (sponsorName != NULL) {
  893. LocalFree(sponsorName);
  894. }
  895. LocalFree(context);
  896. return(status);
  897. } // JoinpConnectThread
  898. DWORD
  899. JoinpAttemptJoin(
  900. LPWSTR SponsorName,
  901. RPC_BINDING_HANDLE JoinMasterBinding
  902. )
  903. /*++
  904. Routine Description:
  905. Called to attempt to join a cluster that already exists.
  906. Arguments:
  907. SponsorName - The name (or IP address) of the target sponsor.
  908. JoinMasterBinding - RPC binding to use to perform join.
  909. Return Value:
  910. ERROR_SUCCESS if successful
  911. Win32 error code otherwise.
  912. --*/
  913. {
  914. DWORD Status;
  915. NET_API_STATUS netStatus;
  916. LPTIME_OF_DAY_INFO tod = NULL;
  917. SYSTEMTIME systemTime;
  918. PNM_NETWORK network;
  919. DWORD startseq, endseq;
  920. #ifdef CLUSTER_TESTPOINT
  921. TESTPT(TpFailNmJoinCluster) {
  922. Status = 999999;
  923. goto error_exit;
  924. }
  925. #endif
  926. Status = NmJoinCluster(JoinMasterBinding);
  927. if (Status != ERROR_SUCCESS) {
  928. ClRtlLogPrint(LOG_UNUSUAL,
  929. "[JOIN] NmJoinCluster failed, status %1!u!.\n",
  930. Status
  931. );
  932. goto error_exit;
  933. }
  934. //
  935. // Synchronize the registry database
  936. //
  937. #ifdef CLUSTER_TESTPOINT
  938. TESTPT(TpFailDmJoin) {
  939. Status = 999999;
  940. goto error_exit;
  941. }
  942. #endif
  943. Status = DmJoin(JoinMasterBinding, &startseq);
  944. if (Status != ERROR_SUCCESS) {
  945. ClRtlLogPrint(LOG_CRITICAL,
  946. "[JOIN] DmJoin failed, error %1!d!\n",
  947. Status);
  948. goto error_exit;
  949. }
  950. //
  951. // Initialize the event handler, needs to register with gum for cluster wide
  952. //events.
  953. Status = EpInitPhase1();
  954. if ( Status != ERROR_SUCCESS) {
  955. ClRtlLogPrint(LOG_CRITICAL,
  956. "[JOIN] EpInitPhase1 failed, Status = %1!u!\n",
  957. Status);
  958. return(Status);
  959. }
  960. #ifdef CLUSTER_TESTPOINT
  961. TESTPT(TpFailApiInitPhase1) {
  962. Status = 999999;
  963. goto error_exit;
  964. }
  965. #endif
  966. //
  967. // Bring the API online in read-only mode. There is no join phase for
  968. // the API. The API is required by FmOnline, which starts the
  969. // resource monitor.
  970. //
  971. Status = ApiOnlineReadOnly();
  972. if ( Status != ERROR_SUCCESS) {
  973. ClRtlLogPrint(LOG_CRITICAL,
  974. "[JOIN] ApiOnlineReadOnly failed, error = %1!u!\n",
  975. Status);
  976. goto error_exit;
  977. }
  978. #ifdef CLUSTER_TESTPOINT
  979. TESTPT(TpFailFmJoinPhase1) {
  980. Status = 999999;
  981. goto error_exit;
  982. }
  983. #endif
  984. //update status for scm
  985. CsServiceStatus.dwCheckPoint++;
  986. CsAnnounceServiceStatus();
  987. //
  988. // Resynchronize the FM. We cannot enable the Groups until after the
  989. // the API is fully operational. See below.
  990. //
  991. Status = FmJoinPhase1(&endseq);
  992. if (Status != ERROR_SUCCESS) {
  993. ClRtlLogPrint(LOG_CRITICAL,
  994. "[JOIN] FmJoinPhase1 failed, error %1!d!\n",
  995. Status);
  996. goto error_exit;
  997. }
  998. #ifdef CLUSTER_TESTPOINT
  999. TESTPT(TpFailDmUpdateJoinCluster) {
  1000. Status = 999999;
  1001. goto error_exit;
  1002. }
  1003. #endif
  1004. // Call the DM to hook the notifications for quorum resource and
  1005. //event handler
  1006. Status = DmUpdateJoinCluster();
  1007. if (Status != ERROR_SUCCESS)
  1008. {
  1009. ClRtlLogPrint(LOG_CRITICAL,
  1010. "[JOIN] DmUpdateJoin failed, error = %1!u!\n",
  1011. Status);
  1012. goto error_exit;
  1013. }
  1014. #ifdef CLUSTER_TESTPOINT
  1015. TESTPT(TpFailNmJoinComplete) {
  1016. Status = 999999;
  1017. goto error_exit;
  1018. }
  1019. #endif
  1020. //
  1021. // We are now fully online, call NM to globally change our state.
  1022. //
  1023. Status = NmJoinComplete(&endseq);
  1024. if (Status != ERROR_SUCCESS) {
  1025. ClRtlLogPrint(LOG_CRITICAL,
  1026. "[JOIN] NmJoinComplete failed, error %1!d!\n",
  1027. Status);
  1028. goto error_exit;
  1029. }
  1030. //perform the fixup for the AdminExt value on both Nt4 and Nt5 nodes.
  1031. Status=FmFixupAdminExt();
  1032. if (Status != ERROR_SUCCESS) {
  1033. ClRtlLogPrint(LOG_CRITICAL,
  1034. "[JOIN] FmFixupAdminExt failed, error %1!d!\n",
  1035. Status);
  1036. goto error_exit;
  1037. }
  1038. //perform the fixups after the registry is downloaded
  1039. //walk the list of fixups
  1040. Status = NmPerformFixups(NM_JOIN_FIXUP);
  1041. if (Status != ERROR_SUCCESS) {
  1042. ClRtlLogPrint(LOG_CRITICAL,
  1043. "[JOIN] NmPerformFixups failed, error %1!d!\n",
  1044. Status);
  1045. goto error_exit;
  1046. }
  1047. #ifdef CLUSTER_TESTPOINT
  1048. TESTPT(TpFailApiInitPhase2) {
  1049. Status = 999999;
  1050. goto error_exit;
  1051. }
  1052. #endif
  1053. //
  1054. // Finally enable the full API.
  1055. //
  1056. Status = ApiOnline();
  1057. if ( Status != ERROR_SUCCESS) {
  1058. ClRtlLogPrint(LOG_CRITICAL,
  1059. "[JOIN] ApiOnline failed, error = %1!u!\n",
  1060. Status);
  1061. goto error_exit;
  1062. }
  1063. #ifdef CLUSTER_TESTPOINT
  1064. TESTPT(TpFailFmJoinPhase2) {
  1065. Status = 999999;
  1066. goto error_exit;
  1067. }
  1068. #endif
  1069. //update status for scm
  1070. CsServiceStatus.dwCheckPoint++;
  1071. CsAnnounceServiceStatus();
  1072. //
  1073. // Call back the Failover Manager to enable and move groups.
  1074. // The full registry is now available, so all groups/resources/resource
  1075. // types can be created (since they use the registry calls).
  1076. //
  1077. Status = FmJoinPhase2();
  1078. if (Status != ERROR_SUCCESS) {
  1079. ClRtlLogPrint(LOG_CRITICAL,
  1080. "[JOIN] FmJoinPhase2 failed, status %1!d!.\n",
  1081. Status);
  1082. goto error_exit;
  1083. }
  1084. #ifdef CLUSTER_TESTPOINT
  1085. TESTPT(TpFailEvInitialize) {
  1086. Status = 999999;
  1087. goto error_exit;
  1088. }
  1089. #endif
  1090. //
  1091. // Finish initializing the cluster wide event logging
  1092. //
  1093. // ASSUMPTION: this is called after the NM has established cluster
  1094. // membership.
  1095. //
  1096. if (!CsNoRepEvtLogging)
  1097. {
  1098. Status = EvOnline();
  1099. //if this fails, we still start the cluster service
  1100. if ( Status != ERROR_SUCCESS ) {
  1101. ClRtlLogPrint(LOG_CRITICAL,
  1102. "[JOIN] Error calling EvOnline, Status = %1!u!\n",
  1103. Status);
  1104. }
  1105. }
  1106. return(ERROR_SUCCESS);
  1107. error_exit:
  1108. ClRtlLogPrint(LOG_NOISE, "[INIT] Cleaning up failed join attempt.\n");
  1109. ClusterLeave();
  1110. return(Status);
  1111. }
  1112. BOOL
  1113. JoinpAddNodeCallback(
  1114. IN PVOID Context1,
  1115. IN PVOID Context2,
  1116. IN PVOID Object,
  1117. IN LPCWSTR Name
  1118. )
  1119. /*++
  1120. Routine Description:
  1121. Callback enumeration routine for adding a new node. This callback
  1122. figures out what node IDs are available.
  1123. Arguments:
  1124. Context1 - Supplies a pointer to an array of BOOLs. The node ID for
  1125. the enumerated node is set to FALSE.
  1126. Context2 - Not used.
  1127. Object - A pointer to the node object.
  1128. Name - The node name.
  1129. Return Value:
  1130. TRUE
  1131. --*/
  1132. {
  1133. PBOOL Avail;
  1134. DWORD Id;
  1135. Id = NmGetNodeId(Object);
  1136. CL_ASSERT(NmIsValidNodeId(Id));
  1137. Avail = (PBOOL)Context1;
  1138. Avail[Id] = FALSE;
  1139. return(TRUE);
  1140. }