/*++

Copyright (c) 1996  Microsoft Corporation

Module Name:

    join.c

Abstract:

    This module handles the initialization path where a newly booted
    node joins an existing cluster.

Author:

    John Vert (jvert) 6/6/1996

Revision History:

--*/

#include "initp.h"
#include "lmcons.h"
#include "lmremutl.h"
#include "lmapibuf.h"

/*
 * NOTE(review): the original file has one more #include directive at this
 * point. Its header name is not recoverable from this copy of the file and
 * must be restored from source control.
 */

#define JOIN_CLIENT_NO_DELAY        0       // delay for high-prio networks
#define JOIN_CLIENT_NETWORK_DELAY   1000    // delay for low-prio networks
#define JOIN_CLIENT_RESOURCE_DELAY  2000    // delay for cluster IP/netname

#define JOIN_CLIENT_GET_NETWORK_DELAY(_NetworkPrio) \
    (((_NetworkPrio) == 1) ? JOIN_CLIENT_NO_DELAY : JOIN_CLIENT_NETWORK_DELAY)

//
// Local types
//

//
// Per-thread argument block handed to JoinpConnectThread.
//
typedef struct {
    DWORD   Delay;      // milliseconds to sleep before contacting the sponsor
    LPWSTR  Name;       // LocalAlloc'ed copy of the sponsor name or address
} JOIN_SPONSOR_CONTEXT, *PJOIN_SPONSOR_CONTEXT;

//
// Local data
//
CRITICAL_SECTION    CsJoinLock;
HANDLE              CsJoinEvent = NULL;
DWORD               CsJoinThreadCount = 0;
DWORD               CsJoinStatus=ERROR_SUCCESS;
RPC_BINDING_HANDLE  CsJoinSponsorBinding = NULL;
LPWSTR              CsJoinSponsorName = NULL;

// While another node is joining, we will keep track of any DM or FM updates.
BOOL                CsDmOrFmHasChanged = FALSE;

//
// Local function prototypes
//
VOID
JoinpEnumNodesAndJoinByAddress(
    IN HDMKEY Key,
    IN PWSTR NodeId,
    IN PVOID Context
    );

VOID
JoinpEnumNodesAndJoinByHostName(
    IN HDMKEY Key,
    IN PWSTR NodeId,
    IN PVOID Context
    );

VOID
JoinpConnectToSponsor(
    IN PWSTR SponsorName,
    IN DWORD Delay
    );

DWORD WINAPI
JoinpConnectThread(
    LPVOID Parameter
    );

DWORD
JoinpAttemptJoin(
    LPWSTR SponsorName,
    RPC_BINDING_HANDLE JoinMasterBinding
    );

BOOL
JoinpAddNodeCallback(
    IN PVOID Context1,
    IN PVOID Context2,
    IN PVOID Object,
    IN LPCWSTR Name
    );

BOOL
JoinpEnumNetworksToSetPriority(
    IN PVOID Context1,
    IN PVOID Context2,
    IN PVOID Object,
    IN LPCWSTR Name
    );


DWORD
ClusterJoin(
    VOID
    )
/*++

Routine Description:

    Called to attempt to join a cluster that already exists.

    The routine reads the cluster IP address from the local copy of the
    cluster database, spawns one connect thread per candidate sponsor
    (every remote network interface address, every remote node name, and
    the cluster IP address), waits until one thread has produced a
    binding or all of them have finished, and then runs the join
    sequence against the selected sponsor.

Arguments:

    None

Return Value:

    ERROR_SUCCESS if successful

    Win32 error code otherwise.

--*/
{
    DWORD   Status;
    LPWSTR  ClusterIpAddress = NULL;
    LPWSTR  ClusIpAddrResource = NULL;
    LPWSTR  ClusterNameId = NULL;
    DWORD   idMaxSize = 0;
    DWORD   idSize = 0;
    HDMKEY  hClusNameResKey = NULL;
    HDMKEY  hClusIPAddrResKey = NULL;

    //
    // Try connecting using the cluster IP address first. get the cluster
    // name resource, looking up its dependency for the cluster IP addr
    //
    Status = DmQuerySz(DmClusterParametersKey,
                       CLUSREG_NAME_CLUS_CLUSTER_NAME_RES,
                       &ClusterNameId,
                       &idMaxSize,
                       &idSize);

    if (Status != ERROR_SUCCESS) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] failed to get cluster name resource, error %1!u!.\n",
                      Status);
        goto error_exit;
    }

    //
    // open name resource key and read its DependsOn key
    //
    hClusNameResKey = DmOpenKey( DmResourcesKey, ClusterNameId, KEY_READ );

    if ( hClusNameResKey == NULL ) {
        Status = GetLastError();
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] failed to open Cluster Name resource key, error %1!u!.\n",
                      Status);
        goto error_exit;
    }

    //
    // allocate enough space for the GUID and the Parameters string
    //
    idMaxSize = ( CS_NETWORK_ID_LENGTH + sizeof( CLUSREG_KEYNAME_PARAMETERS ) + 2)
        * sizeof(WCHAR);
    ClusIpAddrResource = LocalAlloc( LMEM_FIXED, idMaxSize );

    if ( ClusIpAddrResource == NULL ) {
        Status = ERROR_NOT_ENOUGH_MEMORY;
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] no memory for Cluster Ip address resource ID!\n");
        goto error_exit;
    }

    Status = DmQueryMultiSz(hClusNameResKey,
                            CLUSREG_NAME_RES_DEPENDS_ON,
                            &ClusIpAddrResource,
                            &idMaxSize,
                            &idSize);

    if ( Status != ERROR_SUCCESS ) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] failed to get Cluster Ip address resource ID, error %1!u!.\n",
                      Status);
        goto error_exit;
    }

    //
    // Build "<resource id>\Parameters" in place on top of the first string
    // of the multi-sz.
    //
    // NOTE(review): this assumes the buffer handed back by DmQueryMultiSz
    // still has room for the appended suffix (i.e. that it was not
    // reallocated to a tighter size) -- confirm against DmQueryMultiSz.
    //
    lstrcatW( ClusIpAddrResource, L"\\" );
    lstrcatW( ClusIpAddrResource, CLUSREG_KEYNAME_PARAMETERS );

    hClusIPAddrResKey = DmOpenKey( DmResourcesKey, ClusIpAddrResource, KEY_READ );

    if ( hClusIPAddrResKey == NULL ) {
        Status = GetLastError();
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] failed to open Cluster IP Address resource key, error %1!u!.\n",
                      Status);
        goto error_exit;
    }

    //
    // get the IP Address; note that these value names are not defined
    // in a global way. if they are changed, this code will break
    //
    idMaxSize = idSize = 0;
    Status = DmQuerySz(hClusIPAddrResKey,
                       L"Address",
                       &ClusterIpAddress,
                       &idMaxSize,
                       &idSize);

    if ( Status != ERROR_SUCCESS ) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] failed to get Cluster Ip address, error %1!u!.\n",
                      Status);
        goto error_exit;
    }

    //
    // Spawn threads to find a sponsor. We will try to make connections using
    // the cluster IP address, the IP address of each node on each network, and
    // the name of each node in the cluster. The connects will proceed in
    // parallel. We'll use the first one that succeeds.
    //
    CsJoinEvent = CreateEvent(NULL, TRUE, FALSE, NULL);

    if (CsJoinEvent == NULL) {
        Status = GetLastError();
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] failed to create join event, error %1!u!.\n",
                      Status
                      );
        goto error_exit;
    }

    //
    // The count starts at one to account for this thread. Whoever drops
    // the count to zero closes the event and deletes the lock.
    //
    CsJoinThreadCount = 1;
    InitializeCriticalSection(&CsJoinLock);
    EnterCriticalSection(&CsJoinLock);

    DmEnumKeys(DmNetInterfacesKey, JoinpEnumNodesAndJoinByAddress, NULL);

    DmEnumKeys(DmNodesKey, JoinpEnumNodesAndJoinByHostName, NULL);

    //
    // give the other threads a chance to start since using the cluster IP
    // address to join with is problematic when the resource moves in the
    // middle of a join
    //
    JoinpConnectToSponsor(ClusterIpAddress, JOIN_CLIENT_RESOURCE_DELAY);

    //update status for scm
    CsServiceStatus.dwCheckPoint++;
    CsAnnounceServiceStatus();

    //
    // If no connect thread could be started there is nobody left to signal
    // the event, so signal it ourselves.
    //
    if (CsJoinThreadCount == 1) {
        SetEvent(CsJoinEvent);
    }

    LeaveCriticalSection(&CsJoinLock);

    Status = WaitForSingleObject(CsJoinEvent, INFINITE);
    CL_ASSERT(Status == WAIT_OBJECT_0);

    EnterCriticalSection(&CsJoinLock);
    ClRtlLogPrint(LOG_NOISE,
                  "[JOIN] Got out of the join wait, CsJoinThreadCount = %1!u!.\n",
                  CsJoinThreadCount
                  );

    if (--CsJoinThreadCount == 0) {
        CloseHandle(CsJoinEvent);
        DeleteCriticalSection(&CsJoinLock);
    }
    else {
        LeaveCriticalSection(&CsJoinLock);
    }

    //
    // All of the threads have failed or one of them made a connection,
    // use it to join.
    //
    if (CsJoinSponsorBinding != NULL) {
        CL_ASSERT(CsJoinSponsorName != NULL);

        ClRtlLogPrint(LOG_UNUSUAL,
                      "[JOIN] Attempting join with sponsor %1!ws!.\n",
                      CsJoinSponsorName
                      );

        //
        //  Chittur Subbaraman (chitturs) - 10/27/98
        //
        //  If the database restore operation is requested, then
        //  refuse to join the cluster and return an error code.
        //
        if ( CsDatabaseRestore == TRUE ) {
            Status = ERROR_CLUSTER_NODE_UP;
        }
        else {
            Status = JoinpAttemptJoin(CsJoinSponsorName, CsJoinSponsorBinding);
        }

        //
        // Release the sponsor binding and name on both paths. The restore
        // path previously freed only the name and leaked the binding.
        //
        RpcBindingFree(&CsJoinSponsorBinding);
        LocalFree(CsJoinSponsorName);
        CsJoinSponsorName = NULL;
    }
    else {
        //we couldn't create a binding to the sponsor
        if (CsJoinStatus == ERROR_SUCCESS) {
            //we did the version check in JoinpConnectThread but for some reason
            //couldn't produce a binding
            Status = ERROR_BAD_NETPATH;
            ClRtlLogPrint(LOG_CRITICAL,
                          "[JOIN] Unable to connect to any sponsor node.\n");
        }
        else {
            Status = CsJoinStatus;
        }

        //
        // rajdas: If the join did not succeed due to version mismatch we
        // shouldn't try to form a cluster.
        // Bug ID: 152229
        //
        if (CsJoinStatus == ERROR_CLUSTER_INCOMPATIBLE_VERSIONS) {
            bFormCluster = FALSE;
        }
    }

error_exit:
    if ( ClusterNameId ) {
        LocalFree( ClusterNameId );
    }

    if ( ClusterIpAddress ) {
        LocalFree( ClusterIpAddress );
    }

    if ( ClusIpAddrResource ) {
        LocalFree( ClusIpAddrResource );
    }

    if ( hClusNameResKey ) {
        DmCloseKey( hClusNameResKey );
    }

    if ( hClusIPAddrResKey ) {
        DmCloseKey( hClusIPAddrResKey );
    }

    return(Status);
}


VOID
JoinpEnumNodesAndJoinByAddress(
    IN HDMKEY Key,
    IN PWSTR NetInterfaceId,
    IN PVOID Context
    )
/*++

Routine Description:

    Key enumeration callback. Attempts to establish an RPC connection to
    the node that owns the enumerated network interface, using the
    interface's IP address. Interfaces that belong to the local node are
    skipped.

Arguments:

    Key - handle to the enumerated network interface key. It is closed
          by this routine before returning.

    NetInterfaceId - pointer to string representing net IF ID (guid).
                     Not used.

    Context - Not used (ClusterJoin passes NULL).

Return Value:

    None

--*/
{
    DWORD   status;
    LPWSTR  NetIFNodeID = NULL;
    LPWSTR  NetIFIpAddress = NULL;
    LPWSTR  NetIFNetwork = NULL;
    HDMKEY  NetIFNetworkKey = NULL;
    DWORD   NetIFNetworkPriority;
    DWORD   idMaxSize = 0;
    DWORD   idSize = 0;

    //
    // get the NodeId Value from the NetIF key and if it's us,
    // skip this netIF
    //
    status = DmQuerySz(Key,
                       CLUSREG_NAME_NETIFACE_NODE,
                       &NetIFNodeID,
                       &idMaxSize,
                       &idSize);

    if ( status == ERROR_SUCCESS ) {

        if (lstrcmpiW(NetIFNodeID, NmLocalNodeIdString) != 0) {

            //
            // it's not us so get the address and try it...
            //
            idMaxSize = idSize = 0;
            status = DmQuerySz(Key,
                               CLUSREG_NAME_NETIFACE_ADDRESS,
                               &NetIFIpAddress,
                               &idMaxSize,
                               &idSize);

            if ( status != ERROR_SUCCESS ) {
                ClRtlLogPrint(LOG_CRITICAL,
                              "[JOIN] failed to get NetInterface Address, error %1!u!.\n",
                              status);
                goto error_exit;
            }

            //
            // Determine the delay based on the network priority. If we
            // cannot find it in the cluster database, we still try to
            // connect to the sponsor assuming the lowest priority.
            //
            NetIFNetworkPriority = 0xFFFFFFFF;

            idMaxSize = idSize = 0;
            status = DmQuerySz(Key,
                               CLUSREG_NAME_NETIFACE_NETWORK,
                               &NetIFNetwork,
                               &idMaxSize,
                               &idSize);

            if ( status != ERROR_SUCCESS ) {
                ClRtlLogPrint(LOG_CRITICAL,
                              "[JOIN] Failed to get NetInterface Network, error %1!u!.\n",
                              status);
                goto ConnectToSponsor;
            }

            NetIFNetworkKey = DmOpenKey(DmNetworksKey,
                                        NetIFNetwork,
                                        KEY_READ);

            if ( NetIFNetworkKey == NULL ) {
                //
                // Fetch the real failure code; 'status' still holds
                // ERROR_SUCCESS from the query above.
                //
                status = GetLastError();
                ClRtlLogPrint(LOG_CRITICAL,
                              "[JOIN] Failed to open key for network %1!ws!, error %2!u!.\n",
                              NetIFNetwork,
                              status
                              );
                goto ConnectToSponsor;
            }

            status = DmQueryDword(NetIFNetworkKey,
                                  CLUSREG_NAME_NET_PRIORITY,
                                  &NetIFNetworkPriority,
                                  0);

            if ( status != ERROR_SUCCESS ) {
                ClRtlLogPrint(LOG_CRITICAL,
                              "[JOIN] Failed to get NetInterface network priority, error %1!u!.\n",
                              status);
            }

ConnectToSponsor:

            //
            // attempt the join with this address
            //
            JoinpConnectToSponsor(NetIFIpAddress,
                                  JOIN_CLIENT_GET_NETWORK_DELAY(NetIFNetworkPriority));
        }
    }
    else {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] failed to get NetInterface Node ID, error %1!u!.\n",
                      status);
    }

error_exit:
    DmCloseKey(Key);

    //
    // Release everything acquired above, including the network key and
    // network name, which were previously leaked.
    //
    if ( NetIFNetworkKey ) {
        DmCloseKey( NetIFNetworkKey );
    }

    if ( NetIFNodeID ) {
        LocalFree( NetIFNodeID );
    }

    if ( NetIFIpAddress ) {
        LocalFree( NetIFIpAddress );
    }

    if ( NetIFNetwork ) {
        LocalFree( NetIFNetwork );
    }

    return;
}


VOID
JoinpEnumNodesAndJoinByHostName(
    IN HDMKEY Key,
    IN PWSTR NodeId,
    IN PVOID Context
    )
/*++

Routine Description:

    Key enumeration callback. Attempts to establish an RPC connection to
    the enumerated node using its host name. The local node is skipped.

Arguments:

    Key - handle to the enumerated node key. It is closed by this
          routine before returning.

    NodeId - pointer to string representing node ID (number)

    Context - Not used (ClusterJoin passes NULL).

Return Value:

    None

--*/
{
    DWORD   status;
    LPWSTR  nodeName=NULL;
    DWORD   nodeNameLen=0;
    DWORD   nodeNameSize=0;

    //
    // Try to connect if this is not us
    //
    if (lstrcmpiW(NodeId, NmLocalNodeIdString) != 0) {

        status = DmQuerySz(Key,
                           CLUSREG_NAME_NODE_NAME,
                           &nodeName,
                           &nodeNameLen,
                           &nodeNameSize);

        if (status == ERROR_SUCCESS) {

            // JoinpConnectToSponsor makes its own copy of the name.
            JoinpConnectToSponsor(nodeName, JOIN_CLIENT_NETWORK_DELAY);
            LocalFree(nodeName);
        }
    }

    DmCloseKey(Key);

    return;
}


VOID
JoinpConnectToSponsor(
    IN PWSTR SponsorName,
    IN DWORD Delay
    )
/*++

Routine Description:

    Spawns a thread (JoinpConnectThread) that attempts to establish an
    RPC connection to the specified node.

    CsJoinThreadCount is updated here without taking CsJoinLock;
    ClusterJoin holds that lock around every call it makes, directly or
    through the key enumeration callbacks.

Arguments:

    SponsorName - The name (or IP address) of the target sponsor. The
                  string is copied; the caller keeps ownership.

    Delay - Milliseconds to wait before sending request

Return Value:

    None. Failures are logged and no thread is started.

--*/
{
    HANDLE                  threadHandle;
    DWORD                   status = ERROR_SUCCESS;
    DWORD                   threadId;
    PJOIN_SPONSOR_CONTEXT   context;

    ClRtlLogPrint(LOG_UNUSUAL,
                  "[JOIN] Spawning thread to connect to sponsor %1!ws!\n",
                  SponsorName
                  );

    //
    // Allocate the context and sponsor name buffer separately. If this
    // thread "wins" sponsorship, the name buffer will be reused.
    //
    context = LocalAlloc( LMEM_FIXED | LMEM_ZEROINIT,
                          sizeof(JOIN_SPONSOR_CONTEXT) );

    if (context != NULL) {

        context->Name = LocalAlloc( LMEM_FIXED | LMEM_ZEROINIT,
                                    (lstrlenW(SponsorName) + 1 ) * sizeof(WCHAR) );

        if (context->Name != NULL) {

            lstrcpyW(context->Name, SponsorName);
            context->Delay = Delay;

            //
            // Count the thread before it is created so that it cannot
            // drop the count below our own reference.
            //
            CsJoinThreadCount++;

            threadHandle = CreateThread(
                               NULL,
                               0,
                               JoinpConnectThread,
                               context,
                               0,
                               &threadId
                               );

            if (threadHandle != NULL) {
                // The thread frees the context; we only drop the handle.
                CloseHandle(threadHandle);
            }
            else {
                status = GetLastError();
                ClRtlLogPrint(LOG_CRITICAL,
                              "[JOIN] Failed to spawn connect thread, error %1!u!.\n",
                              status
                              );
                --CsJoinThreadCount;
                LocalFree(context->Name);
                LocalFree(context);
            }
        }
        else {
            LocalFree(context);
            ClRtlLogPrint(LOG_CRITICAL,
                          "[JOIN] Failed to allocate memory for sponsor name.\n"
                          );
        }
    }
    else {
        ClRtlLogPrint(LOG_CRITICAL, "[JOIN] Failed to allocate memory.\n" );
    }

    return;

}  // JoinpConnectToSponsor


DWORD WINAPI
VerifyJoinVersionData(
    LPWSTR  sponsorName
    )
/*++

Routine Description:

    Verify that the sponsor and the joiner are compatible. Connects to
    the sponsor's JoinVersion RPC interface, trying each configured RPC
    security package in turn, and compares the version data returned.

    As side effects, records the index of the first working security
    package in CsRPCSecurityPackageIndex (if not already set) and sets
    CsJoinStatus to ERROR_CLUSTER_INCOMPATIBLE_VERSIONS on a version
    mismatch.

Arguments:

    sponsorName - pointer to text string of sponsor to use

Return Value:

    ERROR_SUCCESS - if ok to continue join

    The RPC failure code, the status returned by the sponsor, or
    ERROR_CLUSTER_INCOMPATIBLE_VERSIONS otherwise.

--*/
{
    DWORD               status;
    LPWSTR              bindingString = NULL;
    RPC_BINDING_HANDLE  bindingHandle = NULL;
    DWORD               SponsorNodeId;
    DWORD               ClusterHighestVersion;
    DWORD               ClusterLowestVersion;
    DWORD               JoinStatus = ERROR_SUCCESS;
    DWORD               packageIndex;

    //
    // Attempt to connect to the sponsor's JoinVersion RPC interface.
    //
    status = RpcStringBindingComposeW(
                 L"6e17aaa0-1a47-11d1-98bd-0000f875292e",
                 L"ncadg_ip_udp",
                 sponsorName,
                 NULL,
                 NULL,
                 &bindingString);

    if (status != RPC_S_OK) {
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[JOIN] Unable to compose JoinVersion string binding for sponsor %1!ws!, status %2!u!.\n",
                      sponsorName,
                      status
                      );
        goto error_exit;
    }

    status = RpcBindingFromStringBindingW(bindingString, &bindingHandle);

    RpcStringFreeW(&bindingString);

    if (status != RPC_S_OK) {
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[JOIN] Unable to build JoinVersion binding for sponsor %1!ws!, status %2!u!.\n",
                      sponsorName,
                      status
                      );
        goto error_exit;
    }

    //
    // under load, the sponsor might take a while to respond back to the
    // joiner. The default timeout is at 30 secs and this seems to work
    // ok. Note that this means the sponsor has 30 secs to reply to either
    // the RPC request or ping. As long it makes any reply, then the joiner's
    // RPC will continue to wait and not time out the sponsor.
    //
    status = RpcMgmtSetComTimeout( bindingHandle,
                                   CLUSTER_JOINVERSION_RPC_COM_TIMEOUT );

    if (status != RPC_S_OK) {
        // Not fatal: carry on with the default timeout.
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[JOIN] Unable to set JoinVersion com timeout for sponsor %1!ws!, status %2!u!.\n",
                      sponsorName,
                      status
                      );
    }

    status = RpcEpResolveBinding(bindingHandle, JoinVersion_v2_0_c_ifspec);

    if (status != RPC_S_OK) {
        if ( (status == RPC_S_SERVER_UNAVAILABLE) ||
             (status == RPC_S_NOT_LISTENING) ||
             (status == EPT_S_NOT_REGISTERED)
           )
        {
            ClRtlLogPrint(LOG_NOISE,
                          "[JOIN] Sponsor %1!ws! is not available (JoinVersion), status=%2!u!.\n",
                          sponsorName,
                          status
                          );
        }
        else {
            ClRtlLogPrint(LOG_UNUSUAL,
                          "[JOIN] Unable to resolve JoinVersion endpoint for sponsor %1!ws!, status %2!u!.\n",
                          sponsorName,
                          status
                          );
        }
        goto error_exit;
    }

    //
    // run through the list of RPC security packages, trying to establish a
    // security context with this binding.
    //
    for (packageIndex = 0;
         packageIndex < CsNumberOfRPCSecurityPackages;
         ++packageIndex )
    {
        status = RpcBindingSetAuthInfoW(bindingHandle,
                                        CsServiceDomainAccount,
                                        RPC_C_AUTHN_LEVEL_CONNECT,
                                        CsRPCSecurityPackage[ packageIndex ],
                                        NULL,
                                        RPC_C_AUTHZ_NAME);

        if (status != RPC_S_OK) {
            ClRtlLogPrint(LOG_UNUSUAL,
                          "[JOIN] Unable to set JoinVersion AuthInfo using %1!ws! package, status %2!u!.\n",
                          CsRPCSecurityPackageName[packageIndex],
                          status);
            continue;
        }

        status = CsRpcGetJoinVersionData(bindingHandle,
                                         NmLocalNodeId,
                                         CsMyHighestVersion,
                                         CsMyLowestVersion,
                                         &SponsorNodeId,
                                         &ClusterHighestVersion,
                                         &ClusterLowestVersion,
                                         &JoinStatus);

        if ( status == RPC_S_OK ) {
            break;
        }
        else {
            ClRtlLogPrint(LOG_UNUSUAL,
                          "[JOIN] Unable to get join version data from sponsor %1!ws! using "
                          "%2!ws! package, status %3!u!.\n",
                          sponsorName,
                          CsRPCSecurityPackageName[packageIndex],
                          status);
        }
    }

    //
    // jump out now if nothing worked (as in the case of a form)
    //
    if ( status != ERROR_SUCCESS ) {
        goto error_exit;
    }

    //
    // use the join lock to set the RPC package index
    //
    EnterCriticalSection( &CsJoinLock );

    if ( CsRPCSecurityPackageIndex < 0 ) {
        CsRPCSecurityPackageIndex = packageIndex;
    }

    LeaveCriticalSection( &CsJoinLock );

    //
    // check the sponsor was in agreement with the join
    //
    if ( JoinStatus != ERROR_SUCCESS ) {
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[JOIN] Sponsor %1!ws! has discontinued join, status %2!u!.\n",
                      sponsorName,
                      JoinStatus);

        if (JoinStatus == ERROR_CLUSTER_INCOMPATIBLE_VERSIONS) {
            ClRtlLogPrint(LOG_CRITICAL,
                          "[JOIN] Join version data from sponsor %1!ws! doesn't match: JH: 0x%2!08X! JL: 0x%3!08X! SH: 0x%4!08X! SL: 0x%5!08X!.\n",
                          sponsorName,
                          CsMyHighestVersion,
                          CsMyLowestVersion,
                          ClusterHighestVersion,
                          ClusterLowestVersion);

            //
            // rajdas: In this case I have managed to contact a sponsor, but
            // there is a version mismatch. If all the join threads meet the
            // same fate, clussvc should not try to form a cluster.
            // BUG ID: 152229
            //
            CsJoinStatus = ERROR_CLUSTER_INCOMPATIBLE_VERSIONS;
        }

        //
        // Report the sponsor's refusal to the caller. 'status' is still
        // RPC_S_OK at this point, so returning it unchanged would make
        // the caller treat a sponsor that declined the join as usable.
        //
        status = JoinStatus;
        goto error_exit;
    }

    // SS: we will leave this check because win2K clusters didn't do the
    // server side check, so the client must continue to do it
    //
    // now check that it is ok to join. We want this node to run
    // at the highest level of compatibility possible. One of the
    // following conditions must be true:
    //
    // 1) the High versions match exactly (major and build number)
    // 2) our Highest matches the sponsor's Lowest exactly, downgrading
    //    the sponsor to our level of compatibility
    // 3) our Lowest matches the sponsor's Highest, downgrading ourselves
    //    to the sponsor's level of compatibility
    //
    // note that the minor (build) version must match as well. The previous
    // version numbers are "well known" and shouldn't change when a newer
    // version is available/implemented.
    //
    if ( CsMyHighestVersion == ClusterHighestVersion ||
         CsMyHighestVersion == ClusterLowestVersion  ||
         CsMyLowestVersion == ClusterHighestVersion
#if 1 // CLUSTER_BETA
         || CsNoVersionCheck
#endif
       )
    {
        status = ERROR_SUCCESS;
    }
    else {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] Join version data from sponsor %1!ws! doesn't match: JH: 0x%2!08X! JL: 0x%3!08X! SH: 0x%4!08X! SL: 0x%5!08X!.\n",
                      sponsorName,
                      CsMyHighestVersion,
                      CsMyLowestVersion,
                      ClusterHighestVersion,
                      ClusterLowestVersion);

        status = ERROR_CLUSTER_INCOMPATIBLE_VERSIONS;

        //
        // rajdas: In this case I have managed to contact a sponsor, but
        // there is a version mismatch. If all the join threads meet the
        // same fate, clussvc should not try to form a cluster.
        // BUG ID: 152229
        //
        CsJoinStatus = ERROR_CLUSTER_INCOMPATIBLE_VERSIONS;
    }

error_exit:
    if (bindingHandle != NULL) {
        RpcBindingFree(&bindingHandle);
    }

    return status;
}


DWORD WINAPI
JoinpConnectThread(
    LPVOID Parameter
    )
/*++

Routine Description:

    Thread routine started by JoinpConnectToSponsor. After an optional
    delay it verifies version compatibility with the candidate sponsor
    and builds an authenticated binding to the sponsor's ExtroCluster
    (join) RPC interface. The first thread to succeed publishes its
    binding and name in CsJoinSponsorBinding / CsJoinSponsorName and
    signals CsJoinEvent.

Arguments:

    Parameter - pointer to a JOIN_SPONSOR_CONTEXT. The context, and the
                name buffer unless it was handed over as the sponsor
                name, are freed by this routine.

Return Value:

    RPC_S_OK if a binding to the sponsor was established.

    An error code otherwise.

--*/
{
    PJOIN_SPONSOR_CONTEXT   context = (PJOIN_SPONSOR_CONTEXT) Parameter;
    LPWSTR                  sponsorName = context->Name;
    DWORD                   status;
    LPWSTR                  bindingString = NULL;
    RPC_BINDING_HANDLE      bindingHandle = NULL;

    //
    // Sleep for the specified delay.
    //
    if (context->Delay > 0) {
        Sleep(context->Delay);
    }

    //
    // No need to send a sponsorship request if a sponsor has
    // already been chosen.
    //
    if (CsJoinSponsorBinding != NULL) {
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[JOIN] No need to ask %1!ws! to sponsor us after delay of %2!u! milliseconds.\n",
                      sponsorName,
                      context->Delay
                      );
        status = RPC_S_CALL_FAILED_DNE;
        goto error_exit;
    }

    //
    // Try to connect to the specified node.
    //
    ClRtlLogPrint(LOG_UNUSUAL,
                  "[JOIN] Asking %1!ws! to sponsor us after delay of %2!u! milliseconds.\n",
                  sponsorName,
                  context->Delay
                  );

    //
    // connect to the JoinVersion interface first to see if we should progress
    // any further. since this is the first RPC call to the other node, we can
    // determine which security package should be used for the other interfaces.
    //
    status = VerifyJoinVersionData( sponsorName );

    if (status != ERROR_SUCCESS) {
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[JOIN] JoinVersion data for sponsor %1!ws! is invalid, status %2!u!.\n",
                      sponsorName,
                      status
                      );
        goto error_exit;
    }

    //
    // Attempt to connect to the sponsor's extrocluster (join) RPC interface.
    //
    status = RpcStringBindingComposeW(
                 L"ffe561b8-bf15-11cf-8c5e-08002bb49649",
                 L"ncadg_ip_udp",
                 sponsorName,
                 NULL,
                 NULL,
                 &bindingString);

    if (status != RPC_S_OK) {
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[JOIN] Unable to compose ExtroCluster string binding for sponsor %1!ws!, status %2!u!.\n",
                      sponsorName,
                      status
                      );
        goto error_exit;
    }

    status = RpcBindingFromStringBindingW(bindingString, &bindingHandle);

    RpcStringFreeW(&bindingString);

    if (status != RPC_S_OK) {
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[JOIN] Unable to build ExtroCluster binding for sponsor %1!ws!, status %2!u!.\n",
                      sponsorName,
                      status
                      );
        goto error_exit;
    }

    //
    // under load, the sponsor might take a while to respond back to the
    // joiner. The default timeout is at 30 secs and this seems to work
    // ok. Note that this means the sponsor has 30 secs to reply to either
    // the RPC request or ping. As long it makes any reply, then the joiner's
    // RPC will continue to wait and not time out the sponsor.
    //
    status = RpcMgmtSetComTimeout( bindingHandle,
                                   CLUSTER_EXTROCLUSTER_RPC_COM_TIMEOUT );

    if (status != RPC_S_OK) {
        // Not fatal: carry on with the default timeout.
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[JOIN] Unable to set ExtroCluster com timeout for sponsor %1!ws!, status %2!u!.\n",
                      sponsorName,
                      status
                      );
    }

    status = RpcEpResolveBinding(bindingHandle, ExtroCluster_v2_0_c_ifspec);

    if (status != RPC_S_OK) {
        if ( (status == RPC_S_SERVER_UNAVAILABLE) ||
             (status == RPC_S_NOT_LISTENING) ||
             (status == EPT_S_NOT_REGISTERED)
           )
        {
            ClRtlLogPrint(LOG_NOISE,
                          "[JOIN] Sponsor %1!ws! is not available (ExtroCluster), status=%2!u!.\n",
                          sponsorName,
                          status
                          );
        }
        else {
            ClRtlLogPrint(LOG_UNUSUAL,
                          "[JOIN] Unable to resolve ExtroCluster endpoint for sponsor %1!ws!, status %2!u!.\n",
                          sponsorName,
                          status
                          );
        }
        goto error_exit;
    }

    //
    // establish a security context with this binding.
    //
    status = RpcBindingSetAuthInfoW(bindingHandle,
                                    CsServiceDomainAccount,
                                    RPC_C_AUTHN_LEVEL_CONNECT,
                                    CsRPCSecurityPackage[ CsRPCSecurityPackageIndex ],
                                    NULL,
                                    RPC_C_AUTHZ_NAME);

    if (status != RPC_S_OK) {
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[JOIN] Unable to set ExtroCluster AuthInfo using %1!ws! package, status %2!u!.\n",
                      CsRPCSecurityPackageName[ CsRPCSecurityPackageIndex ],
                      status);
        goto error_exit;
    }

error_exit:

    EnterCriticalSection(&CsJoinLock);

    if (status == RPC_S_OK) {
        if (CsJoinSponsorBinding == NULL) {
            //
            // This is the first successful connection. Hand the binding
            // and the name buffer over to ClusterJoin.
            //
            ClRtlLogPrint(LOG_UNUSUAL,
                          "[JOIN] Selecting %1!ws! as join sponsor.\n",
                          sponsorName
                          );

            CsJoinSponsorBinding = bindingHandle;
            bindingHandle = NULL;
            CsJoinSponsorName = sponsorName;
            sponsorName = NULL;
            SetEvent(CsJoinEvent);
        }
        else {
            ClRtlLogPrint(LOG_NOISE,
                          "[JOIN] Closing connection to sponsor %1!ws!.\n",
                          sponsorName
                          );
        }
    }

    if (--CsJoinThreadCount == 0) {
        //
        // Last one out (ClusterJoin has already dropped its reference):
        // tear down the event and the lock.
        //
        CloseHandle(CsJoinEvent);
        DeleteCriticalSection(&CsJoinLock);
    }
    else if (CsJoinThreadCount == 1) {
        //
        // Only one reference is left. ClusterJoin starts the count at one
        // for itself, so wake it in case no sponsor was ever selected.
        //
        SetEvent(CsJoinEvent);
        LeaveCriticalSection(&CsJoinLock);
    }
    else {
        LeaveCriticalSection(&CsJoinLock);
    }

    if (bindingHandle != NULL) {
        RpcBindingFree(&bindingHandle);
    }

    if (sponsorName != NULL) {
        LocalFree(sponsorName);
    }

    LocalFree(context);

    return(status);

}  // JoinpConnectThread


DWORD
JoinpAttemptJoin(
    LPWSTR SponsorName,
    RPC_BINDING_HANDLE JoinMasterBinding
    )
/*++

Routine Description:

    Called to attempt to join a cluster that already exists. Runs the
    join phases of the cluster components in order. If any required
    phase fails, ClusterLeave is called to clean up and the failure code
    is returned.

Arguments:

    SponsorName - The name (or IP address) of the target sponsor.
                  Not used by this routine.

    JoinMasterBinding - RPC binding to use to perform join.

Return Value:

    ERROR_SUCCESS if successful

    Win32 error code otherwise.

--*/
{
    DWORD   Status;
    DWORD   startseq, endseq;

#ifdef CLUSTER_TESTPOINT
    TESTPT(TpFailNmJoinCluster) {
        Status = 999999;
        goto error_exit;
    }
#endif

    Status = NmJoinCluster(JoinMasterBinding);

    if (Status != ERROR_SUCCESS) {
        ClRtlLogPrint(LOG_UNUSUAL,
                      "[JOIN] NmJoinCluster failed, status %1!u!.\n",
                      Status
                      );
        goto error_exit;
    }

    //
    // Synchronize the registry database
    //
#ifdef CLUSTER_TESTPOINT
    TESTPT(TpFailDmJoin) {
        Status = 999999;
        goto error_exit;
    }
#endif

    Status = DmJoin(JoinMasterBinding, &startseq);

    if (Status != ERROR_SUCCESS) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] DmJoin failed, error %1!d!\n",
                      Status);
        goto error_exit;
    }

    //
    // Initialize the event handler, needs to register with gum for cluster
    // wide events.
    //
    Status = EpInitPhase1();

    if ( Status != ERROR_SUCCESS) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] EpInitPhase1 failed, Status = %1!u!\n",
                      Status);
        //
        // NmJoinCluster and DmJoin have already succeeded, so take the
        // common cleanup path like every other failure in this routine
        // instead of returning directly.
        //
        goto error_exit;
    }

#ifdef CLUSTER_TESTPOINT
    TESTPT(TpFailApiInitPhase1) {
        Status = 999999;
        goto error_exit;
    }
#endif

    //
    // Bring the API online in read-only mode. There is no join phase for
    // the API. The API is required by FmOnline, which starts the
    // resource monitor.
    //
    Status = ApiOnlineReadOnly();

    if ( Status != ERROR_SUCCESS) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] ApiOnlineReadOnly failed, error = %1!u!\n",
                      Status);
        goto error_exit;
    }

#ifdef CLUSTER_TESTPOINT
    TESTPT(TpFailFmJoinPhase1) {
        Status = 999999;
        goto error_exit;
    }
#endif

    //update status for scm
    CsServiceStatus.dwCheckPoint++;
    CsAnnounceServiceStatus();

    //
    // Resynchronize the FM. We cannot enable the Groups until after the
    // the API is fully operational. See below.
    //
    Status = FmJoinPhase1(&endseq);

    if (Status != ERROR_SUCCESS) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] FmJoinPhase1 failed, error %1!d!\n",
                      Status);
        goto error_exit;
    }

#ifdef CLUSTER_TESTPOINT
    TESTPT(TpFailDmUpdateJoinCluster) {
        Status = 999999;
        goto error_exit;
    }
#endif

    //
    // Call the DM to hook the notifications for quorum resource and
    // event handler
    //
    Status = DmUpdateJoinCluster();

    if (Status != ERROR_SUCCESS) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] DmUpdateJoin failed, error = %1!u!\n",
                      Status);
        goto error_exit;
    }

#ifdef CLUSTER_TESTPOINT
    TESTPT(TpFailNmJoinComplete) {
        Status = 999999;
        goto error_exit;
    }
#endif

    //
    // We are now fully online, call NM to globally change our state.
    //
    Status = NmJoinComplete(&endseq);

    if (Status != ERROR_SUCCESS) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] NmJoinComplete failed, error %1!d!\n",
                      Status);
        goto error_exit;
    }

    //perform the fixup for the AdminExt value on both Nt4 and Nt5 nodes.
    Status=FmFixupAdminExt();

    if (Status != ERROR_SUCCESS) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] FmFixupAdminExt failed, error %1!d!\n",
                      Status);
        goto error_exit;
    }

    //perform the fixups after the registry is downloaded
    //walk the list of fixups
    Status = NmPerformFixups(NM_JOIN_FIXUP);

    if (Status != ERROR_SUCCESS) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] NmPerformFixups failed, error %1!d!\n",
                      Status);
        goto error_exit;
    }

#ifdef CLUSTER_TESTPOINT
    TESTPT(TpFailApiInitPhase2) {
        Status = 999999;
        goto error_exit;
    }
#endif

    //
    // Finally enable the full API.
    //
    Status = ApiOnline();

    if ( Status != ERROR_SUCCESS) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] ApiOnline failed, error = %1!u!\n",
                      Status);
        goto error_exit;
    }

#ifdef CLUSTER_TESTPOINT
    TESTPT(TpFailFmJoinPhase2) {
        Status = 999999;
        goto error_exit;
    }
#endif

    //update status for scm
    CsServiceStatus.dwCheckPoint++;
    CsAnnounceServiceStatus();

    //
    // Call back the Failover Manager to enable and move groups.
    // The full registry is now available, so all groups/resources/resource
    // types can be created (since they use the registry calls).
    //
    Status = FmJoinPhase2();

    if (Status != ERROR_SUCCESS) {
        ClRtlLogPrint(LOG_CRITICAL,
                      "[JOIN] FmJoinPhase2 failed, status %1!d!.\n",
                      Status);
        goto error_exit;
    }

#ifdef CLUSTER_TESTPOINT
    TESTPT(TpFailEvInitialize) {
        Status = 999999;
        goto error_exit;
    }
#endif

    //
    // Finish initializing the cluster wide event logging
    //
    // ASSUMPTION: this is called after the NM has established cluster
    // membership.
    //
    if (!CsNoRepEvtLogging) {
        Status = EvOnline();

        //if this fails, we still start the cluster service
        if ( Status != ERROR_SUCCESS ) {
            ClRtlLogPrint(LOG_CRITICAL,
                          "[JOIN] Error calling EvOnline, Status = %1!u!\n",
                          Status);
        }
    }

    return(ERROR_SUCCESS);

error_exit:

    ClRtlLogPrint(LOG_NOISE, "[INIT] Cleaning up failed join attempt.\n");

    ClusterLeave();

    return(Status);
}


BOOL
JoinpAddNodeCallback(
    IN PVOID Context1,
    IN PVOID Context2,
    IN PVOID Object,
    IN LPCWSTR Name
    )
/*++

Routine Description:

    Callback enumeration routine for adding a new node. This callback
    figures out what node IDs are available.

Arguments:

    Context1 - Supplies a pointer to an array of BOOLs. The node ID for
               the enumerated node is set to FALSE.

    Context2 - Not used.

    Object - A pointer to the node object.

    Name - The node name. Not used.

Return Value:

    TRUE

--*/
{
    PBOOL   Avail;
    DWORD   Id;

    Id = NmGetNodeId(Object);
    CL_ASSERT(NmIsValidNodeId(Id));

    // Mark this node's ID as taken in the caller's availability array.
    Avail = (PBOOL)Context1;

    Avail[Id] = FALSE;

    return(TRUE);
}