Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

601 lines
18 KiB

  1. /*++
  2. Copyright (c) 1996 Microsoft Corporation
  3. Module Name:
  4. join.c
  5. Abstract:
  6. GUM routines to implement the special join updates.
  7. Author:
  8. John Vert (jvert) 6/10/1996
  9. Revision History:
  10. --*/
  11. #include "gump.h"
  12. //
  13. // Define structure used to pass arguments to node enumeration callback
  14. //
  15. typedef struct _GUMP_JOIN_INFO {
  16. GUM_UPDATE_TYPE UpdateType;
  17. DWORD Status;
  18. DWORD Sequence;
  19. DWORD LockerNode;
  20. } GUMP_JOIN_INFO, *PGUMP_JOIN_INFO;
  21. //
  22. // Local function prototypes
  23. //
  24. BOOL
  25. GumpNodeCallback(
  26. IN PVOID Context1,
  27. IN PVOID Context2,
  28. IN PVOID Object,
  29. IN LPCWSTR Name
  30. );
  31. DWORD
  32. GumBeginJoinUpdate(
  33. IN GUM_UPDATE_TYPE UpdateType,
  34. OUT DWORD *Sequence
  35. )
  36. /*++
  37. Routine Description:
  38. Begins the special join update for a joining node. This
  39. function gets the current GUM sequence number for the
  40. specified update type from another node in the cluster.
  41. It also gets the list of nodes currently participating
  42. in the updates.
  43. Arguments:
  44. UpdateType - Supplies the GUM_UPDATE_TYPE.
  45. Sequence - Returns the sequence number that should be
  46. passed to GumEndJoinUpdate.
  47. Return Value:
  48. ERROR_SUCCESS if successful
  49. Win32 error code otherwise
  50. --*/
  51. {
  52. GUMP_JOIN_INFO JoinInfo;
  53. //
  54. // Enumerate the list of nodes. The callback routine will attempt
  55. // to obtain the required information from each node that is online.
  56. //
  57. JoinInfo.Status = ERROR_GEN_FAILURE;
  58. JoinInfo.UpdateType = UpdateType;
  59. OmEnumObjects(ObjectTypeNode,
  60. GumpNodeCallback,
  61. &JoinInfo,
  62. NULL);
  63. if (JoinInfo.Status == ERROR_SUCCESS) {
  64. ClRtlLogPrint(LOG_NOISE,
  65. "[GUM] GumBeginJoinUpdate succeeded with sequence %1!d! for type %2!u!\n",
  66. JoinInfo.Sequence,
  67. UpdateType);
  68. *Sequence = JoinInfo.Sequence;
  69. }
  70. return(JoinInfo.Status);
  71. }
  72. DWORD
  73. WINAPI
  74. GumEndJoinUpdate(
  75. IN DWORD Sequence,
  76. IN GUM_UPDATE_TYPE UpdateType,
  77. IN DWORD Context,
  78. IN DWORD BufferLength,
  79. IN PVOID Buffer
  80. )
  81. /*++
  82. Routine Description:
  83. Conditionally sends a join update to all active nodes in the
  84. cluster. If the clusterwise sequence number matches the supplied
  85. sequence number, all registered update handlers for the specified
  86. UpdateType are called on each node. Any registered update handlers
  87. for the current node will be called on the same thread. This is
  88. useful for correct synchronization of the data structures to be updated.
  89. As each node receives the join update, the sending node will be
  90. added to the list of nodes registered to receive any future updates
  91. of this type.
  92. The normal usage of this routine is as follows:
  93. joining node gets current sequence number from GumBeginJoinUpdate
  94. joining node gets current cluster state from another cluster node
  95. joining node issues GumEndJoinUpdate to add itself to every node's
  96. update list.
  97. If GumEndJoinUpdate fails, try again
  98. Arguments:
  99. Sequence - Supplies the sequence number obtained from GumGetCurrentSequence.
  100. UpdateType - Supplies the type of update. This determines which update handlers
  101. will be called
  102. Context - Supplies a DWORD of context to be passed to the
  103. GUM update handlers
  104. BufferLength - Supplies the length of the update buffer to be passed to the
  105. update handlers
  106. Buffer - Supplies a pointer to the update buffer to be passed to the update
  107. handlers.
  108. Return Value:
  109. ERROR_SUCCESS if the request is successful.
  110. Win32 error code on failure.
  111. --*/
  112. {
  113. DWORD Status=RPC_S_OK;
  114. DWORD i;
  115. PGUM_INFO GumInfo;
  116. DWORD MyNodeId;
  117. DWORD LockerNode=(DWORD)-1;
  118. DWORD dwGenerationNum; //the generation number at which the joiner gets the lock
  119. BOOL AssumeLockerWhistler;
  120. CL_ASSERT(UpdateType < GumUpdateMaximum);
  121. GumInfo = &GumTable[UpdateType];
  122. MyNodeId = NmGetNodeId(NmLocalNode);
  123. LockerNode = GumpLockerNode;
  124. //SS: bug can we be the locker node at this point in time?
  125. //CL_ASSERT(LockerNode != MyNodeId);
  126. //
  127. // Verify that the locker node allows us to finish the join update
  128. //
  129. if (LockerNode != MyNodeId)
  130. {
  131. ClRtlLogPrint(LOG_NOISE,
  132. "[GUM] GumEndJoinUpdate: attempting update\ttype %1!u! context %2!u! sequence %3!u!\n",
  133. UpdateType,
  134. Context,
  135. Sequence);
  136. //SS: what if the joiner node acquires the lock but dies after
  137. //will the remaining cluster continue to function ??
  138. //We need to make sure that node down events are generated
  139. //for this node as soon as the first gumbeginjoinupdate call
  140. //is made
  141. AssumeLockerWhistler = TRUE;
  142. RetryJoinUpdateForRollingUpgrade:
  143. NmStartRpc(LockerNode);
  144. if (AssumeLockerWhistler)
  145. {
  146. Status = GumAttemptJoinUpdate2(GumpRpcBindings[LockerNode],
  147. NmGetNodeId(NmLocalNode),
  148. UpdateType,
  149. Context,
  150. Sequence,
  151. BufferLength,
  152. Buffer,
  153. &dwGenerationNum);
  154. }
  155. else
  156. {
  157. Status = GumAttemptJoinUpdate(GumpRpcBindings[LockerNode],
  158. NmGetNodeId(NmLocalNode),
  159. UpdateType,
  160. Context,
  161. Sequence,
  162. BufferLength,
  163. Buffer);
  164. }
  165. NmEndRpc(LockerNode);
  166. if (Status == RPC_S_PROCNUM_OUT_OF_RANGE)
  167. {
  168. AssumeLockerWhistler = FALSE;
  169. goto RetryJoinUpdateForRollingUpgrade;
  170. }
  171. if (Status != ERROR_SUCCESS)
  172. {
  173. ClRtlLogPrint(LOG_UNUSUAL,
  174. "[GUM] Join attempt for type %1!d! failed %2!d!\n",
  175. UpdateType,
  176. Status);
  177. NmDumpRpcExtErrorInfo(Status);
  178. return(Status);
  179. }
  180. //if the locker node dies, should we retry with the locker node?
  181. //In this case, the locker node may be different
  182. //now from when GumBeginJoinUpdate() is called.
  183. //SS: we fail the join instead and just retry the whole process
  184. //instead of calling GumpCommFailure() to kill the locker here.
  185. // This way the existing cluster continues and the joining node
  186. // takes a hit which is probably a good thing
  187. }
  188. else
  189. {
  190. //SS: can we select ourselves as the locker while
  191. //we havent finished the join completely
  192. //SS: can others?
  193. //Is that valid
  194. Status = ERROR_REQUEST_ABORTED;
  195. return(Status);
  196. }
  197. // If the joining node dies after having acquired the lock,
  198. // then a node down event MUST be generated so that the GUM
  199. // lock can be released and the rest of the cluster can continue.
  200. //
  201. // Now Dispatch the update to all other nodes, except ourself.
  202. //
  203. for (i=LockerNode+1; i != LockerNode; i++)
  204. {
  205. if (i == (NmMaxNodeId + 1))
  206. {
  207. i=ClusterMinNodeId;
  208. if (i==LockerNode)
  209. {
  210. break;
  211. }
  212. }
  213. if (GumInfo->ActiveNode[i])
  214. {
  215. //skip yourself
  216. if (i != MyNodeId)
  217. {
  218. CL_ASSERT(GumpRpcBindings[i] != NULL);
  219. ClRtlLogPrint(LOG_NOISE,
  220. "[GUM] GumEndJoinUpdate: Dispatching seq %1!u!\ttype %2!u! context %3!u! to node %4!d!\n",
  221. Sequence,
  222. UpdateType,
  223. Context,
  224. i);
  225. NmStartRpc(i);
  226. Status = GumJoinUpdateNode(GumpRpcBindings[i],
  227. NmGetNodeId(NmLocalNode),
  228. UpdateType,
  229. Context,
  230. Sequence,
  231. BufferLength,
  232. Buffer);
  233. NmEndRpc(i);
  234. if (Status != ERROR_SUCCESS)
  235. {
  236. //we dont shoot that node, since we are the ones who is joining
  237. //However now its tables differ from the locker node's tables
  238. //Instead we will release the gum lock and abort
  239. //the join process. This joining node should then
  240. //be removed from the locker node's tables for update.
  241. //
  242. ClRtlLogPrint(LOG_NOISE,
  243. "[GUM] GumEndJoinUpdate: GumJoinUpdateNode failed \ttype %1!u! context %2!u! sequence %3!u!\n",
  244. UpdateType,
  245. Context,
  246. Sequence);
  247. NmDumpRpcExtErrorInfo(Status);
  248. break;
  249. }
  250. }
  251. }
  252. }
  253. CL_ASSERT(LockerNode != (DWORD)-1);
  254. if (Status != ERROR_SUCCESS)
  255. {
  256. goto EndJoinUnlock;
  257. }
  258. //
  259. // All nodes have been updated. Update our sequence and send the unlocking update.
  260. //
  261. GumTable[UpdateType].Joined = TRUE;
  262. GumpSequence = Sequence+1;
  263. EndJoinUnlock:
  264. //SS what if the locker node has died since then
  265. //we should make sure somebody unlocks and keeps the cluster going
  266. //Since we always try the unlock against the locker from whom we
  267. //got the lock, we will assume that the AssumeLockerWhistler is correctly
  268. //set now
  269. try {
  270. NmStartRpc(LockerNode);
  271. if (AssumeLockerWhistler)
  272. {
  273. GumUnlockUpdate2(GumpRpcBindings[LockerNode], UpdateType, Sequence,
  274. NmGetNodeId(NmLocalNode), dwGenerationNum);
  275. }
  276. else
  277. {
  278. GumUnlockUpdate(GumpRpcBindings[LockerNode], UpdateType, Sequence);
  279. }
  280. NmEndRpc(LockerNode);
  281. } except (I_RpcExceptionFilter(RpcExceptionCode())) {
  282. //
  283. // The locker node has crashed. Notify the NM, it will call our
  284. // notification routine to select a new locker node. Then retry
  285. // the unlock on the new locker node.
  286. // SS: changed to not retry unlocks..the new locker node will
  287. // unlock after propagating this change in any case.
  288. //
  289. NmEndRpc(LockerNode);
  290. Status = GetExceptionCode();
  291. ClRtlLogPrint(LOG_CRITICAL,
  292. "[GUM] GumEndJoinUpdate: Unlocking update to node %1!d! failed with %2!d!\n",
  293. LockerNode,
  294. Status);
  295. //instead of killing the locker node in the existing cluster which
  296. //we are trying to join, return a failure code which will abort the join
  297. //process. Since this is the locking node, when this node goes down the
  298. //new locker node should release the lock
  299. NmDumpRpcExtErrorInfo(Status);
  300. }
  301. ClRtlLogPrint(LOG_NOISE,
  302. "[GUM] GumEndJoinUpdate: completed update seq %1!u!\ttype %2!u! context %3!u!\n",
  303. Sequence,
  304. UpdateType,
  305. Context);
  306. return(Status);
  307. }
  308. BOOL
  309. GumpNodeCallback(
  310. IN PVOID Context1,
  311. IN PVOID Context2,
  312. IN PVOID Object,
  313. IN LPCWSTR Name
  314. )
  315. /*++
  316. Routine Description:
  317. Node enumeration callback routine for GumBeginJoinUpdate. For each
  318. node that is currently online, it attempts to connect and obtain
  319. the current GUM information (sequence and nodelist) for the specified
  320. update type.
  321. Arguments:
  322. Context1 - Supplies a pointer to the GUMP_JOIN_INFO structure.
  323. Context2 - not used
  324. Object - Supplies a pointer to the NM_NODE object
  325. Name - Supplies the node's name.
  326. Return Value:
  327. FALSE - if the information was successfully obtained and enumeration
  328. should stop.
  329. TRUE - If enumeration should continue.
  330. --*/
  331. {
  332. DWORD Status;
  333. DWORD Sequence;
  334. PGUMP_JOIN_INFO JoinInfo = (PGUMP_JOIN_INFO)Context1;
  335. PGUM_NODE_LIST NodeList = NULL;
  336. PNM_NODE Node = (PNM_NODE)Object;
  337. GUM_UPDATE_TYPE UpdateType;
  338. DWORD i;
  339. DWORD LockerNodeId;
  340. DWORD nodeId;
  341. if (NmGetNodeState(Node) != ClusterNodeUp &&
  342. NmGetNodeState(Node) != ClusterNodePaused){
  343. //
  344. // This node is not up, so don't try and get any
  345. // information from it.
  346. //
  347. return(TRUE);
  348. }
  349. //
  350. // Get the sequence and nodelist information from this node.
  351. //
  352. UpdateType = JoinInfo->UpdateType;
  353. if (UpdateType != GumUpdateTesting) {
  354. //
  355. // Our node should not be marked as ClusterNodeUp yet.
  356. //
  357. CL_ASSERT(Node != NmLocalNode);
  358. }
  359. nodeId = NmGetNodeId(Node);
  360. NmStartRpc(nodeId);
  361. Status = GumGetNodeSequence(GumpRpcBindings[NmGetNodeId(Node)],
  362. UpdateType,
  363. &Sequence,
  364. &LockerNodeId,
  365. &NodeList);
  366. NmEndRpc(nodeId);
  367. if (Status != ERROR_SUCCESS) {
  368. ClRtlLogPrint(LOG_UNUSUAL,
  369. "[GUM] GumGetNodeSequence from %1!ws! failed %2!d!\n",
  370. OmObjectId(Node),
  371. Status);
  372. NmDumpRpcExtErrorInfo(Status);
  373. return(TRUE);
  374. }
  375. JoinInfo->Status = ERROR_SUCCESS;
  376. JoinInfo->Sequence = Sequence;
  377. JoinInfo->LockerNode = LockerNodeId;
  378. //
  379. // Zero out all the nodes in the active node array.
  380. //
  381. ZeroMemory(&GumTable[UpdateType].ActiveNode,
  382. sizeof(GumTable[UpdateType].ActiveNode));
  383. //
  384. // Set all the nodes that are currently active in the
  385. // active node array.
  386. //
  387. for (i=0; i < NodeList->NodeCount; i++) {
  388. CL_ASSERT(NmIsValidNodeId(NodeList->NodeId[i]));
  389. ClRtlLogPrint(LOG_NOISE,
  390. "[GUM] GumpNodeCallback setting node %1!d! active.\n",
  391. NodeList->NodeId[i]);
  392. GumTable[UpdateType].ActiveNode[NodeList->NodeId[i]] = TRUE;;
  393. }
  394. MIDL_user_free(NodeList);
  395. //
  396. // Add in our own node.
  397. //
  398. GumTable[UpdateType].ActiveNode[NmGetNodeId(NmLocalNode)] = TRUE;
  399. //
  400. // Set the current locker node
  401. //
  402. GumpLockerNode = LockerNodeId;
  403. return(FALSE);
  404. }
  405. DWORD
  406. GumCreateRpcBindings(
  407. PNM_NODE Node
  408. )
  409. /*++
  410. Routine Description:
  411. Creates GUM's private RPC bindings for a joining node.
  412. Called by the Node Manager.
  413. Arguments:
  414. Node - A pointer to the node for which to create RPC bindings
  415. Return Value:
  416. A Win32 status code.
  417. --*/
  418. {
  419. DWORD Status;
  420. RPC_BINDING_HANDLE BindingHandle;
  421. CL_NODE_ID NodeId = NmGetNodeId(Node);
  422. ClRtlLogPrint(LOG_NOISE,
  423. "[GUM] Creating RPC bindings for node %1!u!.\n",
  424. NodeId
  425. );
  426. //
  427. // Main binding
  428. //
  429. if (GumpRpcBindings[NodeId] != NULL) {
  430. //
  431. // Reuse the old binding.
  432. //
  433. Status = ClMsgVerifyRpcBinding(GumpRpcBindings[NodeId]);
  434. if (Status != ERROR_SUCCESS) {
  435. ClRtlLogPrint(LOG_ERROR,
  436. "[GUM] Failed to verify 1st RPC binding for node %1!u!, status %2!u!.\n",
  437. NodeId,
  438. Status
  439. );
  440. return(Status);
  441. }
  442. }
  443. else {
  444. //
  445. // Create a new binding
  446. //
  447. Status = ClMsgCreateRpcBinding(
  448. Node,
  449. &(GumpRpcBindings[NodeId]),
  450. 0 );
  451. if (Status != ERROR_SUCCESS) {
  452. ClRtlLogPrint(LOG_ERROR,
  453. "[GUM] Failed to create 1st RPC binding for node %1!u!, status %2!u!.\n",
  454. NodeId,
  455. Status
  456. );
  457. return(Status);
  458. }
  459. }
  460. //
  461. // Replay binding
  462. //
  463. if (GumpReplayRpcBindings[NodeId] != NULL) {
  464. //
  465. // Reuse the old binding.
  466. //
  467. Status = ClMsgVerifyRpcBinding(GumpReplayRpcBindings[NodeId]);
  468. if (Status != ERROR_SUCCESS) {
  469. ClRtlLogPrint(LOG_ERROR,
  470. "[GUM] Failed to verify 2nd RPC binding for node %1!u!, status %2!u!.\n",
  471. NodeId,
  472. Status
  473. );
  474. return(Status);
  475. }
  476. }
  477. else {
  478. //
  479. // Create a new binding
  480. //
  481. Status = ClMsgCreateRpcBinding(
  482. Node,
  483. &(GumpReplayRpcBindings[NodeId]),
  484. 0 );
  485. if (Status != ERROR_SUCCESS) {
  486. ClRtlLogPrint(LOG_ERROR,
  487. "[GUM] Failed to create 2nd RPC binding for node %1!u!, status %2!u!.\n",
  488. NodeId,
  489. Status
  490. );
  491. return(Status);
  492. }
  493. }
  494. return(ERROR_SUCCESS);
  495. } // GumCreateRpcBindings