Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

6508 lines
264 KiB

  1. /*++
  2. Copyright(c) 1998,99 Microsoft Corporation
  3. Module Name:
  4. load.c
  5. Abstract:
  6. Windows Load Balancing Service (WLBS)
  7. Driver - load balancing algorithm
  8. Author:
  9. bbain
  10. ToDo:
  11. Kernel mode queue mgt
  12. Fail safe mode (single server for everything)
  13. --*/
  14. #ifdef KERNEL_MODE
  15. #include <ntddk.h>
  16. #include "log.h"
  17. #include "univ.h"
  18. #include "main.h" // added for multiple nic
  19. static ULONG log_module_id = LOG_MODULE_LOAD;
  20. #else
  21. #include <stdlib.h>
  22. #include <windows.h>
  23. #endif
  24. #include <stdio.h>
  25. #include "wlbsparm.h"
  26. #include "params.h"
  27. #include "wlbsiocl.h"
  28. #include "wlbsip.h"
  29. #include "load.h"
  30. #include "nlbwmi.h"
  31. //
  32. // For WPP Event Tracing
  33. //
  34. #include "trace.h" // for event tracing
  35. #include "load.tmh" // for event tracing
  36. #ifndef KERNEL_MODE
  37. #define UNIV_PRINT_INFO(msg) { \
  38. printf ("NLB (Information) [%s:%d] ", __FILE__, __LINE__); \
  39. printf msg; \
  40. printf ("\n"); \
  41. }
  42. #define UNIV_PRINT_CRIT(msg) { \
  43. printf ("NLB (Error) [%s:%d] ", __FILE__, __LINE__); \
  44. printf msg; \
  45. printf ("\n"); \
  46. }
  47. #if 0
  48. #define UNIV_PRINT_VERB(msg) { \
  49. printf ("NLB (Verbose) [%s:%d] ", __FILE__, __LINE__); \
  50. printf msg; \
  51. printf ("\n"); \
  52. }
  53. #else
  54. #define UNIV_PRINT_VERB(msg)
  55. #endif
  56. #define Univ_ulong_to_str(x, y, z) (y)
  57. #define LOG_MSG(c,s)
  58. #define LOG_MSG1(c,s,d1)
  59. #define LOG_MSG2(c,s,d1,d2)
  60. #else
  61. #endif
  62. #if defined (NLB_TCP_NOTIFICATION)
  63. GLOBAL_CONN_QUEUE g_conn_estabq[CVY_MAX_CHASH]; /* Global queue of all established connections across all NLB instances. */
  64. GLOBAL_CONN_QUEUE g_conn_pendingq[CVY_MAX_CHASH]; /* Global queue of pending connections that may or may not end up being
  65. established on a NIC to which NLB is bound. */
  66. HANDLE g_pending_conn_pool = NULL; /* Global fixed-size block pool of PENDING_ENTRYs. */
  67. #endif
  68. void Bin_state_print(PBIN_STATE bp, ULONG my_host_id);
  69. void Load_conn_kill(PLOAD_CTXT lp, PBIN_STATE bp);
  70. PBIN_STATE Load_pg_lookup(PLOAD_CTXT lp, ULONG svr_ipaddr, ULONG svr_port, BOOLEAN is_tcp);
  71. VOID Load_init_fsb(PLOAD_CTXT lp, PCONN_DESCR dp);
  72. VOID Load_init_dscr(PLOAD_CTXT lp, PCONN_ENTRY ep, BOOLEAN alloc);
  73. VOID Load_put_dscr(PLOAD_CTXT lp, PBIN_STATE bp, PCONN_ENTRY ep);
  74. #if 0 /* v2.06 */
  75. #define BIN_ALL_ONES ((MAP_T)-1) /* bin map state for 64 ones (v2.04) */
  76. #endif
  77. #define BIN_ALL_ONES ((MAP_T)(0xFFFFFFFFFFFFFFF)) /* bin map state for 60 ones (v2.04) */
  78. /* Byte offset of a field in a structure of the specified type: */
  79. #define CVY_FIELD_OFFSET(type, field) ((LONG_PTR)&(((type *)0)->field))
  80. /*
  81. * Address of the base of the structure given its type, field name, and the
  82. * address of a field or field offset within the structure:
  83. */
  84. #define STRUCT_PTR(address, type, field) ((type *)( \
  85. (PCHAR)(address) - \
  86. (PCHAR)CVY_FIELD_OFFSET(type, field)))
  87. #if defined (NLB_TCP_NOTIFICATION)
  88. /* Mark code that is used only during initialization. */
  89. #pragma alloc_text (INIT, LoadEntry)
  90. /*
  91. * Function: LoadEntry
  92. * Description: This function is called from DriverEntry to allow the load module to perform
  93. * any one-time intialization of global data.
  94. * Parameters: None.
  95. * Returns: Nothing.
  96. * Author: shouse, 4.21.02
  97. * Notes:
  98. */
  99. VOID LoadEntry ()
  100. {
  101. INT index;
  102. /* Initialize the global connection queues. */
  103. for (index = 0; index < CVY_MAX_CHASH; index++)
  104. {
  105. /* Allocate the spin lock to protect the queue. */
  106. NdisAllocateSpinLock(&g_conn_pendingq[index].lock);
  107. /* Initialize the queue head. */
  108. Queue_init(&g_conn_pendingq[index].queue);
  109. /* Allocate the spin lock to protect the queue. */
  110. NdisAllocateSpinLock(&g_conn_estabq[index].lock);
  111. /* Initialize the queue head. */
  112. Queue_init(&g_conn_estabq[index].queue);
  113. }
  114. /* Allocate a fixed-size block pool for pending connection entries. */
  115. g_pending_conn_pool = NdisCreateBlockPool(sizeof(PENDING_ENTRY), 0, 'pBLN', NULL);
  116. if (g_pending_conn_pool == NULL)
  117. {
  118. UNIV_PRINT_CRIT(("LoadEntry: Error creating fixed-size block pool"));
  119. TRACE_CRIT("%!FUNC! Error creating fixed-size block pool");
  120. }
  121. }
  122. /*
  123. * Function: LoadUnload
  124. * Description: This function is called from Init_unload to allow the load module to perform
  125. * any last minute tear-down of global data.
  126. * Parameters: None.
  127. * Returns: Nothing.
  128. * Author: shouse, 4.21.02
  129. * Notes: By the time this function is called, we are guaranteed to have de-registered
  130. * our TCP callback function, if it was indeed registered. Because ExUnregisterCallback
  131. * guarantees that it will not return until all pending ExNotifyCallback routines
  132. * have completed, we can be sure that by the time we get here, there will certainly
  133. * not be anybody accessing any of the global connection queues or FSB pools.
  134. */
  135. VOID LoadUnload ()
  136. {
  137. INT index;
  138. /* Destroy the fixed-size block pool and all descriptors therein.
  139. Note that NdisDestroyBlockPool expects all allocated blocks
  140. have been returned to the pool (freed) before it is called. */
  141. if (g_pending_conn_pool != NULL)
  142. {
  143. /* Loop through all of the connection descriptor queues and
  144. free all of the descriptors we've allocated. */
  145. for (index = 0; index < CVY_MAX_CHASH; index++)
  146. {
  147. PPENDING_ENTRY pp = NULL;
  148. NdisAcquireSpinLock(&g_conn_pendingq[index].lock);
  149. /* Dequeue the head of the queue. */
  150. pp = (PPENDING_ENTRY)Queue_deq(&g_conn_pendingq[index].queue);
  151. while (pp != NULL)
  152. {
  153. UNIV_ASSERT(pp->code == CVY_PENDINGCODE);
  154. /* Free the descriptor back to the fixed-size block pool. */
  155. NdisFreeToBlockPool((PUCHAR)pp);
  156. /* Get the next descriptor in the queue. */
  157. pp = (PPENDING_ENTRY)Queue_deq(&g_conn_pendingq[index].queue);
  158. }
  159. NdisReleaseSpinLock(&g_conn_pendingq[index].lock);
  160. }
  161. /* Destroy the fixed-size block pool. */
  162. NdisDestroyBlockPool(g_pending_conn_pool);
  163. }
  164. /* De-initialize the global connection queues. */
  165. for (index = 0; index < CVY_MAX_CHASH; index++)
  166. {
  167. /* Free the spin locks. */
  168. NdisFreeSpinLock(&g_conn_estabq[index].lock);
  169. NdisFreeSpinLock(&g_conn_pendingq[index].lock);
  170. }
  171. }
  172. #endif
  173. /*
  174. * Function: Load_teaming_consistency_notify
  175. * Description: This function is called to notify a team in which this adapter
  176. * might be participating whether the teaming configuration in the
  177. * heartbeats is consistent or not. Inconsistent configuration
  178. * results in the entire team being marked inactive - meaning that
  179. * no adapter in the team will handle any traffic, except to the DIP.
  180. * Parameters: member - a pointer to the team membership information for this adapter.
  181. * consistent - a boolean indicating the polarity of teaming consistency.
  182. * Returns: Nothing.
  183. * Author: shouse, 3.29.01
  184. * Notes: In order to check to see whether or not this adapter is part of a team,
  185. * we need to look into the team member information for this adapter. This
  186. * access should be locked, but for performance reasons, we will only lock
  187. * and check for sure if we "think" we're part of a team. Worst case is that
  188. * we are in the process of joining a team and we missed this check - no
  189. * matter, we'll notify them when/if we see this again.
  190. */
  191. VOID Load_teaming_consistency_notify (IN PBDA_MEMBER member, IN BOOL consistent) {
  192. /* Make sure that the membership information points to something. */
  193. UNIV_ASSERT(member);
  194. /* We can check without locking to keep the common case minimally expensive. If we do think
  195. we're part of a team, then we'll grab the lock and make sure. If our first indication is
  196. that we're not part of a team, then just bail out and if we actually are part of a team,
  197. we'll be through here again later to notify our team if necessary. */
  198. if (!member->active)
  199. return;
  200. NdisAcquireSpinLock(&univ_bda_teaming_lock);
  201. /* If we are an active member of a BDA team, then notify our team of our state. */
  202. if (member->active) {
  203. /* Assert that the team actually points to something. */
  204. UNIV_ASSERT(member->bda_team);
  205. /* Assert that the member ID is valid. */
  206. UNIV_ASSERT(member->member_id <= CVY_BDA_MAXIMUM_MEMBER_ID);
  207. if (consistent) {
  208. UNIV_PRINT_VERB(("Load_teaming_consistency_notify: Consistent configuration detected."));
  209. TRACE_VERB("%!FUNC! we are a consistent active member of a BDA team");
  210. /* Mark this member as consistent. */
  211. member->bda_team->consistency_map |= (1 << member->member_id);
  212. } else {
  213. UNIV_PRINT_VERB(("Load_teaming_consistency_notify: Inconsistent configuration detected."));
  214. TRACE_VERB("%!FUNC! we are an inconsistent active member of a BDA team");
  215. /* Mark this member as inconsistent. */
  216. member->bda_team->consistency_map &= ~(1 << member->member_id);
  217. /* Inactivate the team. */
  218. member->bda_team->active = FALSE;
  219. }
  220. }
  221. NdisReleaseSpinLock(&univ_bda_teaming_lock);
  222. }
  223. /*
  224. * Function: Load_teaming_consistency_check
  225. * Description: This function is used to check our teaming configuration against the
  226. * teaming configuration received in a remote heartbeat. It does little
  227. * more than check the equality of two DWORDS, however, if this is our
  228. * first notification of bad configuration, it prints a few debug state-
  229. * ments as well.
  230. * Parameters: bAlreadyKnown - a boolean indication of whether or not we have already detected bad configuration.
  231. * If the misconfiguration is already known, no additional logging is done.
  232. * member - a pointer to the team member structure for this adapter.
  233. * myConfig - a DWORD containing the teaming "code" for me.
234. * theirConfig - a DWORD containing the teaming "code" received in the heartbeat from them.
  235. * Returns: BOOLEAN (as ULONG) - TRUE means the configuration is consistent, FALSE indicates that it is not.
  236. * Author: shouse, 3.29.01
  237. * Notes: In order to check to see whether or not this adapter is part of a team,
  238. * we need to look into the team member information for this adapter. This
  239. * access should be locked, but for performance reasons, we will only lock
  240. * and check for sure if we "think" we're part of a team. Worst case is that
  241. * we are in the process of joining a team and we missed this check - no
  242. * matter, we'll check again on the next heartbeat.
  243. */
ULONG Load_teaming_consistency_check (IN BOOLEAN bAlreadyKnown, IN PBDA_MEMBER member, IN ULONG myConfig, IN ULONG theirConfig, IN ULONG version) {
    /* Unlocked fast-path check: if we don't currently appear to be an active
       team member, teaming consistency is trivially satisfied.  If we are in
       the middle of joining a team, the next heartbeat re-runs this check. */
    if (!member->active)
        return TRUE;
    NdisAcquireSpinLock(&univ_bda_teaming_lock);
    /* Re-check membership under the lock. */
    if (member->active) {
        /* NOTE(review): the lock is dropped here, before the comparisons below -
           the remainder of this path reads only the function arguments, not the
           shared member/team state, so it does not need the lock. */
        NdisReleaseSpinLock(&univ_bda_teaming_lock);
        /* If the heartbeat is an NT4.0 or Win2k heartbeat, then we can't trust the teaming
           ULONG in the heartbeat, which would contain some random garbage. In this case,
           we know that we're teaming but the peer does not support it, so we bail out and
           report an error.  Logging happens only on the first detection (bAlreadyKnown). */
        if (version < CVY_VERSION_FULL) {
            if (!bAlreadyKnown) {
                UNIV_PRINT_CRIT(("Load_teaming_consistency_check: Bad teaming configuration detected: NT4.0/Win2k host in a teaming cluster"));
                TRACE_CRIT("%!FUNC! Bad teaming configuration detected: NT4.0/Win2k host in a teaming cluster");
            }
            return FALSE;
        }
        /* If the bi-directional affinity teaming configurations don't match, do something about it.
           The code words are compared as a whole; the field-by-field masks below are purely
           diagnostic, to log WHICH part of the configuration disagrees. */
        if (myConfig != theirConfig) {
            if (!bAlreadyKnown) {
                UNIV_PRINT_CRIT(("Load_teaming_consistency_check: Bad teaming configuration detected: Mine=0x%08x, Theirs=0x%08x", myConfig, theirConfig));
                TRACE_CRIT("%!FUNC! Bad teaming configuration detected: Mine=0x%08x, Theirs=0x%08x", myConfig, theirConfig);
                /* Report whether or not the teaming active flags are consistent. */
                if ((myConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK)) {
                    UNIV_PRINT_VERB(("Load_teaming_consistency_check: Teaming active flags do not match: Mine=%d, Theirs=%d",
                                     (myConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK) >> CVY_BDA_TEAMING_CODE_ACTIVE_OFFSET,
                                     (theirConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK) >> CVY_BDA_TEAMING_CODE_ACTIVE_OFFSET));
                    TRACE_VERB("%!FUNC! Teaming active flags do not match: Mine=%d, Theirs=%d",
                               (myConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK) >> CVY_BDA_TEAMING_CODE_ACTIVE_OFFSET,
                               (theirConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK) >> CVY_BDA_TEAMING_CODE_ACTIVE_OFFSET);
                }
                /* Report whether or not the master flags are consistent. */
                if ((myConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK)) {
                    UNIV_PRINT_VERB(("Load_teaming_consistency_check: Master/slave settings do not match: Mine=%d, Theirs=%d",
                                     (myConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK) >> CVY_BDA_TEAMING_CODE_MASTER_OFFSET,
                                     (theirConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK) >> CVY_BDA_TEAMING_CODE_MASTER_OFFSET));
                    TRACE_VERB("%!FUNC! Master/slave settings do not match: Mine=%d, Theirs=%d",
                               (myConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK) >> CVY_BDA_TEAMING_CODE_MASTER_OFFSET,
                               (theirConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK) >> CVY_BDA_TEAMING_CODE_MASTER_OFFSET);
                }
                /* Report whether or not the reverse hashing flags are consistent. */
                if ((myConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK)) {
                    UNIV_PRINT_VERB(("Load_teaming_consistency_check: Reverse hashing flags do not match: Mine=%d, Theirs=%d",
                                     (myConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK) >> CVY_BDA_TEAMING_CODE_HASHING_OFFSET,
                                     (theirConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK) >> CVY_BDA_TEAMING_CODE_HASHING_OFFSET));
                    TRACE_VERB("%!FUNC! Reverse hashing flags do not match: Mine=%d, Theirs=%d",
                               (myConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK) >> CVY_BDA_TEAMING_CODE_HASHING_OFFSET,
                               (theirConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK) >> CVY_BDA_TEAMING_CODE_HASHING_OFFSET);
                }
                /* Report whether or not the number of team members is consistent. */
                if ((myConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK)) {
                    UNIV_PRINT_VERB(("Load_teaming_consistency_check: Numbers of team members do not match: Mine=%d, Theirs=%d",
                                     (myConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_NUM_MEMBERS_OFFSET,
                                     (theirConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_NUM_MEMBERS_OFFSET));
                    TRACE_VERB("%!FUNC! Numbers of team members do not match: Mine=%d, Theirs=%d",
                               (myConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_NUM_MEMBERS_OFFSET,
                               (theirConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_NUM_MEMBERS_OFFSET);
                }
                /* Report whether or not the team membership lists are consistent. */
                if ((myConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK)) {
                    UNIV_PRINT_VERB(("Load_teaming_consistency_check: Participating members lists do not match: Mine=0x%04x, Theirs=0x%04x",
                                     (myConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_MEMBERS_OFFSET,
                                     (theirConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_MEMBERS_OFFSET));
                    TRACE_VERB("%!FUNC! Participating members lists do not match: Mine=0x%04x, Theirs=0x%04x",
                               (myConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_MEMBERS_OFFSET,
                               (theirConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_MEMBERS_OFFSET);
                }
            }
            return FALSE;
        }
        /* Configurations match. */
        return TRUE;
    }
    /* We stopped being an active team member between the unlocked check and
       acquiring the lock - nothing to compare. */
    NdisReleaseSpinLock(&univ_bda_teaming_lock);
    return TRUE;
}
  324. /*
  325. * Function: Load_teaming_code_create
  326. * Description: This function pieces together the ULONG code that represents the configuration
  327. * of bi-directional affinity teaming on this adapter. If the adapter is not part
  328. * of a team, then the code is zero.
  329. * Parameters: code - a pointer to a ULONG that will receive the 32-bit code word.
  330. * member - a pointer to the team member structure for this adapter.
  331. * Returns: Nothing.
  332. * Author: shouse, 3.29.01
  333. * Notes: In order to check to see whether or not this adapter is part of a team,
  334. * we need to look into the team member information for this adapter. This
  335. * access should be locked, but for performance reasons, we will only lock
  336. * and check for sure if we "think" we're part of a team. Worst case is that
  337. * we are in the process of joining a team and we missed this check - no
338. * matter, we'll be through here the next time we send a heartbeat anyway.
  339. */
  340. VOID Load_teaming_code_create (OUT PULONG code, IN PBDA_MEMBER member) {
  341. /* Assert that the code actually points to something. */
  342. UNIV_ASSERT(code);
  343. /* Assert that the membership information actually points to something. */
  344. UNIV_ASSERT(member);
  345. /* Reset the code. */
  346. *code = 0;
  347. /* We can check without locking to keep the common case minimally expensive. If we do think
  348. we're part of a team, then we'll grab the lock and make sure. If our first indication is
  349. that we're not part of a team, then just bail out and if we actually are part of a team,
  350. we'll be through here again later to generate the code next time we send a heartbeat. */
  351. if (!member->active)
  352. return;
  353. NdisAcquireSpinLock(&univ_bda_teaming_lock);
  354. /* If we are in a team, fill in the team configuration information. */
  355. if (member->active) {
  356. /* Assert that the team actually points to something. */
  357. UNIV_ASSERT(member->bda_team);
  358. /* Add configuration information for teaming at each timeout. */
  359. CVY_BDA_TEAMING_CODE_CREATE(*code,
  360. member->active,
  361. member->master,
  362. member->reverse_hash,
  363. member->bda_team->membership_count,
  364. member->bda_team->membership_fingerprint);
  365. }
  366. NdisReleaseSpinLock(&univ_bda_teaming_lock);
  367. }
  368. /*
  369. * Function: Load_add_reference
  370. * Description: This function adds a reference to the load module of a given adapter.
  371. * Parameters: pLoad - a pointer to the load module to reference.
  372. * Returns: ULONG - The incremented value.
  373. * Author: shouse, 3.29.01
  374. * Notes:
  375. */
  376. ULONG Load_add_reference (IN PLOAD_CTXT pLoad) {
  377. /* Assert that the load pointer actually points to something. */
  378. UNIV_ASSERT(pLoad);
  379. /* Increment the reference count. */
  380. return NdisInterlockedIncrement(&pLoad->ref_count);
  381. }
  382. /*
  383. * Function: Load_release_reference
  384. * Description: This function releases a reference on the load module of a given adapter.
  385. * Parameters: pLoad - a pointer to the load module to dereference.
  386. * Returns: ULONG - The decremented value.
  387. * Author: shouse, 3.29.01
  388. * Notes:
  389. */
  390. ULONG Load_release_reference (IN PLOAD_CTXT pLoad) {
  391. /* Assert that the load pointer actually points to something. */
  392. UNIV_ASSERT(pLoad);
  393. /* Decrement the reference count. */
  394. return NdisInterlockedDecrement(&pLoad->ref_count);
  395. }
  396. /*
  397. * Function: Load_get_reference_count
  398. * Description: This function returns the current load module reference count on a given adapter.
  399. * Parameters: pLoad - a pointer to the load module to check.
  400. * Returns: ULONG - The current reference count.
  401. * Author: shouse, 3.29.01
  402. * Notes:
  403. */
  404. ULONG Load_get_reference_count (IN PLOAD_CTXT pLoad) {
  405. /* Assert that the load pointer actually points to something. */
  406. UNIV_ASSERT(pLoad);
  407. /* Return the reference count. */
  408. return pLoad->ref_count;
  409. }
  410. /* Hash routine is based on a public-domain Tiny Encryption Algorithm (TEA) by
  411. David Wheeler and Roger Needham at the Computer Laboratory of Cambridge
  412. University. For reference, please consult
  413. http://vader.brad.ac.uk/tea/tea.shtml */
  414. ULONG Map (
  415. ULONG v1,
  416. ULONG v2) /* v2.06: removed range parameter */
  417. {
  418. ULONG y = v1,
  419. z = v2,
  420. sum = 0;
  421. const ULONG a = 0x67; //key [0];
  422. const ULONG b = 0xdf; //key [1];
  423. const ULONG c = 0x40; //key [2];
  424. const ULONG d = 0xd3; //key [3];
  425. const ULONG delta = 0x9E3779B9;
  426. //
  427. // Unroll the loop to improve performance
  428. //
  429. sum += delta;
  430. y += (z << 4) + a ^ z + sum ^ (z >> 5) + b;
  431. z += (y << 4) + c ^ y + sum ^ (y >> 5) + d;
  432. sum += delta;
  433. y += (z << 4) + a ^ z + sum ^ (z >> 5) + b;
  434. z += (y << 4) + c ^ y + sum ^ (y >> 5) + d;
  435. sum += delta;
  436. y += (z << 4) + a ^ z + sum ^ (z >> 5) + b;
  437. z += (y << 4) + c ^ y + sum ^ (y >> 5) + d;
  438. sum += delta;
  439. y += (z << 4) + a ^ z + sum ^ (z >> 5) + b;
  440. z += (y << 4) + c ^ y + sum ^ (y >> 5) + d;
  441. sum += delta;
  442. y += (z << 4) + a ^ z + sum ^ (z >> 5) + b;
  443. z += (y << 4) + c ^ y + sum ^ (y >> 5) + d;
  444. sum += delta;
  445. y += (z << 4) + a ^ z + sum ^ (z >> 5) + b;
  446. z += (y << 4) + c ^ y + sum ^ (y >> 5) + d;
  447. sum += delta;
  448. y += (z << 4) + a ^ z + sum ^ (z >> 5) + b;
  449. z += (y << 4) + c ^ y + sum ^ (y >> 5) + d;
  450. sum += delta;
  451. y += (z << 4) + a ^ z + sum ^ (z >> 5) + b;
  452. z += (y << 4) + c ^ y + sum ^ (y >> 5) + d;
  453. return y ^ z;
  454. } /* end Map */
  455. /*
  456. * Function: Load_simple_hash
  457. * Description: This function is a simple hash based on the IP 4-tuple used to locate
  458. * state for the connection. That is, this hash is used to determine the
  459. * queue index in which this connection should store, and can later find,
  460. * its state.
  461. * Parameters: svr_ipaddr - the server IP address in network byte order
  462. * svr_port - the server port in host byte order
  463. * client_ipaddr - the client IP address in network byte order
  464. * client_port - the client port in host byte order
  465. * Returns: ULONG - the result of the hash.
  466. * Author: shouse, 4.15.02
  467. * Notes:
  468. */
  469. ULONG Load_simple_hash (
  470. ULONG svr_ipaddr,
  471. ULONG svr_port,
  472. ULONG client_ipaddr,
  473. ULONG client_port)
  474. {
  475. return (ULONG)(svr_ipaddr + client_ipaddr + (svr_port << 16) + (client_port << 0));
  476. }
  477. /*
  478. * Function: Load_complex_hash
  479. * Description: This is the conventional NLB hashing algorithm, which ends up invoking a
  480. * light-weight encryption algorithm to calculate a hash that is ultimately
  481. * used to map this connection to a bin, or "bucket". If reverse hashing
  482. * is set, then server side parameters are used instead of client side. If
  483. * limiting is set, then client and server side paramters should NOT be mixed
  484. * when hashing; i.e. use ONLY server OR client, depending on reverse hashing.
  485. * Parameters: svr_ipaddr - the server IP address in network byte order
  486. * svr_port - the server port in host byte order
  487. * client_ipaddr - the client IP address in network byte order
  488. * client_port - the client port in host byte order
  489. * affinity - the client affinity (None, Single or Class C)
  490. * reverse_hash - whether or not to reverse client and server during hashing
  491. * limit_map_fn - whether or not to include server-side parameters in hashing
  492. * Returns: ULONG - the result of the hash.
  493. * Author: shouse, 4.15.02
  494. * Notes:
  495. */
  496. ULONG Load_complex_hash (
  497. ULONG svr_ipaddr,
  498. ULONG svr_port,
  499. ULONG client_ipaddr,
  500. ULONG client_port,
  501. ULONG affinity,
  502. ULONG reverse_hash,
  503. ULONG limit_map_fn)
  504. {
  505. /* If we're not reverse-hashing, this is our conventional hash using primarily
  506. the client information. If the map limit flag is set, then we are sure NOT
  507. to use ANY server-side information in the hash. This is most common in BDA. */
  508. if (!reverse_hash)
  509. {
  510. if (!limit_map_fn)
  511. {
  512. if (affinity == CVY_AFFINITY_NONE)
  513. return Map(client_ipaddr, ((svr_port << 16) + client_port));
  514. else if (affinity == CVY_AFFINITY_SINGLE)
  515. return Map(client_ipaddr, svr_ipaddr);
  516. else
  517. return Map(client_ipaddr & TCPIP_CLASSC_MASK, svr_ipaddr);
  518. }
  519. else
  520. {
  521. if (affinity == CVY_AFFINITY_NONE)
  522. return Map(client_ipaddr, client_port);
  523. else if (affinity == CVY_AFFINITY_SINGLE)
  524. return Map(client_ipaddr, MAP_FN_PARAMETER);
  525. else
  526. return Map(client_ipaddr & TCPIP_CLASSC_MASK, MAP_FN_PARAMETER);
  527. }
  528. }
  529. /* Otherwise, reverse the client and server information as we hash. Again, if
  530. the map limit flag is set, use NO client-side information in the hash. */
  531. else
  532. {
  533. if (!limit_map_fn)
  534. {
  535. if (affinity == CVY_AFFINITY_NONE)
  536. return Map(svr_ipaddr, ((client_port << 16) + svr_port));
  537. else if (affinity == CVY_AFFINITY_SINGLE)
  538. return Map(svr_ipaddr, client_ipaddr);
  539. else
  540. return Map(svr_ipaddr & TCPIP_CLASSC_MASK, client_ipaddr);
  541. }
  542. else
  543. {
  544. if (affinity == CVY_AFFINITY_NONE)
  545. return Map(svr_ipaddr, svr_port);
  546. else if (affinity == CVY_AFFINITY_SINGLE)
  547. return Map(svr_ipaddr, MAP_FN_PARAMETER);
  548. else
  549. return Map(svr_ipaddr & TCPIP_CLASSC_MASK, MAP_FN_PARAMETER);
  550. }
  551. }
  552. }
  553. BOOLEAN Bin_targ_map_get(
  554. PLOAD_CTXT lp,
  555. PBIN_STATE binp, /* ptr. to bin state */
  556. ULONG my_host_id,
  557. PMAP_T pmap) /* ptr. to target map */
  558. /*
  559. Get target map for this host
  560. returns BOOLEAN:
  561. TRUE => valid target map is returned via pmap
  562. FALSE => error occurred; no target map returned
  563. */
  564. {
  565. ULONG remsz, /* remainder size */
  566. loadsz, /* size of a load partition */
  567. first_bit; /* first bit position of load partition */
  568. MAP_T targ_map; /* bit map of load bins for this host */
  569. ULONG tot_load = 0; /* total of load perecentages */
  570. ULONG * pload_list; /* ptr. to list of load balance perecntages */
  571. WCHAR num [20];
  572. PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
  573. pload_list = binp->load_amt;
  574. if (binp->mode == CVY_SINGLE)
  575. {
  576. ULONG max_pri; /* highest priority */
  577. ULONG i;
  578. first_bit = 0;
  579. /* compute max priority */
  580. max_pri = CVY_MAX_HOSTS + 1;
  581. for (i=0; i<CVY_MAX_HOSTS; i++)
  582. {
  583. tot_load += pload_list[i]; /* v2.1 */
  584. if (pload_list[i] != 0)
  585. {
  586. //
  587. // If another host has the same priority as this host, do not converge
  588. //
  589. if (i!= my_host_id && pload_list[i] == pload_list[my_host_id])
  590. {
  591. if (!(lp->dup_sspri))
  592. {
  593. UNIV_PRINT_CRIT(("Bin_targ_map_get: Host %d: Duplicate single svr priorities detected", my_host_id));
  594. TRACE_CRIT("%!FUNC! Host %d: Duplicate single svr priorities detected", my_host_id);
  595. Univ_ulong_to_str (pload_list[my_host_id], num, 10);
  596. LOG_MSG(MSG_ERROR_SINGLE_DUP, num);
  597. lp->dup_sspri = TRUE;
  598. }
  599. /* 1.03: return error, which inhibits convergence; note that
  600. rule will be automatically reinstated when duplicate server
  601. priorities are eliminated */
  602. return FALSE;
  603. }
  604. if ( pload_list[i] <= max_pri )
  605. {
  606. max_pri = pload_list[i];
  607. }
  608. }
  609. }
  610. binp->tot_load = tot_load; /* v2.1 */
  611. /* now determine if we are the highest priority host */
  612. if (pload_list[my_host_id] == max_pri)
  613. {
  614. loadsz = CVY_MAXBINS;
  615. targ_map = BIN_ALL_ONES; /* v2.05 */
  616. }
  617. else
  618. {
  619. loadsz = 0;
  620. targ_map = 0; /* v2.05 */
  621. }
  622. }
  623. else /* load balanced */
  624. {
  625. ULONG i, j;
  626. ULONG partsz[CVY_MAX_HOSTS+1];
  627. /* new partition size per host */
  628. ULONG cur_partsz[CVY_MAX_HOSTS+1];
  629. /* current partition size per host (v2.05) */
  630. ULONG cur_host[CVY_MAXBINS];
  631. /* current host for each bin (v2.05) */
  632. ULONG tot_partsz; /* sum of partition sizes */
  633. ULONG donor; /* current donor host (v2.05) */
  634. ULONG cur_nbins; /* current # bins (v2.05) */
  635. /* setup current partition sizes and bin to host mapping from current map (v2.05) */
  636. cur_nbins = 0;
  637. for (j=0; j<CVY_MAXBINS; j++)
  638. cur_host[j] = CVY_MAX_HOSTS; /* all bins are initially orphans */
  639. for (i=0; i<CVY_MAX_HOSTS; i++)
  640. {
  641. ULONG count = 0L;
  642. MAP_T cmap = binp->cur_map[i];
  643. tot_load += pload_list[i]; /* folded into this loop v2.1 */
  644. for (j=0; j<CVY_MAXBINS && cmap != ((MAP_T)0); j++)
  645. {
  646. /* if host i has bin j and it's not a duplicate, set up the mapping */
  647. if ((cmap & ((MAP_T)0x1)) != ((MAP_T)0) && cur_host[j] == CVY_MAX_HOSTS)
  648. {
  649. count++;
  650. cur_host[j] = i;
  651. }
  652. cmap >>= 1;
  653. }
  654. cur_partsz[i] = count;
  655. cur_nbins += count;
  656. }
  657. if (cur_nbins > CVY_MAXBINS)
  658. {
  659. UNIV_PRINT_CRIT(("Bin_targ_map_get: Error - too many bins found"));
  660. TRACE_CRIT("%!FUNC! Error - too many bins found");
  661. LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
  662. cur_nbins = CVY_MAXBINS;
  663. }
  664. /* if there are orphan bins, give them to pseudo-host CVY_MAX_HOSTS for now (v2.05) */
  665. if (cur_nbins < CVY_MAXBINS)
  666. cur_partsz[CVY_MAX_HOSTS] = CVY_MAXBINS - cur_nbins;
  667. else
  668. cur_partsz[CVY_MAX_HOSTS] = 0;
  669. /* compute total load */
  670. binp->tot_load = tot_load; /* v2.06 */
  671. /* now compute tentative partition sizes and remainder after initially
  672. dividing up partitions among hosts */
  673. tot_partsz = 0;
  674. first_bit = 0;
  675. for (i=0; i<CVY_MAX_HOSTS; i++)
  676. {
  677. if (tot_load > 0)
  678. partsz[i] = CVY_MAXBINS * pload_list[i] / tot_load;
  679. else
  680. partsz[i] = 0;
  681. tot_partsz += partsz[i];
  682. }
  683. remsz = CVY_MAXBINS - tot_partsz;
  684. /* check for zero total load */
  685. if (tot_partsz == 0)
  686. {
  687. * pmap = 0;
  688. return TRUE;
  689. }
  690. /* first dole out remainder bits to hosts that currently have bins (this
  691. minimizes the number of bins that have to move) v2.05 */
  692. if (remsz > 0)
  693. {
  694. for (i=0; i<CVY_MAX_HOSTS && remsz > 0; i++)
  695. if (cur_partsz[i] > 0 && pload_list[i] > 0)
  696. {
  697. partsz[i]++;
  698. remsz--;
  699. }
  700. }
  701. /* now dole out remainder bits to hosts that currently have no bins (to maintain
  702. the target load balance) v2.05 */
  703. if (remsz > 0)
  704. {
  705. for (i=0; i<CVY_MAX_HOSTS && remsz > 0; i++)
  706. if (cur_partsz[i] == 0 && pload_list[i] > 0)
  707. {
  708. partsz[i]++;
  709. remsz--;
  710. }
  711. }
  712. /* We MUST be out of bins by now. */
  713. UNIV_ASSERT(remsz == 0);
  714. if (remsz != 0)
  715. {
  716. UNIV_PRINT_CRIT(("Bin_targ_map_get: Bins left over (%u) after handing out to all hosts with and without bins!", remsz));
  717. TRACE_CRIT("%!FUNC! Bins left over (%u) after handing out to all hosts with and without bins!", remsz);
  718. }
  719. /* reallocate bins to target hosts to match new partition sizes (v2.05) */
  720. donor = 0;
  721. partsz[CVY_MAX_HOSTS] = 0; /* pseudo-host needs no bins */
  722. for (i=0; i<CVY_MAX_HOSTS; i++)
  723. {
  724. ULONG rcvrsz; /* current receiver's target partition */
  725. ULONG donorsz; /* current donor's target partition size */
  726. /* find and give this host some bins */
  727. rcvrsz = partsz[i];
  728. while (rcvrsz > cur_partsz[i])
  729. {
  730. /* find a host with too many bins */
  731. for (; donor < CVY_MAX_HOSTS; donor++)
  732. if (partsz[donor] < cur_partsz[donor])
  733. break;
  734. /* if donor is pseudo-host and it's out of bins, give it more bins
  735. to keep algorithm from looping; this should never happen */
  736. if (donor >= CVY_MAX_HOSTS && cur_partsz[donor] == 0)
  737. {
  738. UNIV_PRINT_CRIT(("Bin_targ_map_get: Error - no donor bins"));
  739. TRACE_CRIT("%!FUNC! Error - no donor bins");
  740. LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
  741. cur_partsz[donor] = CVY_MAXBINS;
  742. }
  743. /* now find the donor's bins and give them to the target host */
  744. donorsz = partsz[donor]; /* donor's target bin count */
  745. for (j=0; j<CVY_MAXBINS; j++)
  746. {
  747. if (cur_host[j] == donor)
  748. {
  749. cur_host[j] = i;
  750. cur_partsz[donor]--;
  751. cur_partsz[i]++;
  752. /* if this donor has no more to give, go find the next donor;
  753. if this receiver needs no more, go on to next receiver */
  754. if (donorsz == cur_partsz[donor] || rcvrsz == cur_partsz[i])
  755. break;
  756. }
  757. }
  758. /* if no bin was found, log a fatal error and exit */
  759. if (j == CVY_MAXBINS)
  760. {
  761. UNIV_PRINT_CRIT(("Bin_targ_map_get: Error - no bin found"));
  762. TRACE_CRIT("%!FUNC! Error - no bin found");
  763. LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
  764. break;
  765. }
  766. }
  767. }
  768. /* finally, compute bit mask for this host (v2.05) */
  769. targ_map = 0;
  770. for (j=0; j<CVY_MAXBINS; j++)
  771. {
  772. if (cur_host[j] == CVY_MAX_HOSTS)
  773. {
  774. UNIV_PRINT_CRIT(("Bin_targ_map_get: Error - incomplete mapping"));
  775. TRACE_CRIT("%!FUNC! Error - incomplete mapping");
  776. LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
  777. cur_host[j] = 0;
  778. }
  779. if (cur_host[j] == my_host_id)
  780. targ_map |= ((MAP_T)1) << j;
  781. }
  782. }
  783. * pmap = targ_map;
  784. return TRUE;
  785. } /* end Bin_targ_map_get */
  786. BOOLEAN Bin_map_check(
  787. ULONG tot_load, /* total load percentage (v2.06) */
  788. PMAP_T pbin_map) /* bin map for all hosts */
  789. {
  790. MAP_T tot_map, /* total map for all hosts */
  791. ovr_map, /* overlap map between hosts */
  792. exp_tot_map; /* expected total map */
  793. ULONG i;
  794. /* compute expected total map (2.04) */
  795. if (tot_load == 0) /* v2.06 */
  796. {
  797. return TRUE;
  798. }
  799. else
  800. {
  801. exp_tot_map = BIN_ALL_ONES;
  802. }
  803. /* compute total map and overlap map */
  804. tot_map = ovr_map = 0;
  805. for (i=0; i<CVY_MAX_HOSTS; i++)
  806. {
  807. ovr_map |= (pbin_map[i] & tot_map);
  808. tot_map |= pbin_map[i];
  809. }
  810. if (tot_map == exp_tot_map && ovr_map == 0)
  811. {
  812. return TRUE;
  813. }
  814. else
  815. {
  816. return FALSE;
  817. }
  818. } /* end Bin_map_check */
  819. BOOLEAN Bin_map_covering(
  820. ULONG tot_load, /* total load percentage (v2.06) */
  821. PMAP_T pbin_map) /* bin map for all hosts */
  822. {
  823. MAP_T tot_map, /* total map for all hosts */
  824. exp_tot_map; /* expected total map */
  825. ULONG i;
  826. /* compute expected total map (v2.04) */
  827. if (tot_load == 0) /* v2.06 */
  828. {
  829. return TRUE;
  830. }
  831. else
  832. {
  833. exp_tot_map = BIN_ALL_ONES;
  834. }
  835. /* compute total map and overlap map */
  836. tot_map = 0;
  837. for (i=0; i<CVY_MAX_HOSTS; i++)
  838. {
  839. tot_map |= pbin_map[i];
  840. }
  841. if (tot_map == exp_tot_map)
  842. {
  843. return TRUE;
  844. }
  845. else
  846. {
  847. return FALSE;
  848. }
  849. } /* end Bin_map_covering */
  850. void Bin_state_init(
  851. PLOAD_CTXT lp,
  852. PBIN_STATE binp, /* ptr. to bin state */
  853. ULONG index, /* index of bin state */
  854. ULONG my_host_id,
  855. ULONG mode,
  856. ULONG prot,
  857. BOOLEAN equal_bal, /* TRUE => balance equally across hosts */
  858. USHORT affinity,
  859. ULONG load_amt) /* this host's load percentage if unequal */
  860. /*
  861. Initialize bin state for a port group
  862. */
  863. {
  864. ULONG i; /* loop variable */
  865. PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
  866. if ((equal_bal && mode == CVY_SINGLE) ||
  867. (mode == CVY_SINGLE && load_amt > CVY_MAX_HOSTS) ||
  868. index >= CVY_MAXBINS)
  869. {
  870. UNIV_ASSERT(FALSE); // This should never happen
  871. }
  872. binp->code = CVY_BINCODE; /* (bbain 8/19/99) */
  873. binp->equal_bal = equal_bal;
  874. binp->affinity = affinity;
  875. binp->index = index;
  876. binp->compatible = TRUE;
  877. binp->mode = mode;
  878. binp->prot = prot;
  879. /* initialize target and new load maps */
  880. binp->targ_map = 0;
  881. binp->all_idle_map = BIN_ALL_ONES;
  882. binp->cmap = 0; /* v2.1 */
  883. for (i=0; i<CVY_MAX_HOSTS; i++)
  884. {
  885. binp->new_map[i] = 0;
  886. binp->cur_map[i] = 0;
  887. binp->chk_map[i] = 0;
  888. binp->idle_map[i] = BIN_ALL_ONES;
  889. }
  890. /* initialize load percentages for all hosts */
  891. if (equal_bal)
  892. {
  893. load_amt = CVY_EQUAL_LOAD;
  894. }
  895. binp->tot_load = load_amt;
  896. for (i=0; i<CVY_MAX_HOSTS; i++)
  897. {
  898. if (i == my_host_id)
  899. {
  900. binp->orig_load_amt =
  901. binp->load_amt[i] = load_amt;
  902. }
  903. else
  904. binp->load_amt[i] = 0;
  905. }
  906. /* initialize requesting state to no requests active and all bins local or none */
  907. binp->snd_bins = 0;
  908. binp->rcv_bins = 0;
  909. binp->rdy_bins = 0;
  910. binp->idle_bins = BIN_ALL_ONES; /* we are initially idle */
  911. /* perform first initialization only once (v2.06) */
  912. if (!(binp->initialized))
  913. {
  914. binp->tconn = 0;
  915. for (i=0; i<CVY_MAXBINS; i++)
  916. {
  917. binp->nconn[i] = 0;
  918. }
  919. Queue_init(&(binp->connq));
  920. binp->initialized = TRUE;
  921. }
  922. /* Initialize the performance counters. */
  923. binp->packets_accepted = 0;
  924. binp->packets_dropped = 0;
  925. binp->bytes_accepted = 0;
  926. binp->bytes_dropped = 0;
  927. } /* end Bin_state_init */
  928. BOOLEAN Bin_converge(
  929. PLOAD_CTXT lp,
  930. PBIN_STATE binp, /* ptr. to bin state */
  931. ULONG my_host_id)
  932. /*
  933. Explicitly attempt to converge new port group state
  934. returns BOOL:
  935. TRUE => all hosts have consistent new state for converging
  936. FALSE => parameter error or inconsistent convergence state
  937. */
  938. {
  939. MAP_T orphan_map; /* map of orphans that this host will now own */
  940. ULONG i;
  941. BOOLEAN fCheckMap = FALSE;
  942. /* determine new target load map; 1.03: return in error if no map generated */
  943. if (!Bin_targ_map_get(lp, binp, my_host_id, &(binp->targ_map)))
  944. {
  945. return FALSE;
  946. }
  947. /* compute map of all currently orphan bins; note that all duplicates are
  948. considered to be orphans */
  949. orphan_map = 0;
  950. for (i=0; i<CVY_MAX_HOSTS; i++)
  951. orphan_map |= binp->cur_map[i];
  952. orphan_map = ~orphan_map;
  953. /* update our new map to include all current bins and orphans that are in the
  954. target set */
  955. binp->new_map[my_host_id] = binp->cmap | /* v2.1 */
  956. (binp->targ_map & orphan_map); /* 1.03 */
  957. /* check that new load maps are consistent and covering */
  958. fCheckMap = Bin_map_check(binp->tot_load, binp->new_map); /* v2.06 */
  959. return fCheckMap;
  960. } /* end Bin_converge */
  961. void Bin_converge_commit(
  962. PLOAD_CTXT lp,
  963. PBIN_STATE binp, /* ptr. to bin state */
  964. ULONG my_host_id)
  965. /*
  966. Commit to new port group state
  967. */
  968. {
  969. ULONG i;
  970. PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
  971. MAP_T old_cmap = binp->cmap;
  972. /* check that new load maps are consistent and covering */
  973. if (!(Bin_map_check(binp->tot_load, binp->new_map))) /* v2.06 */
  974. {
  975. if (!(lp->bad_map))
  976. {
  977. UNIV_PRINT_CRIT(("Bin_converge_commit: Bad new map"));
  978. TRACE_CRIT("%!FUNC! Bad new map");
  979. LOG_MSG1(MSG_ERROR_INTERNAL, MSG_NONE, (ULONG_PTR)binp->new_map);
  980. lp->bad_map = TRUE;
  981. }
  982. }
  983. /* commit to new current maps */
  984. for (i=0; i<CVY_MAX_HOSTS; i++)
  985. {
  986. binp->chk_map[i] =
  987. binp->cur_map[i] = binp->new_map[i];
  988. }
  989. /* setup new send/rcv bins, and new ready to ship bins; note that ready to
  990. ship bins are cleared from the current map */
  991. binp->rdy_bins = binp->cur_map[my_host_id] & ~(binp->targ_map); /* 1.03 */
  992. binp->cur_map[my_host_id] &= ~(binp->rdy_bins);
  993. binp->rcv_bins = binp->targ_map & ~(binp->cur_map[my_host_id]);
  994. binp->cmap = binp->cur_map[my_host_id]; /* v2.1 */
  995. /* If the port rule map has changed, reset the performance counters. */
  996. if (binp->cmap != old_cmap) {
  997. binp->packets_accepted = 0;
  998. binp->packets_dropped = 0;
  999. binp->bytes_accepted = 0;
  1000. binp->bytes_dropped = 0;
  1001. }
  1002. #if 0
  1003. /* simulation output generator (2.05) */
  1004. {
  1005. ULONG lcount = 0L;
  1006. ULONG ncount = 0L;
  1007. MAP_T bins = binp->rdy_bins;
  1008. for (i=0; i<CVY_MAXBINS && bins != 0; i++, bins >>= 1)
  1009. if ((bins & ((MAP_T)0x1)) != ((MAP_T)0))
  1010. lcount++;
  1011. bins = binp->targ_map;
  1012. for (i=0; i<CVY_MAXBINS && bins != 0; i++, bins >>= 1)
  1013. if ((bins & ((MAP_T)0x1)) != ((MAP_T)0))
  1014. ncount++;
  1015. UNIV_PRINT_VERB(("Converge at host %d pg %d: losing %d, will have %d bins\n", my_host_id, binp->index, lcount, ncount));
  1016. }
  1017. #endif
  1018. } /* end Bin_converge_commit */
1019. BOOLEAN Bin_host_update(
1020. PLOAD_CTXT lp,
1021. PBIN_STATE binp, /* ptr. to bin state */
1022. ULONG my_host_id, /* my host's id MINUS one */
1023. BOOLEAN converging, /* TRUE => we are converging now */
1024. BOOLEAN rem_converging, /* TRUE => remote host is converging */
1025. ULONG rem_host, /* remote host's id MINUS one */
1026. MAP_T cur_map, /* remote host's current map or 0 if host died */
1027. MAP_T new_map, /* remote host's new map if converging */
1028. MAP_T idle_map, /* remote host's idle map */
1029. MAP_T rdy_bins, /* bins that host is ready to send; ignored
1030. if converging to prevent bin transfers */
1031. ULONG pkt_count, /* remote host's packet count */
1032. ULONG load_amt) /* remote host's load percentage */
1033. /*
1034. Update hosts's state for a port group
1035. returns BOOL:
1036. TRUE => if not converging, normal return
1037. otherwise, all hosts have consistent state for converging
1038. FALSE => parameter error or inconsistent convergence state
1039. function:
1040. Updates hosts's state for a port group and attempts to converge new states if
1041. in convergence mode. Called when a ping message is received or when a host
1042. is considered to have died. Handles case of newly discovered hosts. Can be
1043. called multiple times with the same information.
1044. */
1045. {
1046. ULONG i;
1047. PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
/* Validate the remote host id: it must be in range and must not be us. */
1048. if (rem_host >= CVY_MAX_HOSTS || rem_host == my_host_id)
1049. {
1050. UNIV_PRINT_CRIT(("Bin_host_update: Parameter error"));
1051. TRACE_CRIT("%!FUNC! Parameter error");
1052. LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, rem_host+1, my_host_id+1);
1053. return FALSE;
1054. }
1055. UNIV_ASSERT(binp->code == CVY_BINCODE); /* (bbain 8/19/99) */
1056. /* change load percentage if load changed */
1057. if (load_amt != binp->load_amt[rem_host])
1058. {
1059. binp->load_amt[rem_host] = load_amt;
1060. }
1061. /* check for non-overlapping maps */
1062. if ((binp->cmap & cur_map) != 0) /* v2.1 */
1063. {
1064. /* if we have received fewer packets than the other host or have a higher host id,
1065. remove duplicates from current map; this uses a heuristic that a newly joining
1066. host that was subnetted probably did not receive packets; we are trying to avoid
1067. having two hosts answer to the same client while minimizing disruption of service
1068. (v1.32B) */
1069. if (lp->send_msg.pkt_count < pkt_count ||
1070. (lp->send_msg.pkt_count == pkt_count && rem_host < my_host_id))
1071. {
1072. MAP_T dup_map;
1073. dup_map = binp->cmap & cur_map; /* v2.1 */
1074. binp->cur_map[my_host_id] &= ~dup_map;
1075. binp->cmap = binp->cur_map[my_host_id]; /* v2.1 */
1076. /* If there has been a collision, reset the performance counters. */
1077. binp->packets_accepted = 0;
1078. binp->packets_dropped = 0;
1079. binp->bytes_accepted = 0;
1080. binp->bytes_dropped = 0;
/* We surrendered bins, so the connection state tracked for this rule is
   no longer valid; kill (dirty or free) all of its connections. */
1081. Load_conn_kill(lp, binp);
1082. }
1083. if (!converging && !rem_converging)
1084. {
1085. if (!(lp->overlap_maps))
1086. {
1087. UNIV_PRINT_CRIT(("Bin_host_update: Host %d: Two hosts with overlapping maps detected %d.", my_host_id, binp->index));
1088. TRACE_CRIT("%!FUNC! Host %d: Two hosts with overlapping maps detected %d.", my_host_id, binp->index);
1089. LOG_MSG2(MSG_WARN_OVERLAP, MSG_NONE, my_host_id+1, binp->index);
1090. lp->overlap_maps = TRUE;
1091. }
1092. /* force convergence if in normal operations */
1093. return FALSE;
1094. }
1095. }
1096. /* now update remote host's current map */
1097. binp->cur_map[rem_host] = cur_map;
1098. /* update idle map and calculate new global idle map if it's changed */
1099. if (binp->idle_map[rem_host] != idle_map)
1100. {
1101. MAP_T saved_map = binp->all_idle_map;
1102. MAP_T new_idle_map = BIN_ALL_ONES;
1103. MAP_T tmp_map;
1104. binp->idle_map[rem_host] = idle_map;
1105. /* compute new idle map for all other hosts */
1106. for (i=0; i<CVY_MAX_HOSTS; i++)
1107. if (i != my_host_id)
1108. new_idle_map &= binp->idle_map[i];
1109. binp->all_idle_map = new_idle_map;
1110. /* see which locally owned bins have gone idle in all other hosts */
1111. tmp_map = new_idle_map & (~saved_map) & binp->cmap; /* v2.1 */
1112. if (tmp_map != 0)
1113. {
1114. UNIV_PRINT_VERB(("Bin_host_update: Host %d pg %d: detected new all idle %08x for local bins",
1115. my_host_id, binp->index, tmp_map));
1116. TRACE_VERB("%!FUNC! Host %d pg %d: detected new all idle 0x%08x for local bins",
1117. my_host_id, binp->index, (ULONG)tmp_map);
1118. }
1119. tmp_map = saved_map & (~new_idle_map) & binp->cmap; /* v2.1 */
1120. if (tmp_map != 0)
1121. {
1122. UNIV_PRINT_VERB(("Bin_host_update: Host %d pg %d: detected new non-idle %08x for local bins",
1123. my_host_id, binp->index, tmp_map));
1124. TRACE_VERB("%!FUNC! Host %d pg %d: detected new non-idle 0x%08x for local bins",
1125. my_host_id, binp->index, (ULONG)tmp_map);
1126. }
1127. }
1128. /* 1.03: eliminated else clause */
1129. /* if we are not converging AND other host not converging, exchange bins;
1130. convergence must now be complete for both hosts */
1131. if (!converging)
1132. {
1133. if (!rem_converging) { /* 1.03: reorganized code to exchange bins only when both
1134. hosts are not converging to avoid using stale bins */
1135. MAP_T new_bins; /* incoming bins from the remote host */
1136. MAP_T old_cmap = binp->cmap;
/* rdy_bins from the remote is only trusted in this branch, i.e. when
   both sides are out of convergence; stale offers are never applied. */
1137. /* check to see if remote host has received some bins from us */
1138. binp->rdy_bins &= (~cur_map);
1139. /* check to see if we can receive some bins */
1140. new_bins = binp->rcv_bins & rdy_bins;
1141. if (new_bins != 0)
1142. {
1143. if ((binp->cmap & new_bins) != 0) /* v2.1 */
1144. {
1145. if (!(lp->err_rcving_bins))
1146. {
1147. UNIV_PRINT_CRIT(("Bin_host_update: Receiving bins already own"));
1148. TRACE_CRIT("%!FUNC! Receiving bins already own");
1149. LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, binp->cur_map[my_host_id], new_bins);
1150. lp->err_rcving_bins = TRUE;
1151. }
1152. }
1153. binp->cur_map[my_host_id] |= new_bins;
1154. binp->rcv_bins &= ~new_bins;
1155. binp->cmap = binp->cur_map[my_host_id]; /* v2.1 */
1156. /* If the port rule map has changed, reset the performance counters. */
1157. if (binp->cmap != old_cmap) {
1158. binp->packets_accepted = 0;
1159. binp->packets_dropped = 0;
1160. binp->bytes_accepted = 0;
1161. binp->bytes_dropped = 0;
1162. }
1163. UNIV_PRINT_VERB(("Bin_host_update: Host %d pg %d: received %08x ; cur now %08x",
1164. my_host_id, binp->index, new_bins, binp->cur_map[my_host_id]));
1165. TRACE_VERB("%!FUNC! host %d pg %d: received 0x%08x ; cur now 0x%08x",
1166. my_host_id, binp->index, (ULONG)new_bins, (ULONG)binp->cur_map[my_host_id]);
1167. }
1168. /* do consistency check that all bins are covered */
1169. binp->chk_map[rem_host] = cur_map | rdy_bins;
1170. binp->chk_map[my_host_id] = binp->cmap | binp->rdy_bins; /* v2.1 */
1171. if (!Bin_map_covering(binp->tot_load, binp->chk_map)) /* v2.06 */
1172. {
1173. if (!(lp->err_orphans))
1174. {
1175. #if 0
1176. UNIV_PRINT_CRIT(("Bin_host_update: Host %d: Orphan bins detected", my_host_id));
1177. TRACE_CRIT("%!FUNC! Host %d: Orphan bins detected", my_host_id);
1178. LOG_MSG1(MSG_ERROR_INTERNAL, MSG_NONE, my_host_id+1);
1179. #endif
1180. lp->err_orphans = TRUE;
1181. }
1182. }
1183. }
1184. return TRUE;
1185. }
1186. /* otherwise, store proposed new load map and try to converge current host data */
1187. else
1188. {
1189. BOOLEAN fRet;
1190. binp->chk_map[rem_host] =
1191. binp->new_map[rem_host] = new_map;
1192. fRet = Bin_converge(lp, binp, my_host_id);
1193. return fRet;
1194. }
1195. } /* end Bin_host_update */
1196. void Bin_state_print(
1197. PBIN_STATE binp, /* ptr. to bin state */
1198. ULONG my_host_id)
/*
  Debug helper: dump this port rule's maps (target/current/new), balancing
  parameters, and bin-exchange sets via the verbose print/trace channels.
  Reads state only; modifies nothing.
*/
1199. {
1200. #if 0
1201. ULONG i;
1202. #endif
1203. UNIV_PRINT_VERB(("Bin_state_print: Hst %d binp %x: maps: targ %x cur %x new %x; eq %d mode %d amt %d tot %d; bins: snd %x rcv %x rdy %x",
1204. my_host_id, binp, binp->targ_map, binp->cur_map[my_host_id], binp->new_map[my_host_id],
1205. binp->equal_bal, binp->mode, binp->load_amt[my_host_id],
1206. binp->tot_load, binp->snd_bins, binp->rcv_bins, binp->rdy_bins));
1207. TRACE_VERB("%!FUNC! Hst 0x%x binp 0x%p: maps: targ 0x%x cur 0x%x new 0x%x; eq %d mode %d amt %d tot %d; bins: snd 0x%x rcv 0x%x rdy 0x%x",
1208. my_host_id, binp, (ULONG)binp->targ_map, (ULONG)binp->cur_map[my_host_id], (ULONG)binp->new_map[my_host_id],
1209. binp->equal_bal, binp->mode, binp->load_amt[my_host_id],
1210. binp->tot_load, (ULONG)binp->snd_bins, (ULONG)binp->rcv_bins, (ULONG)binp->rdy_bins);
/* The per-host and per-bin dumps below are compiled out; they reference
   fields (req_host, bin_state) not touched by the live code above. */
1211. #if 0
1212. for (i=0; i<CVY_MAX_HOSTS; i++)
1213. {
1214. UNIV_PRINT_VERB(("Bin_state_print: Host %d: cur map %x new %x load_amt %d", i+1, binp->cur_map[i],
1215. binp->new_map[i], binp->load_amt[i]));
1216. TRACE_VERB("%!FUNC! Host %d: cur map 0x%x new 0x%x load_amt %d", i+1, binp->cur_map[i],
1217. binp->new_map[i], binp->load_amt[i]);
1218. }
1219. for (i=0; i<CVY_MAXBINS; i++)
1220. {
1221. UNIV_PRINT_VERB(("Bin_state_print: Bin %d: req_host %d bin_state %d nconn %d", i, binp->req_host[i],
1222. binp->bin_state[i], binp->nconn[i]));
1223. TRACE_VERB("%!FUNC! Bin %d: req_host %d bin_state %d nconn %d", i, binp->req_host[i],
1224. binp->bin_state[i], binp->nconn[i]);
1225. }
1226. #endif
1227. } /* end Bin_state_print */
1228. /*
1229. * Function: Load_soil_dscr
1230. * Description: This function marks a given connection dirty and either destroys
1231. * it or moves it to the dirty descriptor queue for subsequent cleanup.
1232. * Parameters: lp - a pointer to the load module.
1233. * bp - a pointer to the appropriate port rule.
1234. * ep - a pointer to the descriptor to soil.
1235. * Returns: Nothing.
1236. * Author: shouse, 7.23.02
1237. * Notes:
1238. */
1239. void Load_soil_dscr (PLOAD_CTXT lp, PBIN_STATE bp, PCONN_ENTRY ep)
1240. {
1241. /* Mark the connection dirty. We mark the connection dirty here to
1242. ensure that Load_put_dscr does not update the connection counters
1243. when this descriptor is eventually destroyed. */
1244. ep->flags |= NLB_CONN_ENTRY_FLAGS_DIRTY;
1245. /* Increment the dirty connection counters. We do this unconditionally
1246. because we've already marked the descriptor dirty. Load_put_dscr
1247. will decrement these counters when it sees that the descriptor has
1248. been marked dirty. */
1249. lp->dirty_bin[ep->bin]++;
1250. lp->num_dirty++;
1251. /* Make connection and bin dirty if we don't have a zero timeout period so that they
1252. will not be handled by TCP/IP anymore; this avoids allowing TCP/IP's now stale
1253. connection state from handling packets for newer connections should traffic be
1254. directed to this host in the future.
1255. Only mark descriptors and bins dirty, how-
1256. ever, if the descriptor is NOT on the timeout queue. */
1257. if (!ep->timeout)
1258. {
1259. switch (ep->protocol)
1260. {
1261. case TCPIP_PROTOCOL_TCP:
1262. case TCPIP_PROTOCOL_PPTP:
1263. case TCPIP_PROTOCOL_GRE:
1264. #if defined (NLB_TCP_NOTIFICATION)
1265. /* If TCP notifications are turned on, we will mark these descriptors dirty
1266. and remove them when TCP notifies us that it has removed the state for
1267. the TCP connection. GRE descriptors always correspond to a PPTP/TCP
1268. tunnel and are cleaned up when their "parent" descriptor is cleaned up. */
1269. if (NLB_NOTIFICATIONS_ON() || (lp->cln_timeout > 0))
1270. #else
1271. /* If there is a non-zero cleanup timeout, place these descriptors on the
1272. dirty queue and clean them up when the timeout expires. */
1273. if (lp->cln_timeout > 0)
1274. #endif
1275. {
1276. /* Unlink the descriptor from the bin queue and link it to the dirty queue. */
1277. Link_unlink(&(ep->blink));
1278. Queue_enq(&(lp->conn_dirtyq), &(ep->blink));
1279. /* Note that a cleanup is now pending. */
1280. lp->cln_waiting = TRUE;
1281. }
1282. /* Otherwise, clean the descriptors up now. */
1283. else
1284. {
1285. /* Clear the descriptor. */
1286. CVY_CONN_CLEAR(ep);
1287. /* Release the descriptor. */
1288. Load_put_dscr(lp, bp, ep);
1289. }
1290. break;
/* IPsec descriptors are always moved to the dirty queue, never destroyed
   inline here, regardless of the cleanup timeout setting. */
/* NOTE(review): presumably the IPsec descriptor timeout path reaps these
   later - confirm against the cleanup/timeout code. */
1291. case TCPIP_PROTOCOL_IPSEC1:
1292. case TCPIP_PROTOCOL_IPSEC_UDP:
1293. /* Unlink the descriptor from the bin queue and link it to the dirty queue. */
1294. Link_unlink(&(ep->blink));
1295. Queue_enq(&(lp->conn_dirtyq), &(ep->blink));
1296. /* Note that a cleanup is now pending. */
1297. lp->cln_waiting = TRUE;
1298. break;
1299. default:
1300. /* Clear the descriptor. */
1301. CVY_CONN_CLEAR(ep);
1302. /* Release the descriptor. */
1303. Load_put_dscr(lp, bp, ep);
1304. break;
1305. }
1306. }
1307. /* Otherwise, if the descriptor is already timing-out (timeout != 0), TCP/IP should
1308. not have any stale state for this connection, as it has already terminated, so
1309. just destroy the descriptor now. */
1310. else
1311. {
1312. /* Clear the descriptor. */
1313. CVY_CONN_CLEAR(ep);
1314. /* Release the descriptor. */
1315. Load_put_dscr(lp, bp, ep);
1316. }
1317. }
/* Soil or free every connection descriptor queued on port rule bp, zero its
   per-bin and total connection counts, and mark all of its bins idle.  Also
   cross-checks the walked-descriptor counts against bp->nconn[]. */
1318. void Load_conn_kill(
1319. PLOAD_CTXT lp,
1320. PBIN_STATE bp)
1321. /*
1322. Kill all connections in a port group (v1.32B)
1323. */
1324. {
1325. PCONN_ENTRY ep; /* ptr. to connection entry */
1326. QUEUE * qp; /* ptr. to bin's connection queue */
1327. QUEUE * dqp; /* ptr. to dirty queue */
1328. LONG count[CVY_MAXBINS];
1329. /* count of cleaned up connections per bin for checking */
1330. ULONG i;
1331. BOOLEAN err_bin; /* bin id error detected */
1332. BOOLEAN err_count; /* connection count error detected */
1333. PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
1334. err_bin = err_count = FALSE;
1335. qp = &(bp->connq);
1336. dqp = &(lp->conn_dirtyq);
1337. for (i=0; i<CVY_MAXBINS; i++)
1338. count[i] = 0;
1339. /* remove connections from bin queue and either make dirty or cleanup */
1340. ep = (PCONN_ENTRY)Queue_front(qp);
1341. while (ep != NULL)
1342. {
1343. UNIV_ASSERT(ep->code == CVY_ENTRCODE);
1344. if (ep->bin >= CVY_MAXBINS)
1345. {
1346. if (!err_bin)
1347. {
1348. UNIV_PRINT_CRIT(("Load_conn_kill: Bad bin id"));
1349. TRACE_CRIT("%!FUNC! Bad bin id");
1350. LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, ep->bin, CVY_MAXBINS);
1351. err_bin = TRUE;
1352. }
1353. }
1354. else
1355. {
1356. count[ep->bin]++;
1357. }
1358. /* Mark the descriptor dirty and either free it or move it to
1359. the dirty descriptor queue for subsequent cleanup. */
1360. Load_soil_dscr(lp, bp, ep);
/* Load_soil_dscr always removes ep from this queue (it is either freed or
   re-linked onto the dirty queue), so the front of qp advances each pass
   and this loop terminates. */
1361. ep = (PCONN_ENTRY)Queue_front(qp);
1362. }
1363. /* now make bins idle */
1364. for (i=0; i<CVY_MAXBINS; i++)
1365. {
1366. if (bp->nconn[i] != count[i])
1367. {
1368. if (!err_count)
1369. {
1370. UNIV_PRINT_CRIT(("Load_conn_kill: Bad connection count %d %d bin %d", bp->nconn[i], (LONG)count[i], i));
1371. TRACE_CRIT("%!FUNC! Bad connection count %d %d bin %d", bp->nconn[i], (LONG)count[i], i);
1372. /* KXF 2.1.1 - removed after tripped up at MSFT a few times */
1373. #if 0
1374. LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, bp->nconn[i], (LONG)count[i]);
1375. #endif
1376. err_count = TRUE;
1377. }
1378. }
1379. bp->nconn[i] = 0;
1380. }
1381. lp->nconn -= bp->tconn;
1382. if (lp->nconn < 0)
1383. lp->nconn = 0;
1384. bp->tconn = 0;
1385. bp->idle_bins = BIN_ALL_ONES;
/* NOTE(review): zeroing cur_time appears to restart the pending-cleanup
   countdown so freshly dirtied descriptors get a full cln_timeout before
   being reaped - confirm against the timer/cleanup code. */
1386. if (lp->cln_waiting)
1387. {
1388. lp->cur_time = 0;
1389. }
1390. }
1391. void Load_conn_cleanup(
1392. PLOAD_CTXT lp)
1393. /*
1394. Clean up all dirty connections (v1.32B)
1395. */
1396. {
1397. PCONN_ENTRY ep; /* ptr. to connection entry */
1398. PCONN_ENTRY next; /* ptr. to next connection entry */
1399. QUEUE * dqp; /* ptr. to dirty queue */
1400. BOOLEAN err_bin; /* bin id error detected */
1401. ULONG i;
1402. PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
1403. err_bin = FALSE;
1404. dqp = &(lp->conn_dirtyq);
1405. /* dequeue and clean up all connections on dirty connection queue */
1406. ep = (PCONN_ENTRY)Queue_front(dqp);
1407. while (ep != NULL)
1408. {
1409. PBIN_STATE bp;
1410. UNIV_ASSERT(ep->code == CVY_ENTRCODE);
1411. if (ep->bin >= CVY_MAXBINS)
1412. {
1413. if (!err_bin)
1414. {
1415. UNIV_PRINT_CRIT(("Load_conn_cleanup: Bad bin id"));
1416. TRACE_CRIT("%!FUNC! Bad bin id");
1417. LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, ep->bin, CVY_MAXBINS);
1418. err_bin = TRUE;
1419. }
1420. }
1421. /* If we're about to clean up this descriprtor, it had better be dirty. */
1422. UNIV_ASSERT(ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY);
1423. /* Find the NEXT descriptor in the queue before we possibly destroy this one. */
1424. next = (PCONN_ENTRY)Queue_next(dqp, &(ep->blink));
1425. switch (ep->protocol)
1426. {
1427. case TCPIP_PROTOCOL_IPSEC1:
1428. case TCPIP_PROTOCOL_IPSEC_UDP:
/* IPsec descriptors are intentionally skipped: they stay on the dirty
   queue rather than being released in this pass. */
/* NOTE(review): presumably the IPsec descriptor timeout path reaps these
   later - confirm against the timeout handling code. */
1429. break;
1430. case TCPIP_PROTOCOL_TCP:
1431. case TCPIP_PROTOCOL_PPTP:
1432. case TCPIP_PROTOCOL_GRE:
1433. #if defined (NLB_TCP_NOTIFICATION)
/* With TCP notifications on, these descriptors are left alone here; the
   notification callback releases them when TCP tears down its state. */
1434. if (!NLB_NOTIFICATIONS_ON())
1435. {
1436. #endif
1437. /* Lookup the port rule, so we can update the port rule info. */
1438. bp = Load_pg_lookup(lp, ep->svr_ipaddr, ep->svr_port, IS_TCP_PKT(ep->protocol));
1439. /* Clear the descriptor. */
1440. CVY_CONN_CLEAR(ep);
1441. /* Release the descriptor. */
1442. Load_put_dscr(lp, bp, ep);
1443. #if defined (NLB_TCP_NOTIFICATION)
1444. }
1445. #endif
1446. break;
1447. default:
1448. /* Lookup the port rule, so we can update the port rule info. */
1449. bp = Load_pg_lookup(lp, ep->svr_ipaddr, ep->svr_port, IS_TCP_PKT(ep->protocol));
1450. /* Clear the descriptor. */
1451. CVY_CONN_CLEAR(ep);
1452. /* Release the descriptor. */
1453. Load_put_dscr(lp, bp, ep);
1454. break;
1455. }
1456. /* Set the current descriptor to the next descriptor. */
1457. ep = next;
1458. }
1459. }
  1460. void Load_stop(
  1461. PLOAD_CTXT lp)
  1462. {
  1463. ULONG i;
  1464. IRQLEVEL irql;
  1465. PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
  1466. UNIV_ASSERT(lp->code == CVY_LOADCODE); /* (bbain 8/19/99) */
  1467. if (!(lp->active))
  1468. {
  1469. return;
  1470. }
  1471. LOCK_ENTER(&(lp->lock), &irql);
  1472. /* make connections for all rules dirty so they will not be handled */
  1473. for (i=0; i<lp->send_msg.nrules; i++)
  1474. {
  1475. PBIN_STATE bp; /* ptr. to bin state */
  1476. bp = &(lp->pg_state[i]);
  1477. UNIV_ASSERT(bp->code == CVY_BINCODE); /* (bbain 8/21/99) */
  1478. Load_conn_kill(lp, bp); /* (v1.32B) */
  1479. /* advertise that we are not handling any load in case a ping is sent out */
  1480. lp->send_msg.cur_map[i] = 0;
  1481. lp->send_msg.new_map[i] = 0;
  1482. lp->send_msg.idle_map[i] = BIN_ALL_ONES;
  1483. lp->send_msg.rdy_bins[i] = 0;
  1484. lp->send_msg.load_amt[i] = 0;
  1485. }
  1486. lp->send_msg.state = HST_CVG; /* force convergence (v2.1) */
  1487. /* go inactive until restarted */
  1488. lp->active = FALSE;
  1489. lp->nconn = 0; /* v2.1 */
  1490. LOCK_EXIT(&(lp->lock), irql);
  1491. } /* end Load_stop */
/*
 * Function: Load_start
 * Description: Activates the load module. (Re)initializes all per-host,
 *              per-rule and heartbeat state from the configured parameters,
 *              performs the initial bin convergence for every port rule,
 *              seeds the outgoing heartbeat (ping) message, and puts the
 *              module into the converging (HST_CVG) state. The module is
 *              marked active only at the very end.
 * Parameters: lp - pointer to the load module context to start.
 * Returns: BOOLEAN - FALSE if the module was already active (no-op),
 *          TRUE once the module has been started.
 * Notes: NOTE(review): callers appear to serialize Load_start/Load_stop
 *        externally; no load lock is taken here - confirm against callers.
 */
BOOLEAN Load_start( /* (v1.32B) */
    PLOAD_CTXT      lp)
{
    ULONG       i;
    BOOLEAN     ret;
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
    WCHAR       me[20];

    /* Lazily perform one-time initialization on the first start. */
    if (!(lp->initialized))
        Load_init(lp, & ctxtp -> params);

    UNIV_ASSERT(lp->code == CVY_LOADCODE);   /* (bbain 8/19/99) */

    /* Already running: nothing to do. */
    if (lp->active)
    {
        return FALSE;
    }

    /* Host IDs are zero-based internally; host_priority is one-based in the parameters. */
    lp->my_host_id =(* (lp->params)).host_priority - 1;

    /* Initially this host is the only known/pinged member of the cluster. */
    lp->ping_map =
    lp->host_map = 1 << lp->my_host_id;

    lp->last_hmap = 0;    /* bbain RTM RC1 6/23/99 */

    /* No heartbeats have been missed from any host yet. */
    for (i=0; i<CVY_MAX_HOSTS; i++)
    {
        lp->nmissed_pings[i] = 0;
    }

    /* Cache timing parameters from the configuration. */
    lp->min_missed_pings = (* (lp->params)).alive_tolerance;
    lp->cln_timeout      = (* (lp->params)).cleanup_delay;
    lp->def_timeout      = (* (lp->params)).alive_period;

    /* No hosts have reported a stable state yet; assume consistency until
       a heartbeat proves otherwise. */
    lp->stable_map       = 0;
    lp->consistent       = TRUE;

    /* Intiialize the bad teaming configuration detected flag. */
    lp->bad_team_config  = FALSE;

    /* Host map of legacy (win2k/NT4.0) hosts detected. */
    lp->legacy_hosts     = 0;

    /* Clear all error/inconsistency latches; each is set at most once per
       occurrence so the corresponding event is logged only once. */
    lp->dup_hosts        = FALSE;
    lp->dup_sspri        = FALSE;
    lp->bad_map          = FALSE;
    lp->overlap_maps     = FALSE;
    lp->err_rcving_bins  = FALSE;
    lp->err_orphans      = FALSE;
    lp->bad_num_rules    = FALSE;
    lp->alloc_inhibited  = FALSE;
    lp->alloc_failed     = FALSE;
    lp->bad_defrule      = FALSE;

    lp->scale_client     = (BOOLEAN)(* (lp->params)).scale_client;

    /* Reset convergence stability counters. */
    lp->my_stable_ct     = 0;
    lp->all_stable_ct    = 0;
    lp->min_stable_ct    = lp->min_missed_pings;

    /* Connection descriptor allocation policy. */
    lp->dscr_per_alloc   = (* (lp->params)).dscr_per_alloc;
    lp->max_dscr_allocs  = (* (lp->params)).max_dscr_allocs;

    /* Calculate the maximum number of outstanding descriptors (in use) allowed. */
    lp->max_dscr_out     = lp->max_dscr_allocs * lp->dscr_per_alloc;

    /* Descriptor timeouts for TCP and IPSec connections. */
    lp->tcp_timeout      = (* (lp->params)).tcp_dscr_timeout;
    lp->ipsec_timeout    = (* (lp->params)).ipsec_dscr_timeout;

    lp->pkt_count        = 0;

    /* initialize port group bin states; add a default rule at the end */
    if ((* (lp->params)).num_rules >= (CVY_MAX_RULES - 1))
    {
        UNIV_PRINT_CRIT(("Load_start: Too many rules; using max possible."));
        TRACE_CRIT("%!FUNC! Too many rules; using max possible.");
        lp->send_msg.nrules = (USHORT)CVY_MAX_RULES;
    }
    else
        lp->send_msg.nrules = (USHORT)((* (lp->params)).num_rules) + 1;

    for (i=0; i<lp->send_msg.nrules; i++)
    {
        PBIN_STATE  bp;     /* ptr. to bin state */
        PCVY_RULE   rp;     /* ptr. to rules array */

        bp = &(lp->pg_state[i]);
        rp = &((* (lp->params)).port_rules[i]);

        if (i == (((ULONG)lp->send_msg.nrules) - 1))

            /* initialize bin state for default rule to single server with
               host priority */

            Bin_state_init(lp, bp, i, lp->my_host_id, CVY_SINGLE, CVY_TCP_UDP,
                           FALSE, (USHORT)0, (* (lp->params)).host_priority);

        else if (rp->mode == CVY_SINGLE)
            Bin_state_init(lp, bp, i, lp->my_host_id, rp->mode, rp->protocol,
                           FALSE, (USHORT)0, rp->mode_data.single.priority);
        else if (rp->mode == CVY_MULTI)
            Bin_state_init(lp, bp, i, lp->my_host_id, rp->mode, rp->protocol,
                           (BOOLEAN)(rp->mode_data.multi.equal_load),
                           rp->mode_data.multi.affinity,
                           (rp->mode_data.multi.equal_load ?
                            CVY_EQUAL_LOAD : rp->mode_data.multi.load));

        /* handle CVY_NEVER mode as multi-server. the check for
           those modes is done before attempting to hash to the bin in
           Load_packet_check and Load_conn_advise so bin distribution plays
           no role in the behavior, but simply allows the rule to be valid
           across all of the operational servers */

        else
            Bin_state_init(lp, bp, i, lp->my_host_id, rp->mode, rp->protocol,
                           TRUE, (USHORT)0, CVY_EQUAL_LOAD);

        /* Perform the initial (single-host) convergence for this rule. */
        ret = Bin_converge(lp, bp, lp->my_host_id);
        if (!ret)
        {
            UNIV_PRINT_CRIT(("Load_start: Initial convergence inconsistent"));
            TRACE_CRIT("%!FUNC! Initial convergence inconsistent");
            LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
        }

        /* export current port group state to send msg */

        if (i == (((ULONG)(lp->send_msg.nrules)) - 1))
            lp->send_msg.rcode[i]= 0;       /* default rule carries no rule code */
        else
            lp->send_msg.rcode[i]= rp->code;

        lp->send_msg.cur_map[i]  = bp->cmap;                        /* v2.1 */
        lp->send_msg.new_map[i]  = bp->new_map[lp->my_host_id];
        lp->send_msg.idle_map[i] = bp->idle_bins;
        lp->send_msg.rdy_bins[i] = bp->rdy_bins;
        lp->send_msg.load_amt[i] = bp->load_amt[lp->my_host_id];

        // NOTE: The following line of code was removed when it was discovered that it
        // routinely produces a Wake On LAN pattern in the heartbeat that causes BroadCom
        // NICs to panic.  Although this is NOT an NLB issue, but rather a firmware issue
        // in BroadCom NICs, it was decided to remove the information from the heartbeat
        // to alleviate the problem for customers with BroadCom NICs upgrading to .NET.
        // This array is UNUSED by NLB, so there is no harm in not filling it in; it was
        // added a long time ago for debugging purposes as part of the now-defunct FIN-
        // counting fix that was part of Win2k SP1.
        //
        // For future reference, should we need to use this space in the heartbeat at some
        // future point in time, it appears that we will need to be careful to avoid potential
        // WOL patterns in our heartbeats where we can avoid it.  A WOL pattern is:
        //
        // 6 bytes of 0xFF, followed by 16 idential instances of a "MAC address" that can
        // appear ANYWHERE in ANY frame type, including our very own NLB heartbeats.  E.g.:
        //
        // FF FF FF FF FF FF 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06
        // 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06
        // 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06
        // 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06
        // 01 02 03 04 05 06
        //
        // The MAC address need not be valid, however.  In NLB heartbeats, the "MAC address"
        // in the mistaken WOL pattern is "00 00 00 00 00 00".  NLB routinely fills heartbeats
        // with FF and 00 bytes, but it seems that by "luck" no other place in the heartbeat
        // seems this vulnerable.  For instance, in the load_amt array, each entry has a
        // maximum value of 100 (decimal), so there is no possibility of generating the initial
        // 6 bytes of FF to start the WOL pattern.  All of the "map" arrays seem to be saved
        // by two strokes of fortune; (i) little endian and (ii) the bin distribution algorithm.
        //
        // (i) Since we don't use the 4 most significant bits of the ULONGLONGs used to store
        // each map, the most significant bit is NEVER FF.  Because Intel is little endian, the
        // most significant byte appears last.  For example:
        //
        // 0F FF FF FF FF FF FF FF appears in the packet as FF FF FF FF FF FF 0F
        //
        // This breaks the FF sequence in many scenarios.
        //
        // (ii) The way the bin distribution algorithm distributes buckets to hosts seems to
        // discourage other possibilities.  For instance, a current map of:
        //
        // 00 FF FF FF FF FF FF 00
        //
        // just isn't likely.  However, it IS STILL POSSIBLE!  So, it is important to note that:
        //
        // REMOVING THIS LINE OF CODE DOES NOT, IN ANY WAY, GUARANTEE THAT AN NLB HEARTBEAT
        // CANNOT STILL CONTAIN A VALID WAKE ON LAN PATTERN SOMEWHERE ELSE IN THE FRAME!!!
        // lp->send_msg.pg_rsvd1[i] = (ULONG)bp->all_idle_map;
    }

    /* initialize send msg */

    lp->send_msg.host_id   = (USHORT)(lp->my_host_id);
    lp->send_msg.master_id = (USHORT)(lp->my_host_id);

    /* hcode uniquely identifies this host instance; used to detect duplicate host IDs. */
    lp->send_msg.hcode     = lp->params->install_date;
    lp->send_msg.pkt_count = lp->pkt_count;     /* 1.32B */

    Univ_ulong_to_str (lp->my_host_id+1, me, 10);

    /* Tracking convergence - Starting convergence because this host is joining the cluster. */
    LOG_MSGS(MSG_INFO_CONVERGING_NEW_MEMBER, me, me);
    TRACE_CONVERGENCE("%!FUNC! Initiating convergence on host %d. Reason: Host %d is joining the cluster.", lp->my_host_id+1, lp->my_host_id+1);

    /* Tracking convergence - Starting convergence. */
    lp->send_msg.state = HST_CVG;

    /* Reset the convergence statistics. */
    lp->num_convergences = 1;
    lp->last_convergence = 0;

    /* activate module */
    lp->active = TRUE;

    return TRUE;
} /* end Load_start */
/*
 * Function: Load_init
 * Description: Performs one-time initialization of a load module context:
 *              initializes the lock, the hashed connection descriptor table
 *              and its queues, the dirty/recovery/expiration queues, and
 *              creates the fixed-size NDIS block pool used to allocate
 *              additional connection descriptors. Idempotent: if the module
 *              is already initialized, only the magic code is re-asserted.
 * Parameters: lp - pointer to the load module context to initialize.
 *             params - pointer to the NLB configuration parameters; a
 *                      pointer to this structure is retained in lp->params.
 * Returns: Nothing.
 * Notes: Does NOT activate the module (lp->active stays FALSE); that is
 *        done by Load_start. If block pool creation fails, the failure is
 *        logged and lp->free_dscr_pool remains NULL; descriptor allocation
 *        is then unavailable.
 */
void Load_init(
    PLOAD_CTXT      lp,
    PCVY_PARAMS     params)
{
    ULONG       i;
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    LOCK_INIT(&(lp->lock));

    if (!(lp->initialized))
    {
        lp->code = CVY_LOADCODE;    /* (bbain 8/19/99) */

        /* initialize hashed connection descriptors and queues */

        for (i=0; i<CVY_MAX_CHASH; i++)
        {
            PCONN_ENTRY ep;

            ep = &(lp->hashed_conn[i]);

            /* Initialize the descriptor at this hash location. */
            Load_init_dscr(lp, ep, FALSE);

            /* Initialize the connection queue at this hash location. */
            Queue_init(&(lp->connq[i]));
        }

        /* Initialize connection free and dirty queues. */

        Queue_init(&(lp->conn_dirtyq));
        Queue_init(&(lp->conn_rcvryq));

        /* Initialize the queues for timing out connection descriptors. */

        Queue_init(&(lp->tcp_expiredq));
        Queue_init(&(lp->ipsec_expiredq));

        /* Reset the number of dirty connections. */

        lp->num_dirty = 0;

        for (i=0; i<CVY_MAXBINS; i++)
        {
            /* Reset the dirty connection bin counters. */
            lp->dirty_bin[i] = 0;
        }

        /* No dirty-connection cleanup is pending yet. */
        lp->cln_waiting = FALSE;

        lp->def_timeout =
        lp->cur_timeout = params -> alive_period;

        lp->nconn       = 0;
        lp->active      = FALSE;
        lp->initialized = TRUE;

        /* Initially, there are no outstanding connection descriptors. */

        lp->num_dscr_out = 0;
        lp->max_dscr_out = 0;

        /* Allocate a fixed-size block pool for connection descriptors. */

        lp->free_dscr_pool = NdisCreateBlockPool(sizeof(CONN_DESCR), 0, 'dBLN', NULL);

        if (lp->free_dscr_pool == NULL)
        {
            /* Best-effort: log and continue with a NULL pool. */
            UNIV_PRINT_CRIT(("Load_init: Error creating fixed-size block pool"));
            TRACE_CRIT("%!FUNC! Error creating fixed-size block pool");
        }

        /* Store a pointer to the NLB parameters. */

        lp->params = params;

        /* Initialize the reference count on this load module. */

        lp->ref_count = 0;

        /* Reset the internally maintained clock used for connection descriptor timeout. */

        lp->clock_sec  = 0;
        lp->clock_msec = 0;
    }
    else
    {
        UNIV_ASSERT(lp->code == CVY_LOADCODE);
    }

    /* Don't start module. */
} /* end Load_init */
/* DO NOT CALL THIS FUNCTION WITH THE LOAD LOCK HELD! */
/*
 * Function: Load_cleanup
 * Description: Tears down a (stopped) load module. When TCP notification is
 *              enabled, first unlinks every dirty descriptor from the global
 *              established-connection hash queues. Then frees every
 *              allocated connection descriptor back to the fixed-size block
 *              pool and destroys the pool itself.
 * Parameters: lp - pointer to the load module context to clean up.
 * Returns: Nothing.
 * Notes: The module must already be inactive (asserted below). The global
 *        queue locks must be acquired BEFORE the load lock by convention;
 *        since this function takes only the global queue locks, the caller
 *        must not hold the load lock (see comment inside).
 */
void Load_cleanup(
    PLOAD_CTXT      lp)
{
    ULONG       i;
    PCONN_ENTRY ep = NULL;
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    UNIV_ASSERT(lp->code == CVY_LOADCODE);
    UNIV_ASSERT(!lp->active);

#if defined (NLB_TCP_NOTIFICATION)
    /* If notification is on, we need to unlink any connections that we have
       from the global established connection queues. */
    if (NLB_NOTIFICATIONS_ON())
    {
        /* Loop through all of the dirty descriptors and unlink them all
           from the global connection queue.  There is no need to actually
           clean them up or update any counters as this load module is
           about to disappear. */
        ep = (PCONN_ENTRY)Queue_deq(&lp->conn_dirtyq);

        while (ep != NULL)
        {
            UNIV_ASSERT(ep->code == CVY_ENTRCODE);

            /* If we're about to clean up this descriptor, it had better be dirty. */
            UNIV_ASSERT(ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY);

            /* Note: virtual descriptors are NOT placed in the global connection
               queues, so dirty virtual descriptors do not need to be unlinked. */
            if (!(ep->flags & NLB_CONN_ENTRY_FLAGS_VIRTUAL))
            {
                /* Note: The rule for locking the global queues is that you MUST
                   lock the queues BEFORE locking the load module itself.  For
                   most load APIs, the main module locks the load module BEFORE
                   calling the relevant load module API.  Load_cleanup, however,
                   is a case where the load lock is not acquired AT ALL.  Therefore,
                   it is permissible for us to grab the global queue locks here,
                   knowing that the load module lock has NOT BEEN LOCKED.  DO NOT
                   ACQUIRE THE LOAD MODULE LOCK BEFORE CALLING THIS FUNCTION. */
                NdisAcquireSpinLock(&g_conn_estabq[ep->index].lock);

                /* Unlink from the global connection queue. */
                g_conn_estabq[ep->index].length--;
                Link_unlink(&ep->glink);

                NdisReleaseSpinLock(&g_conn_estabq[ep->index].lock);
            }

            /* Get the next descriptor in the queue. */
            ep = (PCONN_ENTRY)Queue_deq(&lp->conn_dirtyq);
        }
    }
#endif

    /* Destroy the fixed-size block pool and all descriptors therein.
       Note that NdisDestroyBlockPool expects all allocated blocks
       have been returned to the pool (freed) before it is called. */
    if (lp->free_dscr_pool != NULL)
    {
        /* Loop through all of the connection descriptor queues and
           free all of the descriptors we've allocated. */
        for (i = 0; i < CVY_MAX_CHASH; i++)
        {
            /* Dequeue the head of the queue. */
            PCONN_DESCR dp = (PCONN_DESCR)Queue_deq(&lp->connq[i]);

            while (dp != NULL)
            {
                UNIV_ASSERT(dp->code == CVY_DESCCODE);

                /* If we're about to free this descriptor, it had better be allocated. */
                UNIV_ASSERT(dp->entry.flags & NLB_CONN_ENTRY_FLAGS_ALLOCATED);

                /* Free the descriptor back to the fixed-size block pool. */
                NdisFreeToBlockPool((PUCHAR)dp);

                /* Get the next descriptor in the queue. */
                dp = (PCONN_DESCR)Queue_deq(&lp->connq[i]);
            }
        }

        /* Destroy the fixed-size block pool. */
        NdisDestroyBlockPool(lp->free_dscr_pool);
    }
} /* end Load_cleanup */
  1801. void Load_convergence_start (PLOAD_CTXT lp)
  1802. {
  1803. PMAIN_CTXT ctxtp = CONTAINING_RECORD(lp, MAIN_CTXT, load);
  1804. lp->consistent = TRUE;
  1805. /* Increment the number of convergences. */
  1806. if (lp->send_msg.state == HST_NORMAL)
  1807. lp->num_convergences++;
  1808. /* Setup initial convergence state. */
  1809. lp->send_msg.state = HST_CVG;
  1810. lp->stable_map = 0;
  1811. lp->my_stable_ct = 0;
  1812. lp->all_stable_ct = 0;
  1813. lp->send_msg.master_id = (USHORT)(lp->my_host_id);
  1814. }
/*
 * Function: Load_msg_rcv
 * Description: Processes a heartbeat (ping) message received from another
 *              cluster host. Updates host/ping membership maps, detects
 *              duplicate host IDs, mismatched rule counts, inconsistent BDA
 *              teaming and mismatched port rule codes, merges the remote
 *              host's per-rule bin state via Bin_host_update, and drives the
 *              convergence state machine (HST_NORMAL/HST_CVG/HST_STABLE).
 * Parameters: lp - pointer to the load module context.
 *             phdr - pointer to the frame header (PMAIN_FRAME_HDR) of the
 *                    received heartbeat; used for version, dedicated IP.
 *             pmsg - ptr. to ping message payload from the remote host.
 * Returns: BOOLEAN - TRUE if this host is (still) converging after
 *          processing the message; FALSE if in normal operation or if the
 *          message was ignored (module inactive or bad remote host ID).
 * Notes: Takes the load lock for the duration of the state update.
 */
BOOLEAN Load_msg_rcv(
    PLOAD_CTXT      lp,
    PVOID           phdr,
    PPING_MSG       pmsg)       /* ptr. to ping message */
{
    ULONG           i;
    BOOLEAN         consistent;
    ULONG           my_host;
    ULONG           rem_host;
    ULONG           saved_map;  /* saved host map */
    PPING_MSG       sendp;      /* ptr. to my send message */
    IRQLEVEL        irql;
    WCHAR           me[20];
    WCHAR           them[20];
    ULONG           map;
    PMAIN_CTXT      ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
    PMAIN_FRAME_HDR ping_hdrp = (PMAIN_FRAME_HDR)phdr;

    /* Used for tracking convergence and event logging. */
    BOOLEAN bInconsistentMaster = FALSE;
    BOOLEAN bInconsistentTeaming = FALSE;
    BOOLEAN bInconsistentPortRules = FALSE;

    UNIV_ASSERT(lp->code == CVY_LOADCODE);

    TRACE_HB("%!FUNC! Recv HB from host %d", (ULONG) pmsg->host_id + 1);

    /* Ignore heartbeats while the module is stopped. */
    if (!(lp->active))
    {
        return FALSE;
    }

    my_host  = lp->my_host_id;
    rem_host = (ULONG) pmsg->host_id;

    Univ_ulong_to_str (my_host+1, me, 10);
    Univ_ulong_to_str (rem_host+1, them, 10);

    sendp = &(lp->send_msg);

    /* Reject heartbeats claiming an out-of-range host ID. */
    if (rem_host >= CVY_MAX_HOSTS)
    {
        return FALSE;
    }

    LOCK_ENTER(&(lp->lock), &irql);

    /* If this heartbeat is from a win2k host, add it to the legacy host map. */
    if (ping_hdrp->version < CVY_VERSION_FULL)
        lp->legacy_hosts |= (1 << rem_host);

    /* filter out packets broadcast by this host */
    if(rem_host == my_host)
    {
        /* if this packet was really from another host, we have duplicate host ids
           (hcode is a per-host instance tag, so a mismatch means another machine
           is using our host ID) */
        if (sendp->hcode != pmsg->hcode)
        {
            if (!(lp->dup_hosts))
            {
                UNIV_PRINT_CRIT(("Load_msg_rcv: Duplicate host ids detected."));
                TRACE_CRIT("%!FUNC! Duplicate host ids detected.");
                LOG_MSG(MSG_ERROR_HOST_ID, me);
                lp->dup_hosts = TRUE;
            }

            /* Tracking convergence - Starting convergence because duplicate host IDs were detected in the cluster. */
            if (sendp->state == HST_NORMAL) {
                LOG_MSGS(MSG_INFO_CONVERGING_DUPLICATE_HOST_ID, me, them);
                TRACE_CONVERGENCE("%!FUNC! Initiating convergence on host %d. Reason: Host %d is configured with the same host ID.", my_host+1, rem_host+1);

                // If enabled, fire wmi event indicating start of convergence
                if (NlbWmiEvents[ConvergingEvent].Enable)
                {
                    WCHAR wsDip[CVY_MAX_DED_IP_ADDR + 1];

                    Univ_ip_addr_ulong_to_str (ping_hdrp->ded_ip_addr, wsDip);

                    NlbWmi_Fire_ConvergingEvent(ctxtp,
                                                NLB_EVENT_CONVERGING_DUPLICATE_HOST_ID,
                                                wsDip,
                                                rem_host+1);
                }
                else
                {
                    TRACE_VERB("%!FUNC! NOT Generating NLB_EVENT_CONVERGING_DUPLICATE_HOST_ID 'cos ConvergingEvent generation disabled");
                }
            }

            /* Tracking convergence - Starting convergence. */
            Load_convergence_start(lp);
        }

        /* just update ping and host maps for us */

        lp->ping_map |= (1 << my_host);
        lp->host_map |= (1 << my_host);

        LOCK_EXIT(&(lp->lock), irql);

        return (sendp->state != HST_NORMAL);
    }

    /* A differing rule count makes the remote host's per-rule arrays
       incomparable with ours, so converge without merging bin state. */
    if (sendp->nrules != pmsg->nrules)
    {
        if (!(lp->bad_num_rules))
        {
            UNIV_PRINT_CRIT(("Load_msg_rcv: Host %d: Hosts have diff # rules.", my_host));
            TRACE_CRIT("%!FUNC! Host %d: Hosts have diff # rules.", my_host);
            LOG_MSG2(MSG_ERROR_RULES_MISMATCH, them, sendp->nrules, pmsg->nrules);
            lp->bad_num_rules = TRUE;
        }

        /* Tracking convergence - Starting convergence because the number of port rules on this host and the remote host do not match. */
        if (sendp->state == HST_NORMAL) {
            LOG_MSGS(MSG_INFO_CONVERGING_NUM_RULES, me, them);
            TRACE_CONVERGENCE("%!FUNC! Initiating convergence on host %d. Reason: Host %d is configured with a conflicting number of port rules.", my_host+1, rem_host+1);

            // If enabled, fire wmi event indicating start of convergence
            if (NlbWmiEvents[ConvergingEvent].Enable)
            {
                WCHAR wsDip[CVY_MAX_DED_IP_ADDR + 1];

                Univ_ip_addr_ulong_to_str (ping_hdrp->ded_ip_addr, wsDip);

                NlbWmi_Fire_ConvergingEvent(ctxtp,
                                            NLB_EVENT_CONVERGING_NUM_RULES,
                                            wsDip,
                                            rem_host+1);
            }
            else
            {
                TRACE_VERB("%!FUNC! NOT Generating NLB_EVENT_CONVERGING_NUM_RULES 'cos ConvergingEvent generation disabled");
            }
        }

        /* Tracking convergence - Starting convergence. */
        Load_convergence_start(lp);

        /* just update ping and host maps for remote host (bbain 2/17/99) */

        lp->ping_map |= (1 << rem_host);
        lp->host_map |= (1 << rem_host);

        LOCK_EXIT(&(lp->lock), irql);

        return (sendp->state != HST_NORMAL);
    }

    /* update mastership and see if consistent; the lowest host ID seen is
       elected as the default (master) host */

    if (rem_host < sendp->master_id)
        sendp->master_id = (USHORT)rem_host;

    consistent = sendp->master_id == pmsg->master_id;   /* 1.03 */

    /* For the purposes of logging the reason for convergence, note this inconsistency. */
    if (!consistent) bInconsistentMaster = TRUE;

    /* update ping and host maps to include remote host */

    lp->ping_map |= (1 << rem_host);

    saved_map = lp->host_map;
    lp->host_map |= (1 << rem_host);

    /* handle host convergence */

    if (sendp->state != HST_NORMAL)
    {
        /* if master, update stable map for remote host */

        if (sendp->master_id == my_host)
        {
            if (pmsg->state == HST_STABLE)
            {
                lp->stable_map |= (1 << rem_host);
            }
            else
            {
                /* Remote host is not stable: clear its bit and restart the
                   all-stable countdown. */
                lp->stable_map &= ~(1 << rem_host);
                lp->all_stable_ct = 0;
            }
        }

        /* otherwise, update state if have global stable convergence and the current
           master has signalled completion by returning to the normal state; note
           that we must do this prior to updating port group states */

        else if (rem_host == sendp->master_id && pmsg->state == HST_NORMAL)
        {
            if (sendp->state == HST_STABLE)
            {
                sendp->state = HST_NORMAL;

                /* Note the time of the last completed convergence. */
                lp->last_convergence = lp->clock_sec;

                /* Notify our BDA team that this cluster is consistently configured.
                   If we are not part of a BDA team, this call is essentially a no-op. */
                Load_teaming_consistency_notify(&ctxtp->bda_teaming, TRUE);

                /* Reset the bad teaming configuration detected flag if we are converged. */
                lp->bad_team_config = FALSE;

                /* Convergence succeeded: clear all one-shot error latches. */
                lp->dup_hosts       = FALSE;
                lp->dup_sspri       = FALSE;
                lp->bad_map         = FALSE;
                lp->overlap_maps    = FALSE;
                lp->err_rcving_bins = FALSE;
                lp->err_orphans     = FALSE;
                lp->bad_num_rules   = FALSE;

                lp->pkt_count = 0;  /* v1.32B */

                /* Commit the converged bin ownership for every port rule. */
                for (i=0; i<sendp->nrules; i++)
                {
                    PBIN_STATE bp;

                    bp = &(lp->pg_state[i]);
                    bp->compatible = TRUE;  /* 1.03 */

                    Bin_converge_commit(lp, bp, my_host);

                    UNIV_PRINT_VERB(("Load_msg_rcv: Host %d pg %d: new cur map %x idle %x all %x",
                                     my_host, i, bp->cur_map[my_host], bp->idle_bins,
                                     bp->all_idle_map));
                    TRACE_CONVERGENCE("%!FUNC! Host %d pg %d: new cur map 0x%x idle 0x%x all 0x%x",
                                      my_host, i, (ULONG)bp->cur_map[my_host], (ULONG)bp->idle_bins,
                                      (ULONG)bp->all_idle_map);
                }

                UNIV_PRINT_VERB(("Load_msg_rcv: Host %d: converged as slave", my_host));
                TRACE_VERB("%!FUNC! Host %d: converged as slave", my_host);

                /* log convergence completion if host map changed (bbain RTM RC1 6/23/99) */
                /* Ignoring return value is OK since the return values are all non-errors */
                Load_hosts_query (lp, TRUE, & map);

                lp->last_hmap = lp->host_map;

                if (lp->legacy_hosts) {
                    /* If a Win2k or NT4.0 host is attempting to join the cluster, warn the user that there are potential
                       limitations of mixed clusters, such as no virtual cluster support, no IGMP, no BDA, no VPN session
                       support and others.  For some of these, the cluster will not be allowed to converge, while for some
                       it will, so we'll just warn the user that they should check the documentation for limitations. */
                    UNIV_PRINT_INFO(("Load_msg_rcv: NT4.0/Win2k host(s) detected: Be aware of the limitations of operating a mixed cluster."));
                    TRACE_INFO("%!FUNC! NT4.0/Win2k host(s) detected: Be aware of the limitations of operating a mixed cluster.");
                    LOG_MSG(MSG_WARN_MIXED_CLUSTER, MSG_NONE);
                }
            }
            else
            {
                /* Tracking convergence - Starting convergence because the DEFAULT host prematurely ended convergence.  In this case, we
                   are guaranteed to already be in the HST_CVG state, and because this message can be misleading in some circumstances,
                   we do not log an event.  For instance, due to timing issues, when a host joins a cluster he can receive a HST_NORMAL
                   heartbeat from the DEFAULT host while it is still in the HST_CVG state simply because that heartbeat left the DEFAULT
                   host before it received our first heartbeat, which initiated convergence. */
                TRACE_CONVERGENCE("%!FUNC! Initiating convergence on host %d. Reason: Host %d, the DEFAULT host, prematurely terminated convergence.", my_host+1, rem_host+1);

                /* Tracking convergence - Starting convergence. */
                Load_convergence_start(lp);
            }
        }
    }

    /* Compare the teaming configuration of this host with the remote host.  If the
       two are inconsitent and we are part of a team, we will initiate convergence. */
    if (!Load_teaming_consistency_check(lp->bad_team_config, &ctxtp->bda_teaming, sendp->teaming, pmsg->teaming, ping_hdrp->version)) {
        /* Only log an event if the teaming configuration was, but is now not, consistent. */
        if (!lp->bad_team_config) {
            /* Note that we saw this. */
            lp->bad_team_config = TRUE;

            /* Log the event. */
            LOG_MSG(MSG_ERROR_BDA_BAD_TEAM_CONFIG, them);
        }

        /* Notify the team that this cluster is NOT consistently configured. */
        Load_teaming_consistency_notify(&ctxtp->bda_teaming, FALSE);

        /* Mark the heartbeats inconsistent to force and retain convergence. */
        consistent = FALSE;

        /* For the purposes of logging the reason for convergence, note this inconsistency. */
        bInconsistentTeaming = TRUE;
    }

    /* update port group state */

    for (i=0; i<sendp->nrules; i++)
    {
        BOOLEAN ret;
        PBIN_STATE bp;

        bp = &lp->pg_state[i];

        /* if rule codes don't match, print message and handle incompatibility (1.03: note
           that we previously marked rule invalid, which would stop processing) */

        if (sendp->rcode[i] != pmsg->rcode[i])
        {
            /* 1.03: if rule was peviously compatible, print message */

            if (bp->compatible)
            {
                PCVY_RULE rp;

                UNIV_PRINT_CRIT(("Load_msg_rcv: Host %d pg %d: Rule codes do not match.", lp->my_host_id, i));
                TRACE_CRIT("%!FUNC! Host %d pg %d: Rule codes do not match.", lp->my_host_id, i);

                /* bbain 8/27/99 */
                LOG_MSG2(MSG_ERROR_RULES_MISMATCH, them, sendp->rcode[i], pmsg->rcode[i]);

                /* Get the port rule information for this rule. */
                rp = &lp->params->port_rules[i];

                /* Check to see if this is an issue with a win2k host in a cluster utilizing virtual clusters. */
                if ((rp->virtual_ip_addr != CVY_ALL_VIP_NUMERIC_VALUE) && ((sendp->rcode[i] ^ ~rp->virtual_ip_addr) == pmsg->rcode[i])) {
                    UNIV_PRINT_CRIT(("Load_msg_rcv: ** A Windows 2000 or NT4 host MAY be participating in a cluster utilizing virtual cluster support."));
                    TRACE_CRIT("%!FUNC! ** A Windows 2000 or NT4 host MAY be participating in a cluster utilizing virtual cluster support.");
                    LOG_MSG(MSG_WARN_VIRTUAL_CLUSTERS, them);
                }

                bp->compatible = FALSE;
            }

            /* 1.03: mark rule inconsistent to force and continue convergence */

            consistent = FALSE;

            /* For the purposes of logging the reason for convergence, note this inconsistency. */
            bInconsistentPortRules = TRUE;

            /* don't update bin state */

            continue;
        }

        /* Merge the remote host's bin state for this rule into ours. */
        ret = Bin_host_update(lp, bp, my_host, (BOOLEAN)(sendp->state != HST_NORMAL),
                              (BOOLEAN)(pmsg->state != HST_NORMAL),
                              rem_host, pmsg->cur_map[i], pmsg->new_map[i],
                              pmsg->idle_map[i], pmsg->rdy_bins[i],
                              pmsg->pkt_count, pmsg->load_amt[i]);

        if (!ret)
            consistent = FALSE;
    }

    /* update our consistency state */

    lp->consistent = consistent;

    /* if we are in normal operation and we discover a new host or a host goes into
       convergence or we discover an inconsistency, go into convergence */

    if (sendp->state == HST_NORMAL)
    {
        if (lp->host_map != saved_map || pmsg->state == HST_CVG || !consistent)
        {
            ConvergingEventId Cause = NLB_EVENT_CONVERGING_UNKNOWN;

            /* If a host has joined the cluster, or if inconsistent teaming configuration or port
               rules were detected, then we need to log an event.  However, we segregate the
               inconsistent master host flag because it is set by the initiating host in MANY
               occasions, so we want to log the most specific reason(s) for convergence if
               possible and only report the inconsistent master detection only if nothing more
               specific can be deduced. */
            if (lp->host_map != saved_map || bInconsistentTeaming || bInconsistentPortRules) {
                /* If the host maps are different, then we know that the host from which we received
                   this packet is joining the cluster because the ONLY operation on the host map in
                   this function is to ADD a remote host to our map.  Otherwise, if the map has not
                   changed, then an inconsistent configuration got us into the branch. */
                if (lp->host_map != saved_map) {
                    /* Tracking convergence - Starting convergence because another host is joining the cluster. */
                    LOG_MSGS(MSG_INFO_CONVERGING_NEW_MEMBER, me, them);
                    TRACE_CONVERGENCE("%!FUNC! Initiating convergence on host %d. Reason: Host %d is joining the cluster.", my_host+1, rem_host+1);
                    Cause = NLB_EVENT_CONVERGING_NEW_MEMBER;
                } else if (bInconsistentTeaming || bInconsistentPortRules) {
                    /* Tracking convergence - Starting convergence because inconsistent configuration was detected. */
                    LOG_MSGS(MSG_INFO_CONVERGING_BAD_CONFIG, me, them);
                    TRACE_CONVERGENCE("%!FUNC! Initiating convergence on host %d. Reason: Host %d has conflicting configuration.", my_host+1, rem_host+1);
                    Cause = NLB_EVENT_CONVERGING_BAD_CONFIG;
                }

            /* If we have nothing better to report, report convergence for an unspecific reason. */
            } else if (bInconsistentMaster || pmsg->state == HST_CVG) {
                /* Tracking convergence - Starting convergence for unknown reasons. */
                LOG_MSGS(MSG_INFO_CONVERGING_UNKNOWN, me, them);
                TRACE_CONVERGENCE("%!FUNC! Initiating convergence on host %d. Reason: Host %d is converging for an unknown reason.", my_host+1, rem_host+1);
            }

            // If enabled, fire wmi event indicating start of convergence
            if (NlbWmiEvents[ConvergingEvent].Enable)
            {
                WCHAR wsDip[CVY_MAX_DED_IP_ADDR + 1];

                Univ_ip_addr_ulong_to_str (ping_hdrp->ded_ip_addr, wsDip);

                NlbWmi_Fire_ConvergingEvent(ctxtp,
                                            Cause,
                                            wsDip,
                                            rem_host+1);
            }
            else
            {
                TRACE_VERB("%!FUNC! NOT Generating ConvergingEvent(New Member/Bad Config/Unknown) 'cos ConvergingEvent generation disabled");
            }

            /* Tracking convergence - Starting convergence. */
            Load_convergence_start(lp);
        }
    }

    /* otherwise, if we are in convergence and we see an inconsistency, just restart
       our local convergence */

    else
    {
        /* update our consistency state; if we didn't see consistent information,
           restart this host's convergence */

        if (!consistent)
        {
            /* Tracking convergence - Starting convergence because inconsistent configuration was detected.
               This keeps hosts in a state of convergence when hosts are inconsistently configured.  However,
               since the cluster is already in a state of convergece (HST_CVG or HST_STABLE), don't log an
               event, which may confuse a user. */
            TRACE_CONVERGENCE("%!FUNC! Initiating convergence on host %d. Reason: Host %d has conflicting configuration.", my_host+1, rem_host+1);

            /* Tracking convergence - Starting convergence. */
            sendp->state = HST_CVG;
            lp->my_stable_ct = 0;
            lp->stable_map &= ~(1 << my_host);
            lp->all_stable_ct = 0;
        }
    }

    LOCK_EXIT(&(lp->lock), irql);

    return (sendp->state != HST_NORMAL);
} /* end Load_msg_rcv */
  2161. PPING_MSG Load_snd_msg_get(
  2162. PLOAD_CTXT lp)
  2163. {
  2164. return &(lp->send_msg);
  2165. } /* end Load_snd_msg_get */
/*
 * Function: Load_age_descriptors
 * Description: This function searches a list of connection descriptors and
 *              removes those whose timeouts have expired.  The queues are
 *              sorted timeout queues, so it is only ever necessary to look
 *              at the head of the queue to find expired descriptors.  This
 *              function loops until all expired descriptors are removed.
 * Parameters: lp - a pointer to the load module.
 *             eqp - pointer to the expired descriptor queue to service.
 * Returns: Nothing.
 * Author: shouse, 9.9.01
 * Notes: NOTE(review): presumably called with the load module lock held
 *        (other descriptor-queue manipulation in this module occurs under
 *        lp->lock) - confirm against callers.
 */
void Load_age_descriptors (PLOAD_CTXT lp, QUEUE * eqp)
{
    PCONN_ENTRY ep;              /* Pointer to connection entry. */
    PBIN_STATE  bp;              /* Pointer to port rule state. */
    LINK *      linkp;           /* Pointer to the queue link. */
    BOOLEAN     err_bin = FALSE; /* Bin ID error detected. */
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD(lp, MAIN_CTXT, load);

    /* Get a pointer to (but do not dequeue) the head of the queue. */
    linkp = (LINK *)Queue_front(eqp);

    /* As long as there are descriptors to check, keep looking - when
       we find the first descriptor that is NOT ready to be dequeued,
       we stop looking and break out of the loop. */
    while (linkp != NULL) {
        /* Get a pointer to the descriptor (linkp is a pointer to
           the LIST_ENTRY in the descriptor, not the descriptor). */
        ep = STRUCT_PTR(linkp, CONN_ENTRY, rlink);

        UNIV_ASSERT(ep->code == CVY_ENTRCODE);

        /* Do some sanity checking on the bin number.  Only log the first
           occurrence, so a corrupted queue cannot flood the event log. */
        if (ep->bin >= CVY_MAXBINS) {
            if (!err_bin) {
                TRACE_CRIT("%!FUNC! Bad bin number");
                LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, ep->bin, CVY_MAXBINS);
                err_bin = TRUE;
            }
        }

#if defined (TRACE_DSCR)
        DbgPrint("Load_age_descriptors: Descriptor %p: clock=%u, timeout=%u", ep, lp->clock_sec, ep->timeout);
#endif

        /* If the current clock time is greater than or equal to the
           scheduled timeout for this descriptor, then pull it off
           and recycle it. */
        if (lp->clock_sec >= ep->timeout) {
#if defined (TRACE_DSCR)
            DbgPrint("Load_age_descriptors: Removing descriptor %p", ep);
#endif

            /* Lookup the port rule, so we can update the port rule info. */
            bp = Load_pg_lookup(lp, ep->svr_ipaddr, ep->svr_port, IS_TCP_PKT(ep->protocol));

            /* Clear the descriptor. */
            CVY_CONN_CLEAR(ep);

            /* Release the descriptor.  Do NOT touch ep after this call -
               it may have been returned to the block pool. */
            Load_put_dscr(lp, bp, ep);

        /* Break if this descriptor was not ready to expire yet.  The queue
           is sorted by timeout, so nothing behind it can be expired either. */
        } else break;

        /* Grab the next descriptor in the queue (the old head is gone). */
        linkp = (LINK *)Queue_front(eqp);
    }
}
/*
 * Function: Load_timeout
 * Description: Periodic heartbeat timer handler for the load module.  Ages
 *              connection descriptors, detects dead hosts via missed ping
 *              counts, drives the convergence state machine (HST_CVG ->
 *              HST_STABLE -> HST_NORMAL), and refreshes the outgoing
 *              heartbeat message with the current per-port-rule state.
 * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
 *             new_timeout - optional out: the next timer period (halved
 *                           while converging); may be NULL.
 *             pnconn - optional out: the current number of active
 *                      connections on this load module; may be NULL.
 * Returns: BOOLEAN - TRUE if the host is still converging (state is not
 *          HST_NORMAL); FALSE otherwise (including when inactive).
 */
BOOLEAN Load_timeout(
    PLOAD_CTXT      lp,
    PULONG          new_timeout,
    PULONG          pnconn)

/*
  Note: we only update ping message in this function since we know that upper level code
  sends out ping messages after calling this routine.  We cannot be sure that Load_msg_rcv
  is sequentialized with sending a message, (1.03)

  Upper level code locks this routine wrt Load_msg_rcv, Load_packet_check, and
  Load_conn_advise. (1.03)
*/

{
    ULONG       missed_pings;   /* map of hosts whose pings were missed this period */
    ULONG       my_host;
    ULONG       i;
    PPING_MSG   sendp;          /* ptr. to my send message */
    IRQLEVEL    irql;
    ULONG       map;            /* returned host map from query */
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
    BOOLEAN     fRet = FALSE;

    UNIV_ASSERT(lp->code == CVY_LOADCODE);

    LOCK_ENTER(&(lp->lock), &irql);

    /* If a dirty-connection cleanup is pending, advance its timer and run
       the cleanup once the cleanup timeout is reached. */
    if ((lp->cln_waiting) && (lp->cur_time < lp->cln_timeout))
    {
        lp->cur_time += lp->cur_timeout;
        if (lp->cur_time >= lp->cln_timeout)
        {
            TRACE_INFO("%!FUNC! Cleaning out dirty connection descriptors");
            Load_conn_cleanup(lp);
        }
    }

    /* Update the internal clock.  We add the time since the last timeout
       (in ms) to our msec count.  We then add any whole number of seconds
       that have accumulated in msec to the sec count.  The remainder is
       left in msec to accumulate. */
    lp->clock_msec += lp->cur_timeout;
    lp->clock_sec += (lp->clock_msec / 1000);
    lp->clock_msec = (lp->clock_msec % 1000);

    /* Age all connection descriptors. */
    Load_age_descriptors(lp, &(lp->tcp_expiredq));
    Load_age_descriptors(lp, &(lp->ipsec_expiredq));

    /* Return if not active: report the default timer period and the
       connection count, but do no heartbeat processing. */
    if (!(lp->active))
    {
        if (new_timeout != NULL)
            * new_timeout = lp->cur_timeout = lp->def_timeout;

        if (pnconn != NULL)     /* v2.1 */
            * pnconn = lp->nconn;

        LOCK_EXIT(&(lp->lock), irql);

        return FALSE;
    }

    my_host = lp->my_host_id;
    sendp   = &(lp->send_msg);

    /* compute which hosts missed pings and reset ping map */
    missed_pings = lp->host_map & (~lp->ping_map);

#ifdef NO_CLEANUP
    lp->ping_map = 1 << my_host;
#else
    lp->ping_map = 0;
#endif

    /* check whether any host is dead, including ourselves */
    for (i=0; i<CVY_MAX_HOSTS; i++)
    {
        /* if we have a missed ping for this host, increment count */
        if ((missed_pings & 0x1) == 1)
        {
            lp->nmissed_pings[i]++;

            /* if we missed too many pings, declare host dead and force convergence */
            if (lp->nmissed_pings[i] == lp->min_missed_pings)
            {
                ULONG       j;
                BOOLEAN     ret;
                WCHAR       me[20];
                WCHAR       them[20];

                if (i == my_host)
                {
                    UNIV_PRINT_VERB(("Load_timeout: Host %d: Missed too many pings; this host declared offline", i));
                    TRACE_VERB("%!FUNC! Host %d: Missed too many pings; this host declared offline", i);

                    /* reset our packet count since we are likely not to be receiving
                       packets from others now; this will make us less favored to
                       handle duplicate bins later (v1.32B) */
                    lp->pkt_count = 0;
                }

                lp->host_map &= ~(1<<i);

                /* Reset the legacy host bit if the host has gone off-line. */
                lp->legacy_hosts &= ~(1<<i);

                for (j=0; j<sendp->nrules; j++)
                {
                    PBIN_STATE      bp;

                    bp = &(lp->pg_state[j]);
                    UNIV_ASSERT(bp->code == CVY_BINCODE);   /* (bbain 8/19/99) */

                    if (i == my_host)
                    {
                        ULONG       k;

                        /* cleanup connections and restore maps to clean state */
                        Load_conn_kill(lp, bp);

                        bp->targ_map     = 0;
                        bp->all_idle_map = BIN_ALL_ONES;
                        bp->cmap         = 0;       /* v2.1 */
                        bp->compatible   = TRUE;    /* v1.03 */

                        for (k=0; k<CVY_MAX_HOSTS; k++)
                        {
                            bp->new_map[k]  = 0;
                            bp->cur_map[k]  = 0;
                            bp->chk_map[k]  = 0;
                            bp->idle_map[k] = BIN_ALL_ONES;
                            if (k != i)
                                bp->load_amt[k] = 0;
                        }

                        bp->snd_bins =
                        bp->rcv_bins =
                        bp->rdy_bins = 0;
                        bp->idle_bins = BIN_ALL_ONES;

                        /* Re-initialize the performance counters. */
                        bp->packets_accepted = 0;
                        bp->packets_dropped  = 0;
                        bp->bytes_accepted   = 0;
                        bp->bytes_dropped    = 0;

                        /* compute initial new map for convergence as only host in cluster
                           (v 1.3.2B) */
                        ret = Bin_converge(lp, bp, lp->my_host_id);
                        if (!ret)
                        {
                            UNIV_PRINT_CRIT(("Load_timeout: Initial convergence inconsistent"));
                            TRACE_CRIT("%!FUNC! Initial convergence inconsistent");
                            LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
                        }
                    }
                    else
                    {
                        /* Remove the dead host's contribution from this port
                           rule's maps: it owns no bins and is fully idle. */
                        ret = Bin_host_update(lp, bp, my_host, TRUE, TRUE,
                                              i, 0, 0, BIN_ALL_ONES, 0, 0, 0);
                    }
                }

                lp->nmissed_pings[i] = 0;

                /* If a host has dropped out of the cluster, then log an event.  However, we don't
                   log an event when we drop out because the only way for us to drop out of our own
                   cluster is if we are stopping anyway, or if we have lost network connectivity.
                   Logging such events may be misleading, so we won't bother. */
                if (i != my_host) {
                    Univ_ulong_to_str (my_host+1, me, 10);
                    Univ_ulong_to_str (i+1, them, 10);

                    /* Tracking convergence - Starting convergence because a member has fallen out of the cluster. */
                    LOG_MSGS(MSG_INFO_CONVERGING_MEMBER_LOST, me, them);
                    TRACE_CONVERGENCE("%!FUNC! Initiating convergence on host %d.  Reason: Host %d is leaving the cluster.", my_host+1, i+1);

                    // If enabled, fire wmi event indicating start of convergence
                    if (NlbWmiEvents[ConvergingEvent].Enable)
                    {
                        NlbWmi_Fire_ConvergingEvent(ctxtp,
                                                    NLB_EVENT_CONVERGING_MEMBER_LOST,
                                                    NLB_EVENT_NO_DIP_STRING,
                                                    i+1);
                    }
                    else
                    {
                        TRACE_VERB("%!FUNC! NOT Generating NLB_EVENT_CONVERGING_MEMBER_LOST 'cos ConvergingEvent generation disabled");
                    }
                }

                /* Tracking convergence - Starting convergence. */
                Load_convergence_start(lp);
            }
        }
        /* otherwise reset missed ping count */
        else
            lp->nmissed_pings[i] = 0;

        missed_pings >>= 1;
    }

    /* handle convergence */
    if (sendp->state != HST_NORMAL)
    {
        /* check whether we have been consistent and have received our own pings
           for a sufficient period to move to a stable state and announce it to
           other hosts */
        if (sendp->state == HST_CVG)
        {
            if (lp->consistent && ((lp->host_map & (1 << my_host)) != 0))
            {
                lp->my_stable_ct++;
                if (lp->my_stable_ct >= lp->min_stable_ct)
                {
                    sendp->state = HST_STABLE;
                    lp->stable_map |= (1 << my_host);
                }
            }
            else
                lp->my_stable_ct = lp->all_stable_ct = 0;   /* wlb B3RC1 */
        }
        /* otherwise, see if we are the master and everybody's been stable for
           a sufficient period for us to terminate convergence */
        else if (sendp->state == HST_STABLE &&
                 my_host == sendp->master_id &&
                 lp->stable_map == lp->host_map)
        {
            lp->all_stable_ct++;
            if (lp->all_stable_ct >= lp->min_stable_ct)
            {
                sendp->state = HST_NORMAL;

                /* Note the time of the last completed convergence. */
                lp->last_convergence = lp->clock_sec;

                /* Notify our BDA team that this cluster is consistently configured.
                   If we are not part of BDA team, this call is essentially a no-op. */
                Load_teaming_consistency_notify(&ctxtp->bda_teaming, TRUE);

                /* Reset the bad teaming configuration detected flag if we are converged. */
                lp->bad_team_config = FALSE;

                /* Clear all of the sticky error flags now that the cluster
                   has successfully converged. */
                lp->dup_hosts       = FALSE;
                lp->dup_sspri       = FALSE;
                lp->bad_map         = FALSE;
                lp->overlap_maps    = FALSE;
                lp->err_rcving_bins = FALSE;
                lp->err_orphans     = FALSE;
                lp->bad_num_rules   = FALSE;
                lp->pkt_count       = 0;    /* v1.32B */

                for (i=0; i<sendp->nrules; i++)
                {
                    PBIN_STATE      bp;
                    BOOLEAN         ret;

                    bp = &(lp->pg_state[i]);
                    bp->compatible = TRUE;  /* 1.03 */

                    /* explicitly converge to new map in case we're the only host (v2.06) */
                    ret = Bin_converge(lp, bp, lp->my_host_id);
                    if (!ret)
                    {
                        UNIV_PRINT_CRIT(("Load_timeout: Final convergence inconsistent"));
                        TRACE_CRIT("%!FUNC! Final convergence inconsistent");
                        LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
                    }

                    Bin_converge_commit(lp, bp, my_host);

                    UNIV_PRINT_VERB(("Load_timeout: Host %d pg %d: new cur map %x idle %x all %x",
                                     my_host, i, bp->cur_map[my_host], bp->idle_bins,
                                     bp->all_idle_map));
                }

                UNIV_PRINT_VERB(("Load_timeout: Host %d: converged as master", my_host));
                TRACE_CONVERGENCE("%!FUNC! Host %d: converged as master", my_host);

                /* log convergence completion if host map changed (bbain RTM RC1 6/23/99) */
                Load_hosts_query (lp, TRUE, & map);
                lp->last_hmap = lp->host_map;

                if (lp->legacy_hosts) {
                    /* If a Win2k or NT4.0 host is attempting to join the cluster, warn the user that there are potential
                       limitations of mixed clusters, such as no virtual cluster support, no IGMP, no BDA, no VPN session
                       support and others.  For some of these, the cluster will not be allowed to converge, while for some
                       it will, so we'll just warn the user that they should check the documentation for limitations. */
                    UNIV_PRINT_INFO(("Load_timeout: NT4.0/Win2k host(s) detected: Be aware of the limitations of operating a mixed cluster."));
                    TRACE_INFO("%!FUNC! NT4.0/Win2k host(s) detected: Be aware of the limitations of operating a mixed cluster.");
                    LOG_MSG(MSG_WARN_MIXED_CLUSTER, MSG_NONE);
                }
            }
        }
    }

    /* 1.03: update ping message */
    for (i=0; i<sendp->nrules; i++)
    {
        PBIN_STATE      bp;

        bp = &(lp->pg_state[i]);

        /* export current port group state to ping message */
        sendp->cur_map[i]  = bp->cmap;              /* v2.1 */
        sendp->new_map[i]  = bp->new_map[my_host];
        sendp->idle_map[i] = bp->idle_bins;
        sendp->rdy_bins[i] = bp->rdy_bins;
        sendp->load_amt[i] = bp->load_amt[my_host];

        // NOTE: The following line of code was removed when it was discovered that it
        // routinely produces a Wake On LAN pattern in the heartbeat that causes BroadCom
        // NICs to panic.  Although this is NOT an NLB issue, but rather a firmware issue
        // in BroadCom NICs, it was decided to remove the information from the heartbeat
        // to alleviate the problem for customers with BroadCom NICs upgrading to .NET.
        // This array is UNUSED by NLB, so there is no harm in not filling it in; it was
        // added a long time ago for debugging purposes as part of the now-defunct FIN-
        // counting fix that was part of Win2k SP1.
        //
        // For future reference, should we need to use this space in the heartbeat at some
        // future point in time, it appears that we will need to be careful to avoid potential
        // WOL patterns in our heartbeats where we can avoid it.  A WOL pattern is:
        //
        // 6 bytes of 0xFF, followed by 16 idential instances of a "MAC address" that can
        // appear ANYWHERE in ANY frame type, including our very own NLB heartbeats.  E.g.:
        //
        // FF FF FF FF FF FF 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06
        // 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06
        // 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06
        // 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06 01 02 03 04 05 06
        // 01 02 03 04 05 06
        //
        // The MAC address need not be valid, however.  In NLB heartbeats, the "MAC address"
        // in the mistaken WOL pattern is "00 00 00 00 00 00".  NLB routinely fills heartbeats
        // with FF and 00 bytes, but it seems that by "luck" no other place in the heartbeat
        // seems this vulnerable.  For instance, in the load_amt array, each entry has a
        // maximum value of 100 (decimal), so there is no possibility of generating the initial
        // 6 bytes of FF to start the WOL pattern.  All of the "map" arrays seem to be saved
        // by two strokes of fortune; (i) little endian and (ii) the bin distribution algorithm.
        //
        // (i) Since we don't use the 4 most significant bits of the ULONGLONGs used to store
        // each map, the most significant bit is NEVER FF.  Because Intel is little endian, the
        // most significant byte appears last.  For example:
        //
        // 0F FF FF FF FF FF FF FF appears in the packet as FF FF FF FF FF FF 0F
        //
        // This breaks the FF sequence in many scenarios.
        //
        // (ii) The way the bin distribution algorithm distributes buckets to hosts seems to
        // discourage other possibilities.  For instance, a current map of:
        //
        // 00 FF FF FF FF FF FF 00
        //
        // just isn't likely.  However, it IS STILL POSSIBLE!  So, it is important to note that:
        //
        // REMOVING THIS LINE OF CODE DOES NOT, IN ANY WAY, GUARANTEE THAT AN NLB HEARTBEAT
        // CANNOT STILL CONTAIN A VALID WAKE ON LAN PATTERN SOMEWHERE ELSE IN THE FRAME!!!

        // sendp->pg_rsvd1[i] = (ULONG)bp->all_idle_map;
    }

    sendp->pkt_count = lp->pkt_count;   /* 1.32B */

    /* Add configuration information for teaming at each timeout. */
    Load_teaming_code_create(&lp->send_msg.teaming, &ctxtp->bda_teaming);

    /* request fast timeout if converging */
    if (new_timeout != NULL)    /* 1.03 */
    {
        if (sendp->state != HST_NORMAL)
            * new_timeout = lp->cur_timeout = lp->def_timeout / 2;
        else
            * new_timeout = lp->cur_timeout = lp->def_timeout;
    }

    if (pnconn != NULL)     /* v2.1 */
        * pnconn = lp->nconn;

    fRet = (sendp->state != HST_NORMAL);

    LOCK_EXIT(&(lp->lock), irql);

    return fRet;
} /* end Load_timeout */
  2551. PBIN_STATE Load_pg_lookup(
  2552. PLOAD_CTXT lp,
  2553. ULONG svr_ipaddr,
  2554. ULONG svr_port,
  2555. BOOLEAN is_tcp)
  2556. {
  2557. PCVY_RULE rp; /* ptr. to rules array */
  2558. PBIN_STATE bp; /* ptr. to bin state */
  2559. ULONG i;
  2560. ULONG nurules; /* # user defined rules */
  2561. PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
  2562. UNIV_ASSERT(lp->code == CVY_LOADCODE); /* (bbain 8/19/99) */
  2563. rp = (* (lp->params)).port_rules;
  2564. nurules = (* (lp->params)).num_rules;
  2565. /* check for invalid port value (bbain RC1 6/14/99) */
  2566. UNIV_ASSERT(svr_port <= CVY_MAX_PORT);
  2567. /* find server port rule */
  2568. for (i=0; i<nurules; i++)
  2569. {
  2570. /* For virtual clusters: If the server IP address matches the VIP for the port rule,
  2571. or if the VIP for the port rule is "ALL VIPs", and if the port lies in the range
  2572. for this rule, and if the protocol matches, this is the rule. Notice that this
  2573. give priority to rules for specific VIPs over those for "ALL VIPs", which means
  2574. that this code RELIES on the port rules being sorted by VIP/port where the "ALL
  2575. VIP" ports rules are at the end of the port rule list. */
  2576. if ((svr_ipaddr == rp->virtual_ip_addr || CVY_ALL_VIP_NUMERIC_VALUE == rp->virtual_ip_addr) &&
  2577. (svr_port >= rp->start_port && svr_port <= rp->end_port) &&
  2578. ((is_tcp && rp->protocol != CVY_UDP) || (!is_tcp && rp->protocol != CVY_TCP)))
  2579. break;
  2580. else
  2581. rp++;
  2582. }
  2583. /* use default rule if port not found or rule is invalid */
  2584. bp = &(lp->pg_state[i]);
  2585. UNIV_ASSERT(bp->code == CVY_BINCODE); /* (bbain 8/19/99) */
  2586. return bp;
  2587. } /* end Load_pg_lookup */
  2588. /*
  2589. * Function: Load_find_dscr
  2590. * Description: This function takes a load pointer, hash value and connection
  2591. * parameters and searches all possible locations looking for a
  2592. * matching connection descriptor. If it finds ones, it returns
  2593. * a pointer to the descriptor (CONN_ENTRY); otherwise, it returns
  2594. * NULL to indicate that no matching descriptor was found.
  2595. * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
  2596. * index - the connection queue index for this packet
  2597. * svr_ipaddr - the server IP address in network byte order
  2598. * svr_port - the server port number in host byte order
  2599. * client_ipaddr - the client IP address in network byte order
  2600. * client_port - the client port number in host byte order
  2601. * protocol - the connection protocol
  2602. * Returns: PCONN_ENTRY - a pointer to the descriptor, or NULL if not found
  2603. * Author: shouse, 10.4.01
  2604. * Notes:
  2605. */
  2606. PCONN_ENTRY Load_find_dscr (
  2607. PLOAD_CTXT lp,
  2608. ULONG index,
  2609. ULONG svr_ipaddr,
  2610. ULONG svr_port,
  2611. ULONG client_ipaddr,
  2612. ULONG client_port,
  2613. USHORT protocol)
  2614. {
  2615. BOOLEAN match = FALSE; /* TRUE => we have a record of this connection. */
  2616. PBIN_STATE bp; /* Pointer to bin state. */
  2617. PCONN_ENTRY ep; /* Pointer to connection entry. */
  2618. PCONN_DESCR dp; /* Pointer to connection descriptor. */
  2619. QUEUE * qp; /* Pointer to connection queue. */
  2620. UNIV_ASSERT(lp->code == CVY_LOADCODE);
  2621. /* Get a pointer to the connection entry for this hash ID. */
  2622. ep = &(lp->hashed_conn[index]);
  2623. UNIV_ASSERT(ep->code == CVY_ENTRCODE);
  2624. /* Get a pointer to the conneciton queue. */
  2625. qp = &(lp->connq[index]);
  2626. /* Look in the hashed connection table first. */
  2627. if (CVY_CONN_MATCH(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
  2628. {
  2629. /* Note that we found a match for this tuple. */
  2630. match = TRUE;
  2631. }
  2632. else
  2633. {
  2634. /* Look through the descriptor queue. */
  2635. for (dp = (PCONN_DESCR)Queue_front(qp); dp != NULL; dp = (PCONN_DESCR)Queue_next(qp, &(dp->link)))
  2636. {
  2637. if (CVY_CONN_MATCH(&(dp->entry), svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
  2638. {
  2639. /* Note that we found a match for this tuple. */
  2640. match = TRUE;
  2641. UNIV_ASSERT (dp->code == CVY_DESCCODE);
  2642. /* Get a pointer to the connection entry. */
  2643. ep = &(dp->entry);
  2644. UNIV_ASSERT (ep->code == CVY_ENTRCODE);
  2645. break;
  2646. }
  2647. }
  2648. }
  2649. /* If we found a match, return it, otherwise return NULL. */
  2650. if (match)
  2651. return ep;
  2652. else
  2653. return NULL;
  2654. }
  2655. /*
  2656. * Function: Load_note_conn_up
  2657. * Description: This function adjusts the appropriate connection counters
  2658. * for an up-coming connection.
  2659. * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
  2660. * bp - a pointer to the port rule on which the connection was established
  2661. * bin - the bin to which the connection maps (Map % 60)
  2662. * Returns: Nothing.
  2663. * Author: shouse, 10.4.01
  2664. * Notes:
  2665. */
  2666. VOID Load_note_conn_up (PLOAD_CTXT lp, PBIN_STATE bp, ULONG bin)
  2667. {
  2668. /* Increment the number of connections. */
  2669. lp->nconn++;
  2670. bp->tconn++;
  2671. bp->nconn[bin]++;
  2672. /* Mark bin not idle if necessary. */
  2673. if (bp->nconn[bin] == 1) bp->idle_bins &= ~(((MAP_T) 1) << bin);
  2674. }
  2675. /*
  2676. * Function: Load_note_conn_down
  2677. * Description: This function adjusts the appropriate connection counters
  2678. * for an down-going connection.
  2679. * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
  2680. * bp - a pointer to the port rule on which the connection resides
  2681. * bin - the bin to which the connection maps (Map % 60)
  2682. * Returns: Nothing.
  2683. * Author: shouse, 10.4.01
  2684. * Notes:
  2685. */
  2686. VOID Load_note_conn_down (PLOAD_CTXT lp, PBIN_STATE bp, ULONG bin)
  2687. {
  2688. UNIV_ASSERT(bp->nconn[bin] > 0 && bp->tconn > 0 && lp->nconn > 0);
  2689. /* Update the number of connections on the entire load module. */
  2690. if (lp->nconn <= 0)
  2691. lp->nconn = 0;
  2692. else
  2693. lp->nconn--;
  2694. /* Update the number of connections on this bin and port rule. */
  2695. if (bp->nconn[bin] <= 0)
  2696. bp->nconn[bin] = 0;
  2697. else
  2698. bp->nconn[bin]--;
  2699. /* Update the total number of connections on this port rule. */
  2700. if (bp->tconn <= 0)
  2701. bp->tconn = 0;
  2702. else
  2703. bp->tconn--;
  2704. /* If this was the last connection on this bin, update the idle map. */
  2705. if (bp->nconn[bin] == 0) bp->idle_bins |= (((MAP_T) 1) << bin);
  2706. }
  2707. /*
  2708. * Function: Load_init_dscr
  2709. * Description: This function initializes a NEWLY ALLOCATED descriptor.
  2710. * It is only necessary to perform this initialization ONCE.
  2711. * As descriptors are freed for re-use, use Load_reset_dscr
  2712. * to "re-initialize" them.
  2713. * Parameters: lp - a pointer to the load context on which this descriptor lives
  2714. * ep - a pointer to a connection descriptor
  2715. * alloc - whether or not this descriptor was dynamically allocated
  2716. * Returns: Nothing.
  2717. * Author: shouse, 10.4.01
  2718. * Notes:
  2719. */
  2720. VOID Load_init_dscr (PLOAD_CTXT lp, PCONN_ENTRY ep, BOOLEAN alloc)
  2721. {
  2722. /* Set the "magic number". */
  2723. ep->code = CVY_ENTRCODE;
  2724. #if defined (NLB_TCP_NOTIFICATION)
  2725. /* Save a pointer to this load module. */
  2726. ep->load = lp;
  2727. #endif
  2728. /* Initialize the hashing results. */
  2729. ep->index = 0;
  2730. ep->bin = 0;
  2731. /* Re-set the flags register. */
  2732. ep->flags = 0;
  2733. /* Is this descriptor in the static hash array, or allocated? */
  2734. if (alloc)
  2735. ep->flags |= NLB_CONN_ENTRY_FLAGS_ALLOCATED;
  2736. /* Initialize some other descriptor state. */
  2737. ep->timeout = 0;
  2738. ep->ref_count = 0;
  2739. /* Clear the descriptor. */
  2740. CVY_CONN_CLEAR(ep);
  2741. /* Initilize the links. */
  2742. Link_init(&(ep->blink));
  2743. Link_init(&(ep->rlink));
  2744. #if defined (NLB_TCP_NOTIFICATION)
  2745. Link_init(&(ep->glink));
  2746. #endif
  2747. }
  2748. /*
  2749. * Function: Load_init_fsb
  2750. * Description: This function initializes a fixed-size block allocated from the
  2751. * fixed-size block pool.
  2752. * Parameters: lp - a pointer to the load context on which the descriptor lives
  2753. * dp - a pointer to a block (connection descriptor)
  2754. * Returns: Nothing.
  2755. * Author: shouse, 4.1.02
  2756. * Notes:
  2757. */
  2758. VOID Load_init_fsb (PLOAD_CTXT lp, PCONN_DESCR dp)
  2759. {
  2760. /* Set the "magic number". */
  2761. dp->code = CVY_DESCCODE;
  2762. /* Initialize the connection queue link. */
  2763. Link_init(&(dp->link));
  2764. /* Initialize the connection entry. */
  2765. Load_init_dscr(lp, &dp->entry, TRUE);
  2766. }
  2767. /*
  2768. * Function: Load_reset_dscr
  2769. * Description: This function resets a descriptor for re-use. This includes
  2770. * re-initializing the state, setting the bin and queueing the
  2771. * descriptor onto the recovery and port rule queues.
  2772. * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
  2773. * bp - a pointer to the port rule on which the connection is established
  2774. * ep - a pointer to the descriptor to be reset
  2775. * index - the connection queue index
  2776. * bin - the bin to which the connection maps
  2777. * references - the number of references to place on the descriptor initially
  2778. * Returns: Nothing.
  2779. * Author: shouse, 10.4.01
  2780. * Notes:
  2781. */
  2782. VOID Load_reset_dscr (PLOAD_CTXT lp, PBIN_STATE bp, PCONN_ENTRY ep, ULONG index, ULONG bin, SHORT references)
  2783. {
  2784. /* Reset some of the descriptor state to its defaults. */
  2785. ep->ref_count = references;
  2786. ep->timeout = 0;
  2787. /* Clear all descriptor flags except ALLOCATED. */
  2788. ep->flags &= NLB_CONN_ENTRY_FLAGS_ALLOCATED;
  2789. /* Store the hashing results in the descriptor. */
  2790. ep->index = (USHORT)index;
  2791. ep->bin = (UCHAR)bin;
  2792. /* Queue entry into the recovery queue. */
  2793. Queue_enq(&(lp->conn_rcvryq), &(ep->rlink));
  2794. /* Queue entry into port group queue. */
  2795. Queue_enq(&(bp->connq), &(ep->blink));
  2796. /* Update the connection counters, etc. */
  2797. Load_note_conn_up(lp, bp, bin);
  2798. }
/*
 * Function: Load_put_dscr
 * Description: This function completely releases a descriptor for later
 *              use.  This includes unlinking from all appropriate queues,
 *              decrementing appropriate counters and re-setting some
 *              descriptor state.  Callers of this function should call
 *              CVY_CONN_CLEAR to mark the descriptor as unused.
 * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
 *             bp - a pointer to the port rule on which the connection was active
 *             ep - a pointer to the connection descriptor to release
 * Returns: Nothing.
 * Author: shouse, 10.4.01
 * Notes: Callers MUST call CVY_CONN_CLEAR to mark the descriptor unused!
 *        Do NOT access ep after calling this function (it may have been freed)!
 */
VOID Load_put_dscr (PLOAD_CTXT lp, PBIN_STATE bp, PCONN_ENTRY ep)
{
    PCONN_DESCR dp;     /* Enclosing pool block, if the entry was allocated. */

    /* Unlink from the bin/dirty and recovery/timeout queues.  (Link_unlink
       is safe even if the descriptor is on neither queue.)
       NOTE(review): presumed safe on an unqueued link based on usage
       throughout this module - confirm against Link_unlink's contract. */
    Link_unlink(&(ep->rlink));
    Link_unlink(&(ep->blink));

    /* If the connection is NOT dirty, then we have to update
       the connection counts, etc.  If it is dirty then the
       relevant counters have already been reset. */
    if (!(ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY))
    {
        Load_note_conn_down(lp, bp, (ULONG)ep->bin);
    }
    else
    {
        /* If we're destroying a dirty connection, update the dirty counters. */
        lp->dirty_bin[ep->bin]--;
        lp->num_dirty--;

        /* If this was the last dirty connection, turn off the cleanup waiting flag. */
        if (lp->num_dirty == 0)
            lp->cln_waiting = FALSE;
    }

    /* If this is an allocated (and therefore queued) descriptor,
       there is some additional cleanup to do. */
    if (ep->flags & NLB_CONN_ENTRY_FLAGS_ALLOCATED)
    {
        /* Get a pointer to the parent structure (the pool block in which
           this entry is embedded). */
        dp = STRUCT_PTR(ep, CONN_DESCR, entry);

        UNIV_ASSERT(dp->code == CVY_DESCCODE);

        /* Unlink from the connection queue and put the descriptor back on the free
           queue.  We MUST do this before calling NdisFreeToBlockPool, as the pool
           implementation will stomp on link because we allow it to re-use that piece
           of our memory to link free blocks.  Since this operation may also result
           the memory being freed (actually, pages will NEVER be freed immediately,
           but don't tempt fate), do NOT touch the descriptor once we've freed it
           back to the pool.  CALLERS OF THIS FUNCTION SHOULD TAKE THE SAME PRECAUTION
           AND NOT TOUCH THE DESCRIPTOR AFTER CALLING THIS FUNCTION. */
        Link_unlink(&(dp->link));

        /* Free the descriptor back to the fixed-size block pool. */
        NdisFreeToBlockPool((PUCHAR)dp);

        /* Decrement the number of outstanding descriptors from the pool. */
        lp->num_dscr_out--;
    }
}
/*
 * Function: Load_get_dscr
 * Description: This function finds a descriptor to be used for a new connection
 *              by any available means; this includes an available free descriptor,
 *              allocating new descriptors if necessary, or as a last resort,
 *              cannibalizing an existing, in-use descriptor. If it succeeds, it
 *              returns a pointer to the descriptor; otherwise, it returns NULL to
 *              indicate the failure to locate an available descriptor. Callers of
 *              this function should call CVY_CONN_SET upon success to mark the
 *              descriptor as used and fill in the connection parameters.
 * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
 *             bp - a pointer to the port rule on which the connection is being established
 *             index - the connection queue index
 *             bin - the bin to which the connection belongs
 * Returns: PCONN_ENTRY - a pointer to the new descriptor, or NULL if failed
 * Author: shouse, 10.4.01
 * Notes: Callers of this function MUST call CVY_CONN_SET to mark the descriptor
 *        active and to set the connection parameters (IPs, ports, protocol).
 */
PCONN_ENTRY Load_get_dscr (PLOAD_CTXT lp, PBIN_STATE bp, ULONG index, ULONG bin)
{
    PCONN_DESCR dp = NULL;
    PCONN_ENTRY ep = NULL;
    QUEUE *     qp;
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD(lp, MAIN_CTXT, load);

    /* Get a pointer to the connection entry for this hash ID. */
    ep = &(lp->hashed_conn[index]);

    /* Get a pointer to the connection queue. */
    qp = &(lp->connq[index]);

    /* If the hash table entry is not available, set up and enqueue a new
       (dynamically allocated, or cannibalized) descriptor instead. */
    if (CVY_CONN_IN_USE(ep)) {
        /* Get a pointer to a free descriptor - allocation from the block pool is
           permitted only while the outstanding count is below the configured cap. */
        if ((lp->free_dscr_pool != NULL) && (lp->num_dscr_out < lp->max_dscr_out))
        {
            /* Allocate a descriptor from the fixed-size block pool. */
            dp = (PCONN_DESCR)NdisAllocateFromBlockPool(lp->free_dscr_pool);

            if (dp == NULL) {
                /* Allocation failed; log a message (only once - alloc_failed
                   suppresses repeats) and bail out. */
                if (!(lp->alloc_failed)) {
                    TRACE_CRIT("%!FUNC! Error allocating connection descriptors");
                    LOG_MSG(MSG_ERROR_MEMORY, MSG_NONE);
                    lp->alloc_failed = TRUE;
                }

                return NULL;
            }

            /* Initialize the fixed-size block (connection descriptor). */
            Load_init_fsb(lp, dp);

            UNIV_ASSERT(dp->code == CVY_DESCCODE);

            /* Increment the count of outstanding descriptors from the fixed-size block pool. */
            lp->num_dscr_out++;

            /* There was a free descriptor, so set up the connection entry pointer. */
            ep = &(dp->entry);

            UNIV_ASSERT(ep->code == CVY_ENTRCODE);
        }
#if defined (NLB_TCP_NOTIFICATION)
        /* If notification is turned ON, we do NOT cannibalize descriptors. */
        else if (!NLB_NOTIFICATIONS_ON())
#else
        else
#endif
        {
            /* If we have reached the allocation limit, start taking connection descriptors from
               the timeout or recovery queues since they are likely to be stale and very old. */
            PBIN_STATE rbp;
            LINK *     rlp;

            /* We were unable to allocate more connection descriptors and we will
               be forced to cannibalize a connection descriptor already in use. Warn
               the administrator (once) that they should consider allowing NLB to
               allocate more connection descriptors. */
            if (!(lp->alloc_inhibited)) {
                TRACE_CRIT("%!FUNC! All descriptors have been allocated and are in use");
                LOG_MSG(MSG_WARN_DESCRIPTORS, CVY_NAME_MAX_DSCR_ALLOCS);
                lp->alloc_inhibited = TRUE;
            }

            TRACE_INFO("%!FUNC! Attempting to take a connection descriptor from the TCP timeout queue");

            /* Dequeue a descriptor from the TCP timeout queue. Cannibalize this queue
               first because (i) it's the most likely to have an available descriptor,
               (ii) it should be the least disruptive because the connection has been
               terminated AND the timeout for TCP is very short. */
            rlp = (LINK *)Queue_deq(&(lp->tcp_expiredq));

            if (rlp == NULL) {
                TRACE_INFO("%!FUNC! Attempting to take a connection descriptor from the IPSec timeout queue");

                /* Dequeue a descriptor from the IPSec timeout queue. While it is
                   true that descriptors on this queue are theoretically closed,
                   since IPSec cannot be sure that no upper-level protocols still
                   have state at the time a Main Mode SA expires and NLB is notified,
                   these connections are non-trivially likely to regenerate, so it
                   is necessary to keep the state around for a long time (24 hours
                   by default). Therefore, we cannibalize this timeout queue last
                   as it is the most likely to be disruptive, aside from the recovery
                   queue. */
                rlp = (LINK *)Queue_deq(&(lp->ipsec_expiredq));

                if (rlp == NULL) {
                    TRACE_INFO("%!FUNC! Attempting to take a connection descriptor from the recovery queue");

                    /* Dequeue a descriptor from the recovery queue. Since these are
                       "live" connections, we take descriptors from this queue as a
                       last resort. */
                    rlp = (LINK *)Queue_deq(&(lp->conn_rcvryq));

                    /* No descriptors are available anywhere - this should NEVER happen, but. */
                    if (rlp == NULL) return NULL;
                }
            }

            TRACE_INFO("%!FUNC! Successfull cannibalized a connection descriptor");

            /* Grab a pointer to the connection entry. */
            ep = STRUCT_PTR(rlp, CONN_ENTRY, rlink);

            UNIV_ASSERT(ep->code == CVY_ENTRCODE);

            if (ep->flags & NLB_CONN_ENTRY_FLAGS_ALLOCATED) {
                /* Unlink allocated descriptors from the hash table queue if necessary
                   and set dp so that code below will put it back in the right hash queue. */
                dp = STRUCT_PTR(ep, CONN_DESCR, entry);

                UNIV_ASSERT(dp->code == CVY_DESCCODE);

                Link_unlink(&(dp->link));
            } else {
                /* A NULL dp indicates an embedded hash-table entry; it is never re-queued. */
                dp = NULL;
            }

            /* Dirty connections are not counted, so we don't need to update these counters. */
            if (!(ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY))
            {
                /* Find out which port group we are on so we can clean up its counters. */
                rbp = Load_pg_lookup(lp, ep->svr_ipaddr, ep->svr_port, IS_TCP_PKT(ep->protocol));

                /* Update the connection counters, etc. to remove all knowledge of this
                   "old" connection that we're cannibalizing. */
                Load_note_conn_down(lp, rbp, (ULONG)ep->bin);
            }
            else
            {
                /* If we're cannibalizing a dirty connection, update the dirty counters. */
                lp->dirty_bin[ep->bin]--;
                lp->num_dirty--;

                /* If this was the last dirty connection, turn off the cleanup waiting flag. */
                if (lp->num_dirty == 0)
                    lp->cln_waiting = FALSE;
            }

            Link_unlink(&(ep->blink));

            /* Mark the descriptor as unused. */
            CVY_CONN_CLEAR(ep);
        }
#if defined (NLB_TCP_NOTIFICATION)
        /* There are no free descriptors, and we refuse to cannibalize. */
        else
        {
            /* We were unable to allocate more connection descriptors and, with
               notification ON, we will not cannibalize one already in use. Warn
               the administrator (once) that they should consider allowing NLB to
               allocate more connection descriptors. */
            if (!(lp->alloc_inhibited)) {
                TRACE_CRIT("%!FUNC! All descriptors have been allocated and are in use");
                LOG_MSG(MSG_WARN_DESCRIPTORS, CVY_NAME_MAX_DSCR_ALLOCS);
                lp->alloc_inhibited = TRUE;
            }

            return NULL;
        }

        /* If notification is ON, then we're sure that descriptors here are dynamic,
           and therefore will ALWAYS have to be re-queued. If notification is OFF,
           that depends on whether a potentially cannibalized descriptor was dynamically
           allocated or not. */
        if (NLB_NOTIFICATIONS_ON())
        {
            UNIV_ASSERT(dp != NULL);

            /* Enqueue descriptor in hash table unless it's already a hash table entry (a recovered
               connection might be in hash table, so make sure we do not end up queueing it). */
            UNIV_ASSERT(dp->code == CVY_DESCCODE);

            Queue_enq(qp, &(dp->link));
        }
        else
        {
#endif
            /* Enqueue descriptor in hash table unless it's already a hash table entry (a recovered
               connection might be in hash table, so make sure we do not end up queueing it). */
            if (dp != NULL) {
                UNIV_ASSERT(dp->code == CVY_DESCCODE);

                Queue_enq(qp, &(dp->link));
            }
#if defined (NLB_TCP_NOTIFICATION)
        }
#endif
    }

    UNIV_ASSERT(ep->code == CVY_ENTRCODE);

    /* Reset the descriptor information (initial reference count of 1). */
    Load_reset_dscr(lp, bp, ep, index, bin, 1);

    return ep;
}
  3040. /*
  3041. * Function: Load_timeout_dscr
  3042. * Description: This function moves an active connection descriptor to
  3043. * the timeout state by dequeueing it from the recovery
  3044. * queue, setting the appropriate timeout and moving it to
  3045. * the appropriate timeout queue, where it will remain active
  3046. * for some amount of time (configurable via the registry).
  3047. * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
  3048. * bp - a pointer to the port rule on which this connection is active
  3049. * ep - a pointer to the connection descriptor to timeout
  3050. * Returns: Nothing.
  3051. * Author: shouse, 10.4.01
  3052. * Notes:
  3053. */
  3054. VOID Load_timeout_dscr (PLOAD_CTXT lp, PBIN_STATE bp, PCONN_ENTRY ep)
  3055. {
  3056. /* Virtual descriptors should NEVER get in this function. */
  3057. UNIV_ASSERT(!(ep->flags & NLB_CONN_ENTRY_FLAGS_VIRTUAL));
  3058. /* Take the descriptor off of the recovery queue and move it to the appropriate
  3059. timeout queue, based on protocol. Each protocol has its own queue to avoid
  3060. the need for a sorted insert function, which is expensive. */
  3061. Link_unlink(&(ep->rlink));
  3062. /* Set the timeout based on the protocol and add it to the appropriate timeout queue. */
  3063. switch (ep->protocol) {
  3064. case TCPIP_PROTOCOL_TCP:
  3065. case TCPIP_PROTOCOL_PPTP:
  3066. /* If the user has specified a zero timeout, then simply destroy the descriptor. */
  3067. if (!lp->tcp_timeout)
  3068. {
  3069. /* Clear the connection descriptor. */
  3070. CVY_CONN_CLEAR(ep);
  3071. /* Release the descriptor. */
  3072. Load_put_dscr(lp, bp, ep);
  3073. break;
  3074. }
  3075. /* The timeout is the current time, plus the timeout for this particular protocol. */
  3076. ep->timeout = lp->clock_sec + lp->tcp_timeout;
  3077. Queue_enq(&(lp->tcp_expiredq), &(ep->rlink));
  3078. #if defined (TRACE_DSCR)
  3079. DbgPrint("Load_timeout_dscr: Moving TCP descriptor %p to the TCP timeout queue: clock=%u, timeout=%d", ep, lp->clock_sec, ep->timeout);
  3080. #endif
  3081. break;
  3082. case TCPIP_PROTOCOL_IPSEC1:
  3083. /* If the user has specified a zero timeout, then simply destroy the descriptor. */
  3084. if (!lp->ipsec_timeout)
  3085. {
  3086. /* Clear the connection descriptor. */
  3087. CVY_CONN_CLEAR(ep);
  3088. /* Release the descriptor. */
  3089. Load_put_dscr(lp, bp, ep);
  3090. break;
  3091. }
  3092. /* The timeout is the current time, plus the timeout for this particular protocol. */
  3093. ep->timeout = lp->clock_sec + lp->ipsec_timeout;
  3094. Queue_enq(&(lp->ipsec_expiredq), &(ep->rlink));
  3095. #if defined (TRACE_DSCR)
  3096. DbgPring("Load_timeout_dscr: Moving IPSec descriptor %p to the IPSec timeout queue: clock=%u, timeout=%u", ep, lp->clock_sec, ep->timeout);
  3097. #endif
  3098. break;
  3099. default:
  3100. #if defined (TRACE_DSCR)
  3101. DbgPrint("Load_timeout_dscr: Invalid descriptor protocol (%u). Removing descriptor %p immediately.", ep->protocol, ep);
  3102. #endif
  3103. /* Although this should never happen, clean up immediately
  3104. if the protocol in the descriptor is invalid. Note that
  3105. virtual descriptors, such as GRE, should NEVER be timed
  3106. out, and therefore should not enter this function. */
  3107. UNIV_ASSERT(0);
  3108. /* Clear the connection descriptor. */
  3109. CVY_CONN_CLEAR(ep);
  3110. /* Release the descriptor. */
  3111. Load_put_dscr(lp, bp, ep);
  3112. break;
  3113. }
  3114. }
/*
 * Function: Load_flush_dscr
 * Description: This function will flush out any descriptor that may be lying around
 *              for the given IP tuple. This may happen as a result of a RST being
 *              sent on another adapter, which NLB did not see and therefore did not
 *              properly destroy the state for. This function is called on all incoming
 *              SYN packets to remove this stale state. For PPTP/IPSec connections, it is
 *              also necessary to update any matching virtual descriptor found.
 * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
 *             bp - a pointer to the port rule on which this connection is active
 *             index - the connection queue index
 *             svr_ipaddr - the server IP address in network byte order
 *             svr_port - the server port in host byte order
 *             client_ipaddr - the client IP address in network byte order
 *             client_port - the client port in host byte order
 *             protocol - the protocol of this connection
 * Returns: Nothing.
 * Author: shouse, 1.7.02
 * Notes:
 */
VOID Load_flush_dscr (
    PLOAD_CTXT lp,
    PBIN_STATE bp,
    ULONG      index,
    ULONG      svr_ipaddr,
    ULONG      svr_port,
    ULONG      client_ipaddr,
    ULONG      client_port,
    USHORT     protocol)
{
    PCONN_ENTRY ep;              /* Pointer to connection entry. */
    ULONG       vindex;          /* Hash index of the associated virtual descriptor. */
    ULONG       hash;
    SHORT       references = 0;  /* Reference count of the flushed "parent" descriptor. */

    UNIV_ASSERT(lp->code == CVY_LOADCODE);

    /* Look for an existing matching connection descriptor. */
    ep = Load_find_dscr(lp, index, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);

    /* If no match was found, or the descriptor is already dirty, there's nothing to do. */
    if ((ep != NULL) && !(ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY)) {
        UNIV_ASSERT(ep->ref_count >= 0);

        /* Note the number of references on this descriptor; used below to decide
           whether a shared virtual descriptor may also be flushed. */
        references = ep->ref_count;

        /* Mark the descriptor dirty and either free it or move it to
           the dirty descriptor queue for subsequent cleanup. */
        Load_soil_dscr(lp, bp, ep);

        /* Update the connection counters on the port rule and load module.
           Dirty descriptors update the connection counts when marked dirty,
           not when they are ultimately destroyed. */
        Load_note_conn_down(lp, bp, (ULONG)ep->bin);

        if (protocol == TCPIP_PROTOCOL_PPTP) {
            /* PPTP data flows over a virtual GRE descriptor keyed on the PPTP
               control port pair; look it up and flush it too if appropriate. */

            /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
            hash = Load_simple_hash(svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT);

            /* Our index in all connection arrays is this hash, modulo the array size. */
            vindex = hash % CVY_MAX_CHASH;

            /* Look for an existing matching connection descriptor. */
            ep = Load_find_dscr(lp, vindex, svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT, TCPIP_PROTOCOL_GRE);

            /* If no match was found, or the descriptor is already dirty, there's nothing to do. */
            if ((ep != NULL) && !(ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY)) {
                UNIV_ASSERT(ep->flags & NLB_CONN_ENTRY_FLAGS_VIRTUAL);
                UNIV_ASSERT(ep->ref_count > 0);

                /* If the descriptor has more references than the "parent"
                   descriptor, then we don't want to mark it dirty, or we'll
                   affect the traffic of other connections sharing this
                   descriptor. Otherwise, if we account for all references
                   on the virtual descriptor, mark it dirty. */
                if (ep->ref_count <= references) {
                    /* Mark the descriptor dirty and either free it or move it to
                       the dirty descriptor queue for subsequent cleanup. */
                    Load_soil_dscr(lp, bp, ep);

                    /* Update the connection counters on the port rule and load module.
                       Dirty descriptors update the connection counts when marked dirty,
                       not when they are ultimately destroyed. */
                    Load_note_conn_down(lp, bp, (ULONG)ep->bin);
                }
            }
        }
        else if (protocol == TCPIP_PROTOCOL_IPSEC1) {
            /* IPSec NAT-traversal data flows over a virtual UDP descriptor keyed
               on the IPSec control port pair; look it up and flush it too. */

            /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
            hash = Load_simple_hash(svr_ipaddr, IPSEC_CTRL_PORT, client_ipaddr, IPSEC_CTRL_PORT);

            /* Our index in all connection arrays is this hash, modulo the array size. */
            vindex = hash % CVY_MAX_CHASH;

            /* Look for an existing matching connection descriptor. */
            ep = Load_find_dscr(lp, vindex, svr_ipaddr, IPSEC_CTRL_PORT, client_ipaddr, IPSEC_CTRL_PORT, TCPIP_PROTOCOL_IPSEC_UDP);

            /* If no match was found, or the descriptor is already dirty, there's nothing to do. */
            if ((ep != NULL) && !(ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY)) {
                UNIV_ASSERT(ep->flags & NLB_CONN_ENTRY_FLAGS_VIRTUAL);
                UNIV_ASSERT(ep->ref_count > 0);

                /* If the descriptor has more references than the "parent"
                   descriptor, then we don't want to mark it dirty, or we'll
                   affect the traffic of other connections sharing this
                   descriptor. Otherwise, if we account for all references
                   on the virtual descriptor, mark it dirty. */
                if (ep->ref_count <= references) {
                    /* Mark the descriptor dirty and either free it or move it to
                       the dirty descriptor queue for subsequent cleanup. */
                    Load_soil_dscr(lp, bp, ep);

                    /* Update the connection counters on the port rule and load module.
                       Dirty descriptors update the connection counts when marked dirty,
                       not when they are ultimately destroyed. */
                    Load_note_conn_down(lp, bp, (ULONG)ep->bin);
                }
            }
        }

        /* If at least one descriptor has been marked dirty, restart the cleanup timer. */
        if (lp->cln_waiting)
            lp->cur_time = 0;
    }
}
/*
 * Function: Load_create_dscr
 * Description: This function creates and sets up a new descriptor for a given connection.
 *              The input connection entry pointer is the "existing" descriptor found by
 *              the caller, which can be (probably will be) NULL; in that case, a new
 *              descriptor needs to be acquired and initialized. If a descriptor already
 *              exists, it is updated or cleansed, depending on its state.
 * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
 *             bp - a pointer to the port rule on which this connection is active
 *             ep - a pointer to the connection descriptor, if one was already found
 *             index - the connection queue index
 *             bin - the bin to which the connection maps (Map % 60)
 * Returns: PCONN_ENTRY - a pointer to the connection entry, or NULL if no
 *          descriptor could be acquired
 * Author:
 * Notes:
 */
PCONN_ENTRY Load_create_dscr (
    PLOAD_CTXT  lp,
    PBIN_STATE  bp,
    PCONN_ENTRY ep,
    ULONG       index,
    ULONG       bin)
{
    UNIV_ASSERT(lp->code == CVY_LOADCODE);

    /* If we don't have a connection match, set up a new connection entry. */
    if (ep == NULL) {
        /* Get a new descriptor. */
        ep = Load_get_dscr(lp, bp, index, bin);

        /* If we can't find a descriptor, something is severely wrong - bail out. */
        if (ep == NULL) return NULL;

        UNIV_ASSERT(ep->code == CVY_ENTRCODE);

    /* Otherwise, we have a match; clean up conn entry if dirty since we have a
       new connection, although TCP/IP will likely reject it if it has stale state
       from another connection. */
    } else {
        UNIV_ASSERT(ep->code == CVY_ENTRCODE);

        if (ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY) {
            /* If we're re-using a connection descriptor already
               in use, then we need to pull it off the recovery/
               timeout queue because it might have been previously
               added to the timeout queue and we don't want it
               spontaneously expiring on us. */
            Link_unlink(&(ep->rlink));

            /* Unlink the descriptor from the dirty queue. */
            Link_unlink(&(ep->blink));

            /* If we're cleansing a dirty connection, update the dirty counters. */
            lp->dirty_bin[ep->bin]--;
            lp->num_dirty--;

            /* If this was the last dirty connection, turn off the cleanup waiting flag. */
            if (lp->num_dirty == 0)
                lp->cln_waiting = FALSE;

            /* Reset the dirty descriptor and re-use it for this connection.
               NOTE(review): the post-increment here passes the PRE-increment
               value to Load_reset_dscr; the side-effect increment is presumably
               overwritten when the reset assigns the reference count (it is
               called with a literal 1 for fresh descriptors in Load_get_dscr).
               Confirm whether "ep->ref_count + 1" was intended, which would
               match the ep->ref_count++ performed in the clean-reuse path below. */
            Load_reset_dscr(lp, bp, ep, index, bin, ep->ref_count++);
        } else {
            ep->timeout = 0;

            /* If we're re-using a connection descriptor already
               in use, then we need to pull it off the recovery/
               timeout queue and re-enqueue it on the recovery
               queue because it might have been previously added
               to the timeout queue and we don't want it spon-
               taneously expiring on us. */
            Link_unlink(&(ep->rlink));

            Queue_enq(&(lp->conn_rcvryq), &(ep->rlink));

            ep->ref_count++;
        }
    }

    return ep;
}
  3291. /*
  3292. * Function: Load_destroy_dscr
  3293. * Description: This function "destroys" an existing descriptor. If the operation is
  3294. * a RST, it is immediately destroyed; if it is a FIN, the reference count
  3295. * is decremented and depending on the new count, the descriptor is either
  3296. * moved to a timeout queue or left alone.
  3297. * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
  3298. * bp - a pointer to the port rule on which this connection is active
  3299. * ep - a pointer to the connection descriptor if one was already found
  3300. * conn_status - whether this is a RST or a FIN
  3301. * Returns: ULONG - the number of remaining references on the descriptor.
  3302. * Author: shouse, 1.7.02
  3303. * Notes:
  3304. */
  3305. ULONG Load_destroy_dscr (
  3306. PLOAD_CTXT lp,
  3307. PBIN_STATE bp,
  3308. PCONN_ENTRY ep,
  3309. ULONG conn_status)
  3310. {
  3311. UNIV_ASSERT(lp->code == CVY_LOADCODE);
  3312. /* If no descriptor was provided, bail out. This should NOT be called
  3313. with a NULL descriptor, but we have to handle it anyway. */
  3314. if (ep == NULL) return 0;
  3315. UNIV_ASSERT(ep->ref_count >= 0);
  3316. /* This descriptor was already moved to the expired queue - must be
  3317. that we received a retransmitted FIN on this connection, or the
  3318. reference count of a virtual descriptor was skewed. */
  3319. if (!ep->ref_count) {
  3320. UNIV_ASSERT(ep->timeout != 0);
  3321. /* If this is a RST notification, then destroy the state now.
  3322. If its a FIN, just ignore it. Either way, return zero. */
  3323. if (conn_status == CVY_CONN_RESET) {
  3324. /* Clear the connection descriptor. */
  3325. CVY_CONN_CLEAR(ep);
  3326. /* Release the descriptor. */
  3327. Load_put_dscr(lp, bp, ep);
  3328. }
  3329. /* Return - the descriptor already has zero references (no update needed). */
  3330. return 0;
  3331. }
  3332. UNIV_ASSERT(ep->ref_count > 0);
  3333. /* Decrement the reference count by one. */
  3334. ep->ref_count--;
  3335. UNIV_ASSERT(ep->ref_count >= 0);
  3336. /* If there are still references on this descriptor,
  3337. then its not ready to be destroyed yet, so we'll
  3338. keep it around and exit here. */
  3339. if (ep->ref_count > 0) return (ep->ref_count);
  3340. /* If this is a RST, or if the descriptor is virtual or dirty, destroy the descriptor
  3341. now. There is no need to timeout virtual GRE or IPSec/UDP descriptors; they can be
  3342. immediate destroyed. Of course, if the descriptor has already been marked dirty,
  3343. then we can destroy it now that the reference count has reached zero. */
  3344. if ((conn_status == CVY_CONN_RESET) || (ep->flags & NLB_CONN_ENTRY_FLAGS_VIRTUAL) || (ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY)) {
  3345. /* Clear the connection descriptor. */
  3346. CVY_CONN_CLEAR(ep);
  3347. /* Release the descriptor. */
  3348. Load_put_dscr(lp, bp, ep);
  3349. /* However, conventional descriptors, such as TCP or IPSec, should be timed-out gracefully. */
  3350. } else {
  3351. /* Otherwise, we're destroying it. Take the descriptor
  3352. off of the recovery queue and move it to the appropriate
  3353. timeout queue, based on protocol. Each protocol has
  3354. its own queue to avoid the need for a sorted insert
  3355. function, which is expensive. */
  3356. Load_timeout_dscr(lp, bp, ep);
  3357. }
  3358. /* No references left on the descriptor; it was destroyed or timed-out. */
  3359. return 0;
  3360. }
/*
 * Function: Load_packet_check
 * Description: This function determines whether or not to take a data packet
 *              in the IP stream identified by the IP tuple in question.
 *              Protocols that are session-less depend only on the hashing
 *              result and the ownership map. Session-ful protocols may need
 *              to perform a descriptor look-up if ambiguity exists.
 * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
 *             svr_ipaddr - the server IP address in network byte order
 *             svr_port - the server port in host byte order
 *             client_ipaddr - the client IP address in network byte order
 *             client_port - the client port in host byte order
 *             protocol - the protocol of this connection
 *             limit_map_fn - whether or not to include server-side parameters in hashing
 *             reverse_hash - whether or not to reverse client and server during hashing
 * Returns: BOOLEAN - do we accept the packet? (TRUE = yes)
 * Author: bbain, shouse, 10.4.01
 * Notes: Acquires the load-module lock around the map/descriptor checks;
 *        all accept/drop decisions below the lock exit through "unlock".
 */
BOOLEAN Load_packet_check(
    PLOAD_CTXT lp,
    ULONG      svr_ipaddr,
    ULONG      svr_port,
    ULONG      client_ipaddr,
    ULONG      client_port,
    USHORT     protocol,
    BOOLEAN    limit_map_fn,
    BOOLEAN    reverse_hash)
{
    PBIN_STATE bp;
    ULONG      hash;
    ULONG      index;
    ULONG      bin;
    IRQLEVEL   irql;
    PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
    BOOLEAN    is_tcp_pkt = IS_TCP_PKT(protocol);
    BOOLEAN    is_session_pkt = IS_SESSION_PKT(protocol);
    BOOLEAN    acpt = FALSE;

    UNIV_ASSERT(lp->code == CVY_LOADCODE);

    TRACE_FILTER("%!FUNC! Enter: lp = %p, server IP = %u.%u.%u.%u, server port = %u, client IP = %u.%u.%u.%u, client port = %u, protocol = %u, limit map = %u, reverse hash = %u",
                 lp, IP_GET_OCTET(svr_ipaddr, 0), IP_GET_OCTET(svr_ipaddr, 1), IP_GET_OCTET(svr_ipaddr, 2), IP_GET_OCTET(svr_ipaddr, 3), svr_port,
                 IP_GET_OCTET(client_ipaddr, 0), IP_GET_OCTET(client_ipaddr, 1), IP_GET_OCTET(client_ipaddr, 2), IP_GET_OCTET(client_ipaddr, 3), client_port,
                 protocol, limit_map_fn, reverse_hash);

    /* If the load module is inactive, drop the packet and return here. */
    if (!lp->active) {
        TRACE_FILTER("%!FUNC! Drop packet - load module is inactive");
        acpt = FALSE;
        goto exit;
    }

    /* Increment count of pkts handled. */
    lp->pkt_count++;

    /* Find the port rule for this connection. */
    bp = Load_pg_lookup(lp, svr_ipaddr, svr_port, is_tcp_pkt);

    /* Make sure that Load_pg_lookup properly handled protocol specific rules. */
    UNIV_ASSERT((is_tcp_pkt && bp->prot != CVY_UDP) || (!is_tcp_pkt && bp->prot != CVY_TCP));

    /* Handle CVY_NEVER mode (disabled port rule) immediately. */
    if (bp->mode == CVY_NEVER) {
        /* Increment the dropped packet count. */
        bp->packets_dropped++;

        TRACE_FILTER("%!FUNC! Drop packet - port rule %u is disabled\n", bp->index);
        acpt = FALSE;
        goto exit;
    }

    /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol;
       this selects the connection (descriptor) queue. */
    hash = Load_simple_hash(svr_ipaddr, svr_port, client_ipaddr, client_port);

    index = hash % CVY_MAX_CHASH;

    /* Compute the load-balancing hash; this selects the bucket (bin). */
    hash = Load_complex_hash(svr_ipaddr, svr_port, client_ipaddr, client_port, bp->affinity, reverse_hash, limit_map_fn);

    bin = hash % CVY_MAXBINS;

    LOCK_ENTER(&(lp->lock), &irql);

    /* Check bin for residency and all other hosts now idle on their bins; in this case
       and if we do not have dirty connections, we must be able to handle the packet. */
    if (((bp->cmap & (((MAP_T) 1) << bin)) != 0) && (!is_session_pkt || (((bp->all_idle_map & (((MAP_T) 1) << bin)) != 0) && (!(lp->cln_waiting))))) {
        /* Note that we may have missed a connection, but it could also be a stale
           packet so we can't start tracking the connection now. */
        TRACE_FILTER("%!FUNC! Accept packet - packet owned unconditionally: Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
                     "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
                     bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);

        /* Increment the accepted packet count. */
        bp->packets_accepted++;

        acpt = TRUE;
        goto unlock;

    /* Important note: Virtual descriptors that are not session-based and return
       FALSE for IS_SESSION_PKT() use this case to check for a connection descriptor
       match. (Example: UDP subsequent fragments within IPSec tunnels of protocol
       type TCPIP_PROTOCOL_IPSEC_UDP) Do not disable this code for non-session
       protocols. */

    /* Otherwise, if we have an active connection for this bin or if we have dirty
       connections for this bin and the bin is resident, check for a match. */
    } else if (bp->nconn[bin] > 0 || (lp->cln_waiting && lp->dirty_bin[bin] && ((bp->cmap & (((MAP_T) 1) << bin)) != 0))) {
        PCONN_ENTRY ep;

        /* Look for an existing matching connection descriptor. */
        ep = Load_find_dscr(lp, index, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);

        /* If we can't find one, we don't own the connection. */
        if (ep == NULL) {
            TRACE_FILTER("%!FUNC! Drop packet - packet not owned by this host: Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
                         "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
                         bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);

            /* Increment the dropped packet count. */
            bp->packets_dropped++;

            acpt = FALSE;
            goto unlock;
        }

        UNIV_ASSERT(ep->code == CVY_ENTRCODE);

        /* If connection was dirty, just block the packet since TCP/IP may have stale
           connection state for a previous connection from another host. */
        if (ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY) {
            TRACE_FILTER("%!FUNC! Drop packet - block dirty connections (%p): Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
                         "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
                         ep, bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);

            /* Increment the dropped packet count. */
            bp->packets_dropped++;

            acpt = FALSE;
            goto unlock;
        }

        TRACE_FILTER("%!FUNC! Accept packet - matching descriptor found (%p): Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
                     "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
                     ep, bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);

        /* Increment the accepted packet count. */
        bp->packets_accepted++;

        acpt = TRUE;
        goto unlock;
    }

    /* Neither map ownership nor a descriptor match - not our packet. */
    TRACE_FILTER("%!FUNC! Drop packet - packet not owned by this host: Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
                 "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
                 bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);

    /* Increment the dropped packet count. */
    bp->packets_dropped++;

    acpt = FALSE;

 unlock:

    LOCK_EXIT(&(lp->lock), irql);

 exit:

    TRACE_FILTER("%!FUNC! Exit: acpt = %u", acpt);

    return acpt;
}
  3496. /*
  3497. * Function: Load_conn_advise
  3498. * Description: This function determines whether or not to accept this packet,
  3499. * which represents the beginning or end of a session-ful connection.
  3500. #if !defined (NLB_TCP_NOTIFICATION)
  3501. * If the connection is going up, and is successful, this function
  3502. * creates state to track the connection. If the connection is
  3503. * going down, this function removes the state for tracking the
  3504. * connection.
  3505. #endif
  3506. * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
  3507. * svr_ipaddr - the server IP address in network byte order
  3508. * svr_port - the server port in host byte order
  3509. * client_ipaddr - the client IP address in network byte order
  3510. * client_port - the client port in host byte order
  3511. * protocol - the protocol of this connection
  3512. * conn_status - whether the connection is going UP, DOWN, or being RESET
  3513. * limit_map_fn - whether or not to include server-side parameters in hashing
  3514. * reverse_hash - whether or not to reverse client and server during hashing
  3515. * Returns: BOOLEAN - do we accept the packet (TRUE = yes)
  3516. * Author: bbain, shouse, 10.4.01
  3517. * Notes:
  3518. */
  3519. BOOLEAN Load_conn_advise(
  3520. PLOAD_CTXT lp,
  3521. ULONG svr_ipaddr,
  3522. ULONG svr_port,
  3523. ULONG client_ipaddr,
  3524. ULONG client_port,
  3525. USHORT protocol,
  3526. ULONG conn_status,
  3527. BOOLEAN limit_map_fn,
  3528. BOOLEAN reverse_hash)
  3529. {
  3530. ULONG hash;
  3531. ULONG vindex;
  3532. ULONG index;
  3533. ULONG bin;
  3534. PBIN_STATE bp;
  3535. PCONN_ENTRY ep;
  3536. IRQLEVEL irql;
  3537. PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
  3538. BOOLEAN is_tcp_pkt = IS_TCP_PKT(protocol);
  3539. BOOLEAN acpt = TRUE;
  3540. UNIV_ASSERT(lp->code == CVY_LOADCODE);
  3541. TRACE_FILTER("%!FUNC! Enter: lp = %p, server IP = %u.%u.%u.%u, server port = %u, client IP = %u.%u.%u.%u, client port = %u, protocol = %u, status = %u, limit map = %u, reverse hash = %u",
  3542. lp, IP_GET_OCTET(svr_ipaddr, 0), IP_GET_OCTET(svr_ipaddr, 1), IP_GET_OCTET(svr_ipaddr, 2), IP_GET_OCTET(svr_ipaddr, 3), svr_port,
  3543. IP_GET_OCTET(client_ipaddr, 0), IP_GET_OCTET(client_ipaddr, 1), IP_GET_OCTET(client_ipaddr, 2), IP_GET_OCTET(client_ipaddr, 3), client_port,
  3544. protocol, conn_status, limit_map_fn, reverse_hash);
  3545. /* If the load module is inactive, drop the packet and return here. */
  3546. if (!lp->active) {
  3547. TRACE_FILTER("%!FUNC! Drop packet - load module is inactive");
  3548. acpt = FALSE;
  3549. goto exit;
  3550. }
  3551. /* Increment count of pkts handled. */
  3552. lp->pkt_count++;
  3553. /* Find the port rule for this connection. */
  3554. bp = Load_pg_lookup(lp, svr_ipaddr, svr_port, is_tcp_pkt);
  3555. /* Handle CVY_NEVER immediately. */
  3556. if (bp->mode == CVY_NEVER) {
  3557. /* Increment the dropped packet count. */
  3558. bp->packets_dropped++;
  3559. TRACE_FILTER("%!FUNC! Drop packet - port rule %u is disabled\n", bp->index);
  3560. acpt = FALSE;
  3561. goto exit;
  3562. }
  3563. /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
  3564. hash = Load_simple_hash(svr_ipaddr, svr_port, client_ipaddr, client_port);
  3565. index = hash % CVY_MAX_CHASH;
  3566. /* Compute the hash. */
  3567. hash = Load_complex_hash(svr_ipaddr, svr_port, client_ipaddr, client_port, bp->affinity, reverse_hash, limit_map_fn);
  3568. bin = hash % CVY_MAXBINS;
  3569. /* If this is a connection up notification, first clean out any old state that may exist for this
  3570. connection BEFORE we load-balance IFF we do NOT own the bucket to which the connection maps.
  3571. If we are not the bucket owner, the somebody else probably is; since we know that they will
  3572. accept the new connection, we need to flush out any state that we may have lying around.
  3573. This cleans out stale state that may have been left around by falling out of sync with TCP/IP.
  3574. Note that re-transmitted SYNs can wreak havoc here if the bucket map has shifted since the
  3575. first SYN, however, since the other host has no way of knowing that the second SYN is a
  3576. re-transmit, there's nothing we can do about it anyway. */
  3577. if ((conn_status == CVY_CONN_UP) && ((bp->cmap & (((MAP_T) 1) << bin)) == 0)) {
  3578. LOCK_ENTER(&(lp->lock), &irql);
  3579. /* If this is a SYN, flush out any old descriptor that may be lying around for this connection. */
  3580. Load_flush_dscr(lp, bp, index, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);
  3581. LOCK_EXIT(&(lp->lock), &irql);
  3582. }
  3583. /* If this connection is not in our current map and it is not a connection
  3584. down notification for a non-idle bin, just filter it out. */
  3585. if ((bp->cmap & (((MAP_T) 1) << bin)) == 0 && (!((conn_status == CVY_CONN_DOWN || conn_status == CVY_CONN_RESET) && bp->nconn[bin] > 0))) {
  3586. TRACE_FILTER("%!FUNC! Drop packet - packet not owned by this host: Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
  3587. "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
  3588. bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);
  3589. /* Increment the dropped packet count. */
  3590. bp->packets_dropped++;
  3591. acpt = FALSE;
  3592. goto exit;
  3593. }
  3594. #if defined (NLB_TCP_NOTIFICATION)
  3595. /* DO NOT create a descriptor until TCP or IPSec tells us to via a connection notification. If TCP
  3596. notification is OFF, then only exit early if its an IPSec SYN. */
  3597. if ((conn_status == CVY_CONN_UP) && (NLB_NOTIFICATIONS_ON() || (protocol == TCPIP_PROTOCOL_IPSEC1))) {
  3598. TRACE_FILTER("%!FUNC! Accept packet - SYN owned by this host: Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
  3599. "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
  3600. bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);
  3601. #else
  3602. /* DO NOT create a descriptor until IPSec tells us to via a connection notification IOCTL. */
  3603. if ((conn_status == CVY_CONN_UP) && (protocol == TCPIP_PROTOCOL_IPSEC1)) {
  3604. TRACE_FILTER("%!FUNC! Accept packet - IPSec SYN owned by this host: Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
  3605. "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
  3606. bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);
  3607. #endif
  3608. /* Increment the accepted packet count. */
  3609. bp->packets_accepted++;
  3610. acpt = TRUE;
  3611. goto exit;
  3612. }
  3613. LOCK_ENTER(&(lp->lock), &irql);
  3614. /* Look for an existing matching connection descriptor. */
  3615. ep = Load_find_dscr(lp, index, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);
  3616. /* If we see a new connection, handle it. */
  3617. if (conn_status == CVY_CONN_UP) {
  3618. /* Create a new connection descriptor to track this connection. */
  3619. ep = Load_create_dscr(lp, bp, ep, index, bin);
  3620. /* If, for some reason, we were unable to create state for this connection, bail out here. */
  3621. if (ep == NULL) {
  3622. TRACE_FILTER("%!FUNC! Drop packet - no available descriptors: Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
  3623. "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
  3624. bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);
  3625. /* Increment the dropped packet count. */
  3626. bp->packets_dropped++;
  3627. acpt = FALSE;
  3628. goto unlock;
  3629. }
  3630. /* Set the connection information in the descriptor. */
  3631. CVY_CONN_SET(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);
  3632. /* If this is a new PPTP tunnel, create or update a virtual descriptor to track the GRE data packets. */
  3633. if (protocol == TCPIP_PROTOCOL_PPTP) {
  3634. /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
  3635. hash = Load_simple_hash(svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT);
  3636. /* Our index in all connection arrays is this hash, modulo the array size. */
  3637. vindex = hash % CVY_MAX_CHASH;
  3638. /* Look for an existing matching virtual connection descriptor. */
  3639. ep = Load_find_dscr(lp, vindex, svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT, TCPIP_PROTOCOL_GRE);
  3640. /* Create or update a virtual descriptor for the GRE traffic. */
  3641. ep = Load_create_dscr(lp, bp, ep, vindex, bin);
  3642. /* If we can't allocate the virtual descriptor, bail out, but don't fail. */
  3643. if (ep == NULL) goto unlock;
  3644. /* Set the connection information in the descriptor. */
  3645. CVY_CONN_SET(ep, svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT, TCPIP_PROTOCOL_GRE);
  3646. /* Set the virtual descriptor flag. */
  3647. ep->flags |= NLB_CONN_ENTRY_FLAGS_VIRTUAL;
  3648. }
  3649. /* Otherwise, if a known connection is going down, remove our connection entry. */
  3650. } else if ((conn_status == CVY_CONN_DOWN || conn_status == CVY_CONN_RESET) && (ep != NULL)) {
  3651. /* If we found state for this connection, the bin is the bin from the descriptor,
  3652. not the calculated bin, which may not even been accurate if the port rules have
  3653. been modified since this connection was established. */
  3654. bin = ep->bin;
  3655. /* If connection was dirty, just block the packet since TCP/IP may have stale
  3656. connection state for a previous connection from another host. */
  3657. if (ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY) {
  3658. TRACE_FILTER("%!FUNC! Drop packet - block dirty connections: Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
  3659. "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
  3660. bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);
  3661. /* Increment the dropped packet count. */
  3662. bp->packets_dropped++;
  3663. goto unlock;
  3664. }
  3665. /* Update the descriptor by destroying it or moving it to the appropriate timeout queue if no references remain. */
  3666. (VOID)Load_destroy_dscr(lp, bp, ep, conn_status);
  3667. /* If this is a PPTP tunnel going down, update the virtual GRE descriptor. Virtual descriptors
  3668. are ALWAYS de-referenced, not destroyed, even if the notification is a RST because these
  3669. descriptors are potentially shared by multiple PPTP tunnels. */
  3670. if (protocol == TCPIP_PROTOCOL_PPTP) {
  3671. /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
  3672. hash = Load_simple_hash(svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT);
  3673. /* Our index in all connection arrays is this hash, modulo the array size. */
  3674. vindex = hash % CVY_MAX_CHASH;
  3675. /* Look for an existing matching virtual connection descriptor. */
  3676. ep = Load_find_dscr(lp, vindex, svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT, TCPIP_PROTOCOL_GRE);
  3677. /* Dereference the virtual GRE descriptor. */
  3678. (VOID)Load_destroy_dscr(lp, bp, ep, conn_status);
  3679. }
  3680. /* Otherwise, we found no match for a FIN/RST packet - drop it. */
  3681. } else {
  3682. TRACE_FILTER("%!FUNC! Drop packet - no matching descriptor found: Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
  3683. "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
  3684. bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);
  3685. /* Increment the dropped packet count. */
  3686. bp->packets_dropped++;
  3687. acpt = FALSE;
  3688. goto unlock;
  3689. }
  3690. TRACE_FILTER("%!FUNC! Accept packet - packet owned by this host: Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
  3691. "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
  3692. bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);
  3693. /* Increment the accepted packet count. */
  3694. bp->packets_accepted++;
  3695. // Exit here under one of these conditions:
  3696. // (i) got a syn and added a descriptor
  3697. // (ii) got a fin or a reset and destroyed the descriptor
  3698. acpt = TRUE;
  3699. unlock:
  3700. LOCK_EXIT(&(lp->lock), irql);
  3701. exit:
  3702. TRACE_FILTER("%!FUNC! Exit: acpt = %u", acpt);
  3703. return acpt;
  3704. }
  3705. /*
  3706. * Function: Load_conn_notify
  3707. * Description: This function is nearly identical to Load_conn_advise, except
  3708. * for two important distinctions; (1) this function is a notification,
  3709. * not a request, so load-balancing decisions are not made here, and
  3710. * (2) packet handling statistics are not incremented here, as calls
  3711. * to this function rarely stem from processing a real packet. For
  3712. * example, when a TCP SYN packet is received, main.c calls Load_conn_advise
* essentially asking, "hey, should we accept this new connection I just
  3714. * saw?" While, when IPSec notifies NLB that a new Main Mode SA has just
  3715. * been established, main.c calls Load_conn_notify essentially dictating,
  3716. * "hey a new connection just went up, so whether you like it or not,
  3717. * create state to track this connection."
  3718. * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
  3719. * svr_ipaddr - the server IP address in network byte order
  3720. * svr_port - the server port in host byte order
  3721. * client_ipaddr - the client IP address in network byte order
  3722. * client_port - the client port in host byte order
  3723. * protocol - the protocol of this connection
  3724. * conn_status - whether the connection is going UP, DOWN, or being RESET
  3725. * limit_map_fn - whether or not to include server-side parameters in hashing
  3726. * reverse_hash - whether or not to reverse client and server during hashing
* Returns: BOOLEAN - was I able to successfully update my state (TRUE = yes)
  3728. * Author: shouse, 10.4.01
  3729. * Notes:
  3730. */
BOOLEAN Load_conn_notify (
    PLOAD_CTXT lp,
    ULONG svr_ipaddr,
    ULONG svr_port,
    ULONG client_ipaddr,
    ULONG client_port,
    USHORT protocol,
    ULONG conn_status,
    BOOLEAN limit_map_fn,
    BOOLEAN reverse_hash)
{
    ULONG hash;                 /* Scratch hash value. */
    ULONG vindex;               /* Hash table index for a virtual (GRE / IPSec-UDP) descriptor. */
    ULONG index;                /* Hash table index for this connection's descriptor. */
    ULONG bin;                  /* Load-balancing bucket ("bin") this connection maps to. */
    PBIN_STATE bp;              /* State for the port rule covering this connection. */
    PCONN_ENTRY ep;             /* Connection descriptor, if one exists. */
    IRQLEVEL irql;
    PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
    BOOLEAN is_tcp_pkt = IS_TCP_PKT(protocol);
    BOOLEAN acpt = TRUE;        /* Return value: was our state updated successfully? */

    UNIV_ASSERT(lp->code == CVY_LOADCODE);

    TRACE_FILTER("%!FUNC! Enter: lp = %p, server IP = %u.%u.%u.%u, server port = %u, client IP = %u.%u.%u.%u, client port = %u, protocol = %u, status = %u, limit map = %u, reverse hash = %u",
                 lp, IP_GET_OCTET(svr_ipaddr, 0), IP_GET_OCTET(svr_ipaddr, 1), IP_GET_OCTET(svr_ipaddr, 2), IP_GET_OCTET(svr_ipaddr, 3), svr_port,
                 IP_GET_OCTET(client_ipaddr, 0), IP_GET_OCTET(client_ipaddr, 1), IP_GET_OCTET(client_ipaddr, 2), IP_GET_OCTET(client_ipaddr, 3), client_port,
                 protocol, conn_status, limit_map_fn, reverse_hash);

    /* If the load module is inactive and this is a CONN_UP, drop the packet and return here.
       If this is a notification for a CONN_DOWN or CONN_RESET, process it. */
    if ((!lp->active) && (conn_status == CVY_CONN_UP)) {
        TRACE_FILTER("%!FUNC! Drop packet - load module is inactive");
        acpt = FALSE;
        goto exit;
    }

    /* Find the port rule for this connection.  Unlike Load_conn_advise, this
       function updates NO packet statistics (pkt_count, packets_accepted,
       packets_dropped) - notifications rarely correspond to a real packet. */
    bp = Load_pg_lookup(lp, svr_ipaddr, svr_port, is_tcp_pkt);

    /* Handle CVY_NEVER (disabled port rule) immediately. */
    if (bp->mode == CVY_NEVER) {
        TRACE_FILTER("%!FUNC! Drop packet - port rule %u is disabled\n", bp->index);
        acpt = FALSE;
        goto exit;
    }

    /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
    hash = Load_simple_hash(svr_ipaddr, svr_port, client_ipaddr, client_port);
    index = hash % CVY_MAX_CHASH;

    /* Compute the hash.  Note: no ownership (cmap) check is made in this
       function - this is a notification, not a load-balancing request, so
       state is created/destroyed regardless of bucket ownership. */
    hash = Load_complex_hash(svr_ipaddr, svr_port, client_ipaddr, client_port, bp->affinity, reverse_hash, limit_map_fn);
    bin = hash % CVY_MAXBINS;

    LOCK_ENTER(&(lp->lock), &irql);

    /* Look for an existing matching connection descriptor. */
    ep = Load_find_dscr(lp, index, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);

    /* If we see a new connection, handle it. */
    if (conn_status == CVY_CONN_UP) {
        /* Create a new connection descriptor to track this connection. */
        ep = Load_create_dscr(lp, bp, ep, index, bin);

        /* If, for some reason, we were unable to create state for this connection, bail out here. */
        if (ep == NULL) {
            TRACE_FILTER("%!FUNC! Drop packet - no available descriptors: Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
                         "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
                         bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);
            acpt = FALSE;
            goto unlock;
        }

        /* Set the connection information in the descriptor. */
        CVY_CONN_SET(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);

        /* If this is a new PPTP tunnel, create or update a virtual descriptor to track the GRE data packets. */
        if (protocol == TCPIP_PROTOCOL_PPTP) {
            /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
            hash = Load_simple_hash(svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT);

            /* Our index in all connection arrays is this hash, modulo the array size. */
            vindex = hash % CVY_MAX_CHASH;

            /* Look for an existing matching virtual connection descriptor. */
            ep = Load_find_dscr(lp, vindex, svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT, TCPIP_PROTOCOL_GRE);

            /* Create or update a virtual descriptor for the GRE traffic. */
            ep = Load_create_dscr(lp, bp, ep, vindex, bin);

            /* If we can't allocate the virtual descriptor, bail out, but don't fail. */
            if (ep == NULL) goto unlock;

            /* Set the connection information in the descriptor. */
            CVY_CONN_SET(ep, svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT, TCPIP_PROTOCOL_GRE);

            /* Set the virtual descriptor flag. */
            ep->flags |= NLB_CONN_ENTRY_FLAGS_VIRTUAL;
        }
        /* If this is a new IPSEC tunnel, create or update a virtual descriptor to track the UDP subsequent data fragments. */
        else if (protocol == TCPIP_PROTOCOL_IPSEC1) {
            /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
            hash = Load_simple_hash(svr_ipaddr, IPSEC_CTRL_PORT, client_ipaddr, IPSEC_CTRL_PORT);

            /* Our index in all connection arrays is this hash, modulo the array size. */
            vindex = hash % CVY_MAX_CHASH;

            /* Look for an existing matching virtual connection descriptor. */
            ep = Load_find_dscr(lp, vindex, svr_ipaddr, IPSEC_CTRL_PORT, client_ipaddr, IPSEC_CTRL_PORT, TCPIP_PROTOCOL_IPSEC_UDP);

            /* Create or update a virtual descriptor for the UDP subsequent fragment traffic. */
            ep = Load_create_dscr(lp, bp, ep, vindex, bin);

            /* If we can't allocate the virtual descriptor, bail out, but don't fail. */
            if (ep == NULL) goto unlock;

            /* Set the connection information in the descriptor. */
            CVY_CONN_SET(ep, svr_ipaddr, IPSEC_CTRL_PORT, client_ipaddr, IPSEC_CTRL_PORT, TCPIP_PROTOCOL_IPSEC_UDP);

            /* Set the virtual descriptor flag. */
            ep->flags |= NLB_CONN_ENTRY_FLAGS_VIRTUAL;
        }

    /* Otherwise, if a known connection is going down, remove our connection entry. */
    } else if ((conn_status == CVY_CONN_DOWN || conn_status == CVY_CONN_RESET) && (ep != NULL)) {
        /* If we found state for this connection, the bin is the bin from the descriptor,
           not the calculated bin, which may not even been accurate if the port rules have
           been modified since this connection was established. */
        bin = ep->bin;

        /* Update the descriptor by destroying it or moving it to the appropriate timeout queue if no references remain. */
        (VOID)Load_destroy_dscr(lp, bp, ep, conn_status);

        /* If this is a PPTP tunnel going down, update the virtual GRE descriptor. Virtual descriptors
           are ALWAYS de-referenced, not destroyed, even if the notification is a RST because these
           descriptors are potentially shared by multiple PPTP tunnels. */
        if (protocol == TCPIP_PROTOCOL_PPTP) {
            /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
            hash = Load_simple_hash(svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT);

            /* Our index in all connection arrays is this hash, modulo the array size. */
            vindex = hash % CVY_MAX_CHASH;

            /* Look for an existing matching connection descriptor. */
            ep = Load_find_dscr(lp, vindex, svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT, TCPIP_PROTOCOL_GRE);

            /* Dereference the virtual GRE descriptor. */
            (VOID)Load_destroy_dscr(lp, bp, ep, conn_status);
        }
        /* If this is an IPSEC tunnel going down, update the virtual ISPEC_UDP descriptor. Virtual descriptors
           are ALWAYS de-referenced, not destroyed, even if the notification is a RST because these
           descriptors are potentially shared by multiple IPSEC tunnels. */
        else if (protocol == TCPIP_PROTOCOL_IPSEC1) {
            /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
            hash = Load_simple_hash(svr_ipaddr, IPSEC_CTRL_PORT, client_ipaddr, IPSEC_CTRL_PORT);

            /* Our index in all connection arrays is this hash, modulo the array size. */
            vindex = hash % CVY_MAX_CHASH;

            /* Look for an existing matching virtual connection descriptor. */
            ep = Load_find_dscr(lp, vindex, svr_ipaddr, IPSEC_CTRL_PORT, client_ipaddr, IPSEC_CTRL_PORT, TCPIP_PROTOCOL_IPSEC_UDP);

            /* Dereference the virtual IPSec/UDP descriptor. */
            (VOID)Load_destroy_dscr(lp, bp, ep, conn_status);
        }

    /* Otherwise, we found no match for a FIN/RST packet - drop it. */
    } else {
        TRACE_FILTER("%!FUNC! Drop packet - no matching descriptor for RST/FIN: Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
                     "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
                     bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);
        acpt = FALSE;
        goto unlock;
    }

    TRACE_FILTER("%!FUNC! Accept packet - packet owned by this host: Port rule = %u, Bin = %u, Current map = 0x%015I64x, "
                 "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
                 bp->index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);

    // Exit here under one of these conditions:
    // (i)  got a syn and added a descriptor
    // (ii) got a fin or a reset and destroyed the descriptor
    acpt = TRUE;

 unlock:

    /* NOTE(review): irql is passed by value here, while LOCK_ENTER above takes
       &irql - presumably the LOCK_EXIT macro tolerates either form; confirm
       against its definition. */
    LOCK_EXIT(&(lp->lock), irql);

 exit:

    TRACE_FILTER("%!FUNC! Exit: acpt = %u", acpt);

    return acpt;
}
  3884. /*
  3885. * Function: Load_conn_get
  3886. * Description: This function returns the connection parameters for the descriptor
  3887. * at the head of the recovery queue, if one exists. The recovery
  3888. * queue holds all "active" connections, some of which may be stale.
  3889. * If an active descriptor exists, it fills in the connection info
  3890. * and returns TRUE to indicate success; otherwise it returns FALSE
  3891. * to indicate that no connection was found.
  3892. * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
  3893. * OUT svr_ipaddr - the server IP address in network byte order
  3894. * OUT svr_port - the server port in host byte order
  3895. * OUT client_ipaddr - the client IP address in network byte order
  3896. * OUT client_port - the client port in host byte order
  3897. * OUT protocol - the protocol of this connection
* Returns: BOOLEAN - TRUE if an active descriptor was found and its connection parameters returned; FALSE if the recovery queue is empty
  3899. * Author: shouse, 10.4.01
  3900. * Notes:
  3901. */
  3902. BOOLEAN Load_conn_get (PLOAD_CTXT lp, PULONG svr_ipaddr, PULONG svr_port, PULONG client_ipaddr, PULONG client_port, PUSHORT protocol)
  3903. {
  3904. LINK * rlp;
  3905. PCONN_ENTRY ep;
  3906. UNIV_ASSERT(lp->code == CVY_LOADCODE);
  3907. /* Get the descriptor off of the front of the recovery queue - DO NOT dequeue
  3908. it, just get a pointer to the descriptor and LEAVE IT ON THE QUEUE. */
  3909. rlp = (LINK *)Queue_front(&(lp->conn_rcvryq));
  3910. /* If there are no descriptors, return failure. */
  3911. if (rlp == NULL)
  3912. return FALSE;
  3913. /* Get a pointer to the connection entry. */
  3914. ep = STRUCT_PTR(rlp, CONN_ENTRY, rlink);
  3915. UNIV_ASSERT(ep->code == CVY_ENTRCODE);
  3916. /* Grab the IP tuple information out the descriptor and return it to the caller. */
  3917. *svr_ipaddr = ep->svr_ipaddr;
  3918. *svr_port = ep->svr_port;
  3919. *client_ipaddr = ep->client_ipaddr;
  3920. *client_port = ep->client_port;
  3921. *protocol = ep->protocol;
  3922. return TRUE;
  3923. }
  3924. /*
  3925. * Function: Load_conn_sanction
  3926. * Description: This function is called to "sanction" an active connection descriptor.
  3927. * Sanction means that NLB has verified that this connection is indeed
  3928. * still active by querying other system entities (such as TCP/IP). To
  3929. * sanction a descriptor simply involves moving it from its place in the
  3930. * recovery queue (should be the head in most cases) to the tail of the
  3931. * recovery queue, where it has the least chance of being cannibalized.
  3932. * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
  3933. * svr_ipaddr - the server IP address in network byte order
  3934. * svr_port - the server port in host byte order
  3935. * client_ipaddr - the client IP address in network byte order
  3936. * client_port - the client port in host byte order
  3937. * protocol - the protocol of this connection
* Returns: BOOLEAN - was I successful in approbating the descriptor? (TRUE = yes)
  3939. * Author: shouse, 10.4.01
  3940. * Notes:
  3941. */
  3942. BOOLEAN Load_conn_sanction (
  3943. PLOAD_CTXT lp,
  3944. ULONG svr_ipaddr,
  3945. ULONG svr_port,
  3946. ULONG client_ipaddr,
  3947. ULONG client_port,
  3948. USHORT protocol)
  3949. {
  3950. ULONG hash;
  3951. ULONG index;
  3952. PCONN_ENTRY ep;
  3953. IRQLEVEL irql;
  3954. PMAIN_CTXT ctxtp = CONTAINING_RECORD(lp, MAIN_CTXT, load);
  3955. BOOLEAN acpt = FALSE;
  3956. UNIV_ASSERT(lp->code == CVY_LOADCODE);
  3957. TRACE_FILTER("%!FUNC! Enter: lp = %p, server IP = %u.%u.%u.%u, server port = %u, client IP = %u.%u.%u.%u, client port = %u, protocol = %u",
  3958. lp, IP_GET_OCTET(svr_ipaddr, 0), IP_GET_OCTET(svr_ipaddr, 1), IP_GET_OCTET(svr_ipaddr, 2), IP_GET_OCTET(svr_ipaddr, 3), svr_port,
  3959. IP_GET_OCTET(client_ipaddr, 0), IP_GET_OCTET(client_ipaddr, 1), IP_GET_OCTET(client_ipaddr, 2), IP_GET_OCTET(client_ipaddr, 3), client_port, protocol);
  3960. /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
  3961. hash = Load_simple_hash(svr_ipaddr, svr_port, client_ipaddr, client_port);
  3962. index = hash % CVY_MAX_CHASH;
  3963. LOCK_ENTER(&(lp->lock), &irql);
  3964. /* Try to find a matching descriptor for this connection. */
  3965. ep = Load_find_dscr(lp, index, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);
  3966. /* If there is no matching descriptor, then it must have been destroyed - return failure. */
  3967. if (ep == NULL) {
  3968. TRACE_FILTER("%!FUNC! Drop packet - no matching descriptor found");
  3969. acpt = FALSE;
  3970. goto unlock;
  3971. }
  3972. /* If this descriptor is being timed-out, do nothing - the connection has terminated
  3973. gracefully and the descriptor will be destroyed when it expires. */
  3974. if (ep->timeout) {
  3975. TRACE_FILTER("%!FUNC! Drop packet - matching descriptor found, already expired");
  3976. acpt = FALSE;
  3977. goto unlock;
  3978. }
  3979. /* To approbate the descriptor, we remove it from its place in the recovery queue
  3980. and move it to the tail; active descriptors are moved to the end of the queue
  3981. to attempt to prevent them from being recycled when we run out of free descriptors. */
  3982. Link_unlink(&(ep->rlink));
  3983. Queue_enq(&(lp->conn_rcvryq), &(ep->rlink));
  3984. TRACE_FILTER("%!FUNC! Accept packet - descriptor approbated");
  3985. acpt = TRUE;
  3986. unlock:
  3987. LOCK_EXIT(&(lp->lock), &irql);
  3988. return acpt;
  3989. }
  3990. ULONG Load_port_change(
  3991. PLOAD_CTXT lp,
  3992. ULONG ipaddr,
  3993. ULONG port,
  3994. ULONG cmd,
  3995. ULONG value)
  3996. {
  3997. PCVY_RULE rp; /* Pointer to configured port rules. */
  3998. PBIN_STATE bp; /* Pointer to load module port rule state. */
  3999. ULONG nrules; /* Number of rules. */
  4000. ULONG i;
  4001. ULONG ret = IOCTL_CVY_NOT_FOUND;
  4002. PMAIN_CTXT ctxtp = CONTAINING_RECORD(lp, MAIN_CTXT, load);
  4003. BOOLEAN bPortControlCmd;
  4004. UNIV_ASSERT(lp->code == CVY_LOADCODE);
  4005. if (! lp->active)
  4006. {
  4007. return IOCTL_CVY_NOT_FOUND;
  4008. }
  4009. bPortControlCmd = TRUE;
  4010. rp = (* (lp->params)).port_rules;
  4011. /* If we are draining whole cluster, include DEFAULT rule; Otherwise, just
  4012. include the user-defined rules (the DEFAULT rule is the last rule). */
  4013. if (cmd == IOCTL_CVY_CLUSTER_DRAIN || cmd == IOCTL_CVY_CLUSTER_PLUG)
  4014. nrules = (* (lp->params)).num_rules + 1;
  4015. else
  4016. nrules = (* (lp->params)).num_rules;
  4017. for (i=0; i<nrules; i++, rp++)
  4018. {
  4019. /* If the virtual IP address is IOCTL_ALL_VIPS (0x00000000), then we are applying this
  4020. change to all port rules for port X, regardless of VIP. If the virtual IP address is
  4021. to be applied to a particular VIP, then we apply only to port rules whose VIP matches.
  4022. Similarly, if the change is to apply to an "ALL VIP" rule, then we also apply when the
  4023. VIP matches because the caller uses CVY_ALL_VIP_NUMERIC_VALUE (0xffffffff) as the
  4024. virtual IP address, which is the same value stored in the port rule state. */
  4025. if ((ipaddr == IOCTL_ALL_VIPS || ipaddr == rp->virtual_ip_addr) &&
  4026. (port == IOCTL_ALL_PORTS || (port >= rp->start_port && port <= rp->end_port)))
  4027. {
  4028. bp = &(lp->pg_state[i]);
  4029. UNIV_ASSERT(bp->code == CVY_BINCODE); /* (bbain 8/19/99) */
  4030. /* If enabling a port rule, set the load amount to original value;
  4031. If disabling a port rule, set the load amount to zero;
  4032. Otherwise, set the load amount it to the specified amount. */
  4033. if (cmd == IOCTL_CVY_PORT_ON || cmd == IOCTL_CVY_CLUSTER_PLUG)
  4034. {
  4035. if (cmd == IOCTL_CVY_CLUSTER_PLUG)
  4036. {
  4037. bPortControlCmd = FALSE;
  4038. }
  4039. if (bp->load_amt[lp->my_host_id] == bp->orig_load_amt)
  4040. {
  4041. /* If we are the first port rule to match, then set the
  4042. return value to "Already"; Otherwise, we don't want to
  4043. overwrite some other port rule's return value of "OK"
  4044. in the case of ALL_VIPS or ALL_PORTS. */
  4045. if (ret == IOCTL_CVY_NOT_FOUND) ret = IOCTL_CVY_ALREADY;
  4046. continue;
  4047. }
  4048. /* Restore the original load amount. */
  4049. bp->load_amt[lp->my_host_id] = bp->orig_load_amt;
  4050. ret = IOCTL_CVY_OK;
  4051. }
  4052. else if (cmd == IOCTL_CVY_PORT_OFF)
  4053. {
  4054. if (bp->load_amt[lp->my_host_id] == 0)
  4055. {
  4056. /* If we are the first port rule to match, then set the
  4057. return value to "Already"; Otherwise, we don't want to
  4058. overwrite some other port rule's return value of "OK"
  4059. in the case of ALL_VIPS or ALL_PORTS. */
  4060. if (ret == IOCTL_CVY_NOT_FOUND) ret = IOCTL_CVY_ALREADY;
  4061. continue;
  4062. }
  4063. bp->load_amt[lp->my_host_id] = 0;
  4064. /* Immediately stop handling all traffic on the port group. */
  4065. bp->cmap = 0;
  4066. bp->cur_map[lp->my_host_id] = 0;
  4067. /* Re-initialize the performance counters. */
  4068. bp->packets_accepted = 0;
  4069. bp->packets_dropped = 0;
  4070. bp->bytes_accepted = 0;
  4071. bp->bytes_dropped = 0;
  4072. Load_conn_kill(lp, bp);
  4073. ret = IOCTL_CVY_OK;
  4074. }
  4075. else if (cmd == IOCTL_CVY_PORT_DRAIN || cmd == IOCTL_CVY_CLUSTER_DRAIN)
  4076. {
  4077. if (cmd == IOCTL_CVY_CLUSTER_DRAIN)
  4078. {
  4079. bPortControlCmd = FALSE;
  4080. }
  4081. if (bp->load_amt[lp->my_host_id] == 0)
  4082. {
  4083. /* If we are the first port rule to match, then set the
  4084. return value to "Already"; Otherwise, we don't want to
  4085. overwrite some other port rule's return value of "OK"
  4086. in the case of ALL_VIPS or ALL_PORTS. */
  4087. if (ret == IOCTL_CVY_NOT_FOUND) ret = IOCTL_CVY_ALREADY;
  4088. continue;
  4089. }
  4090. /* Set load weight to zero, but continue to handle existing connections. */
  4091. bp->load_amt[lp->my_host_id] = 0;
  4092. ret = IOCTL_CVY_OK;
  4093. }
  4094. else
  4095. {
  4096. UNIV_ASSERT(cmd == IOCTL_CVY_PORT_SET);
  4097. if (bp->load_amt[lp->my_host_id] == value)
  4098. {
  4099. /* If we are the first port rule to match, then set the
  4100. return value to "Already"; Otherwise, we don't want to
  4101. overwrite some other port rule's return value of "OK"
  4102. in the case of ALL_VIPS or ALL_PORTS. */
  4103. if (ret == IOCTL_CVY_NOT_FOUND) ret = IOCTL_CVY_ALREADY;
  4104. continue;
  4105. }
  4106. /* Set the load weight for this port rule. */
  4107. bp->orig_load_amt = value;
  4108. bp->load_amt[lp->my_host_id] = value;
  4109. ret = IOCTL_CVY_OK;
  4110. }
  4111. if (port != IOCTL_ALL_PORTS && ipaddr != IOCTL_ALL_VIPS) break;
  4112. }
  4113. }
  4114. /* If the cluster isn't already converging, then initiate convergence if the load weight of a port rule has been modified. */
  4115. if (ret == IOCTL_CVY_OK) {
  4116. if (bPortControlCmd)
  4117. {
  4118. // If enabled, fire wmi event indicating enable/disable/drain of ports on this node
  4119. if (NlbWmiEvents[PortRuleControlEvent].Enable)
  4120. {
  4121. WCHAR wsVip[CVY_MAX_VIRTUAL_IP_ADDR + 1];
  4122. Univ_ip_addr_ulong_to_str (ipaddr, wsVip);
  4123. // Form the VIP & Port number in case of All VIPs & All Ports
  4124. switch(cmd)
  4125. {
  4126. case IOCTL_CVY_PORT_ON:
  4127. NlbWmi_Fire_PortControlEvent(ctxtp, NLB_EVENT_PORT_ENABLED, wsVip, port);
  4128. break;
  4129. case IOCTL_CVY_PORT_OFF:
  4130. NlbWmi_Fire_PortControlEvent(ctxtp, NLB_EVENT_PORT_DISABLED, wsVip, port);
  4131. break;
  4132. case IOCTL_CVY_PORT_DRAIN:
  4133. NlbWmi_Fire_PortControlEvent(ctxtp, NLB_EVENT_PORT_DRAINING, wsVip, port);
  4134. break;
  4135. // For Port Set, do NOT fire event from here. This is 'cos it is only called in the
  4136. // reload case and the event is fired from the caller i.e. Main_apply_without_restart().
  4137. // The event is fired from the caller 'cos this function could be called more than
  4138. // one time (if there are multiple port rules) and we want to fire the event only once
  4139. case IOCTL_CVY_PORT_SET:
  4140. break;
  4141. default:
  4142. TRACE_CRIT("%!FUNC! Unexpected command code : 0x%x, NOT firing PortControl event", cmd);
  4143. break;
  4144. }
  4145. }
  4146. else
  4147. {
  4148. TRACE_VERB("%!FUNC! NOT generating event 'cos PortControlEvent event generation is disabled");
  4149. }
  4150. }
  4151. else // Node Control event
  4152. {
  4153. // If enabled, fire wmi event indicating starting/draining of nlb on this node
  4154. if (NlbWmiEvents[NodeControlEvent].Enable)
  4155. {
  4156. switch(cmd)
  4157. {
  4158. case IOCTL_CVY_CLUSTER_PLUG:
  4159. NlbWmi_Fire_NodeControlEvent(ctxtp, NLB_EVENT_NODE_STARTED);
  4160. break;
  4161. case IOCTL_CVY_CLUSTER_DRAIN:
  4162. NlbWmi_Fire_NodeControlEvent(ctxtp, NLB_EVENT_NODE_DRAINING);
  4163. break;
  4164. default:
  4165. TRACE_CRIT("%!FUNC! Unexpected command code : 0x%x, NOT firing NodeControl event", cmd);
  4166. break;
  4167. }
  4168. }
  4169. else
  4170. {
  4171. TRACE_VERB("%!FUNC! NOT generating event 'cos NodeControlEvent event generation is disabled");
  4172. }
  4173. }
  4174. if (lp->send_msg.state != HST_CVG) {
  4175. WCHAR me[20];
  4176. Univ_ulong_to_str (lp->my_host_id+1, me, 10);
  4177. /* Tracking convergence - Starting convergence because our port rule configuration has changed. */
  4178. LOG_MSGS(MSG_INFO_CONVERGING_NEW_RULES, me, me);
  4179. TRACE_CONVERGENCE("%!FUNC! Initiating convergence on host %d. Reason: Host %d has changed its port rule configuration.", lp->my_host_id+1, lp->my_host_id+1);
  4180. /* Tracking convergence. */
  4181. Load_convergence_start(lp);
  4182. // If enabled, fire wmi event indicating start of convergence
  4183. if (NlbWmiEvents[ConvergingEvent].Enable)
  4184. {
  4185. NlbWmi_Fire_ConvergingEvent(ctxtp,
  4186. NLB_EVENT_CONVERGING_MODIFIED_RULES,
  4187. ctxtp->params.ded_ip_addr,
  4188. ctxtp->params.host_priority);
  4189. }
  4190. else
  4191. {
  4192. TRACE_VERB("%!FUNC! NOT Generating NLB_EVENT_CONVERGING_MODIFIED_RULES 'cos ConvergingEvent generation disabled");
  4193. }
  4194. }
  4195. }
  4196. return ret;
  4197. } /* end Load_port_change */
  4198. ULONG Load_hosts_query(
  4199. PLOAD_CTXT lp,
  4200. BOOLEAN internal,
  4201. PULONG host_map)
  4202. {
  4203. WCHAR members[256] = L"";
  4204. WCHAR num[20] = L"";
  4205. WCHAR me[20] = L"";
  4206. PWCHAR ptr = members;
  4207. ULONG index = 0;
  4208. ULONG count = 0;
  4209. PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
  4210. UNIV_ASSERT(lp->code == CVY_LOADCODE);
  4211. for (index = 0; index < CVY_MAX_HOSTS; index++) {
  4212. if (lp->host_map & (1 << index)) {
  4213. ptr = Univ_ulong_to_str(index + 1, ptr, 10);
  4214. *ptr = L',';
  4215. ptr++;
  4216. count++;
  4217. }
  4218. }
  4219. if (count) ptr--;
  4220. *ptr = 0;
  4221. *host_map = lp->host_map;
  4222. Univ_ulong_to_str((*(lp->params)).host_priority, me, 10);
  4223. Univ_ulong_to_str(count, num, 10);
  4224. if (lp->send_msg.state != HST_NORMAL)
  4225. {
  4226. UNIV_PRINT_VERB(("Load_hosts_query: Current host map is %08x and converging", lp->host_map));
  4227. TRACE_VERB("%!FUNC! Current host map is 0x%08x and converging", lp->host_map);
  4228. if (internal)
  4229. {
  4230. /* If there are 9 or less members in the cluster, we can be sure that there
  4231. is enough room in an event log to list the members out. If not, it might
  4232. get truncated, so we might as well log a different event instead and tell
  4233. the user to perform a "wlbs query" to see the list. */
  4234. if (count < 10) {
  4235. LOG_MSGS(MSG_INFO_CONVERGING_LIST, me, members);
  4236. } else {
  4237. LOG_MSGS1(MSG_INFO_CONVERGING_MAP, me, num, *host_map);
  4238. }
  4239. }
  4240. return IOCTL_CVY_CONVERGING;
  4241. }
  4242. else if (lp->pg_state[(*(lp->params)).num_rules].cmap != 0)
  4243. {
  4244. UNIV_PRINT_VERB(("Load_hosts_query: Current host map is %08x and converged as DEFAULT", lp->host_map));
  4245. TRACE_VERB("%!FUNC! Current host map is 0x%08x and converged as DEFAULT", lp->host_map);
  4246. if (internal)
  4247. {
  4248. /* If there are 9 or less members in the cluster, we can be sure that there
  4249. is enough room in an event log to list the members out. If not, it might
  4250. get truncated, so we might as well log a different event instead and tell
  4251. the user to perform a "wlbs query" to see the list. */
  4252. if (count < 10) {
  4253. LOG_MSGS(MSG_INFO_MASTER_LIST, me, members);
  4254. } else {
  4255. LOG_MSGS1(MSG_INFO_MASTER_MAP, me, num, *host_map);
  4256. }
  4257. // If enabled, fire wmi event indicating cluster is converged
  4258. if (NlbWmiEvents[ConvergedEvent].Enable)
  4259. {
  4260. NlbWmi_Fire_ConvergedEvent(ctxtp, *host_map);
  4261. }
  4262. else
  4263. {
  4264. TRACE_VERB("%!FUNC! ConvergedEvent generation disabled");
  4265. }
  4266. }
  4267. return IOCTL_CVY_MASTER;
  4268. }
  4269. else
  4270. {
  4271. UNIV_PRINT_VERB(("Load_hosts_query: Current host map is %08x and converged (NON-DEFAULT)", lp->host_map));
  4272. TRACE_VERB("%!FUNC! Current host map is 0x%08x and converged (NON-DEFAULT)", lp->host_map);
  4273. if (internal)
  4274. {
  4275. /* If there are 9 or less members in the cluster, we can be sure that there
  4276. is enough room in an event log to list the members out. If not, it might
  4277. get truncated, so we might as well log a different event instead and tell
  4278. the user to perform a "wlbs query" to see the list. */
  4279. if (count < 10) {
  4280. LOG_MSGS(MSG_INFO_SLAVE_LIST, me, members);
  4281. } else {
  4282. LOG_MSGS1(MSG_INFO_SLAVE_MAP, me, num, *host_map);
  4283. }
  4284. // If enabled, fire wmi event indicating cluster is converged
  4285. if (NlbWmiEvents[ConvergedEvent].Enable)
  4286. {
  4287. NlbWmi_Fire_ConvergedEvent(ctxtp, *host_map);
  4288. }
  4289. else
  4290. {
  4291. TRACE_VERB("%!FUNC! ConvergedEvent generation disabled");
  4292. }
  4293. }
  4294. return IOCTL_CVY_SLAVE;
  4295. }
  4296. } /* end Load_hosts_query */
  4297. /*
  4298. * Function: Load_query_packet_filter
  4299. * Description: This function takes a IP tuple and protocol and consults the load-
  4300. * balancing state to determine whether or not this packet would be
  4301. * accepted by the load module. In either case, the reason for the
  4302. * decision is also provided, plus, in most cases, some of the load
  4303. * module state is also returned to provide some context to justify
  4304. * the decision. This function is COMPLETELY unobtrusive and makes
  4305. * NO changes to the actual state of the load module.
  4306. * Parameters: lp - a pointer to the load module.
  4307. * pQuery - a pointer to a buffer into which the results are placed.
  4308. * svr_ipaddr - the server side IP address of this virtual packet.
  4309. * svr_port - the server side port of this virtual packet.
  4310. * client_ipaddr - the client side IP address of this virtual packet.
  4311. * client_ipaddr - the client side port of this virtual packet.
  4312. * protocol - the protocol of this virtual packet (UDP, TCP or IPSec1).
  4313. * limit_map_fin - a boolean indication of whether or not to use server
  4314. * side parameters in the Map function. This is controlled
  4315. * by BDA teaming.
  4316. * reverse_hash - whether or not to reverse client and server during hashing
  4317. * Returns: Nothing.
  4318. * Author: shouse, 5.18.01
  4319. * Notes: This function is only observatory and makes NO changes to the state of
  4320. * the load module.
  4321. */
  4322. VOID Load_query_packet_filter (
  4323. PLOAD_CTXT lp,
  4324. PNLB_OPTIONS_PACKET_FILTER pQuery,
  4325. ULONG svr_ipaddr,
  4326. ULONG svr_port,
  4327. ULONG client_ipaddr,
  4328. ULONG client_port,
  4329. USHORT protocol,
  4330. UCHAR flags,
  4331. BOOLEAN limit_map_fn,
  4332. BOOLEAN reverse_hash)
  4333. {
  4334. PBIN_STATE bp;
  4335. ULONG hash;
  4336. ULONG index;
  4337. ULONG bin;
  4338. QUEUE * qp;
  4339. /* This variable is used for port rule lookup and since the port rules only cover
  4340. UDP and TCP, we categorize as TCP and non-TCP, meaning that any protocol that's
  4341. not TCP will be treated like UDP for the sake of port rule lookup. */
  4342. BOOLEAN is_tcp_pkt = IS_TCP_PKT(protocol);
  4343. /* Further, some protocols are treated with "session" semantics, while others are
  4344. not. For TCP, this "session" is currently a single TCP connection, which is
  4345. tracked from SYN to FIN using a connection descriptor. IPSec "sessions" are
  4346. also tracked using descriptors, so even though its treated like UDP for port
  4347. rule lookup, its treated with the session semantics resembling TCP. Therefore,
  4348. by default the determination of a session packet is initially the same as the
  4349. determination of a TCP packet. */
  4350. BOOLEAN is_session_pkt = IS_SESSION_PKT(protocol);
  4351. UNIV_ASSERT(lp);
  4352. UNIV_ASSERT(pQuery);
  4353. UNIV_ASSERT(lp->code == CVY_LOADCODE);
  4354. /* If the load module has been "turned off", then we drop the packet. */
  4355. if (!lp->active) {
  4356. pQuery->Accept = NLB_REJECT_LOAD_MODULE_INACTIVE;
  4357. return;
  4358. }
  4359. /* Find the port rule for this server IP address / port pair. */
  4360. bp = Load_pg_lookup(lp, svr_ipaddr, svr_port, is_tcp_pkt);
  4361. UNIV_ASSERT ((is_tcp_pkt && bp->prot != CVY_UDP) || (!is_tcp_pkt && bp->prot != CVY_TCP));
  4362. /* If the matching port rule is configured as "disabled", which means to drop any
  4363. packets that match the rule, then we drop the packet. */
  4364. if (bp->mode == CVY_NEVER) {
  4365. pQuery->Accept = NLB_REJECT_PORT_RULE_DISABLED;
  4366. return;
  4367. }
  4368. /* If the applicable port rule is configured in "No" affinity mode, make sure enough
  4369. information has been specified in the query to faithfully determine packet ownership. */
  4370. if (bp->affinity == CVY_AFFINITY_NONE) {
  4371. /* VPN protocols REQUIRE either "Single" or "Class C" affinity; reject the request. */
  4372. if ((protocol == TCPIP_PROTOCOL_GRE) || (protocol == TCPIP_PROTOCOL_PPTP) || (protocol == TCPIP_PROTOCOL_IPSEC1)) {
  4373. pQuery->Accept = NLB_UNKNOWN_NO_AFFINITY;
  4374. return;
  4375. /* Hasing in "No" affinity requires the client port; if it wasn't specified, reject
  4376. the request. We check for a non-zero server port to special case ICMP filtering,
  4377. which sets BOTH ports to zero legally. */
  4378. } else if ((client_port == 0) && (svr_port != 0)) {
  4379. pQuery->Accept = NLB_UNKNOWN_NO_AFFINITY;
  4380. return;
  4381. }
  4382. }
  4383. /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
  4384. hash = Load_simple_hash(svr_ipaddr, svr_port, client_ipaddr, client_port);
  4385. index = hash % CVY_MAX_CHASH;
  4386. /* Compute the hash. */
  4387. hash = Load_complex_hash(svr_ipaddr, svr_port, client_ipaddr, client_port, bp->affinity, reverse_hash, limit_map_fn);
  4388. bin = hash % CVY_MAXBINS;
  4389. /* At this point, we can begin providing the requestee some actual information about
  4390. the state of the load module to better inform them as to why the decision we return
  4391. them was actually made. Here will provide some appropriate information about the
  4392. port rule we are operating on, including the "bucket" ID, the current "bucket"
  4393. ownership map and the number of connections active on this "bucket". */
  4394. pQuery->HashInfo.Valid = TRUE;
  4395. pQuery->HashInfo.Bin = bin;
  4396. pQuery->HashInfo.CurrentMap = bp->cmap;
  4397. pQuery->HashInfo.AllIdleMap = bp->all_idle_map;
  4398. pQuery->HashInfo.ActiveConnections = bp->nconn[bin];
  4399. /* If the packet is a connection control packet (TCP SYN/FIN/RST or IPSec MMSA, etc),
  4400. then we treat it differently than normal connection data. Mimics Load_conn_advise(). */
  4401. #if defined (NLB_TCP_NOTIFICATION)
  4402. /* If notifications are turned ON, then we only want to traverse this path if its a session-ful SYN.
  4403. FINs and RSTs should fall into the Load_packet_check path. If notification is NOT ON, then fall
  4404. through here for all SYNs, FINs and RSTs for session-ful protocols. */
  4405. if (is_session_pkt && ((flags & NLB_FILTER_FLAGS_CONN_UP) || (((flags & NLB_FILTER_FLAGS_CONN_DOWN) || (flags & NLB_FILTER_FLAGS_CONN_RESET)) && !NLB_NOTIFICATIONS_ON())))
  4406. #else
  4407. if (is_session_pkt && ((flags & NLB_FILTER_FLAGS_CONN_UP) || (flags & NLB_FILTER_FLAGS_CONN_DOWN) || (flags & NLB_FILTER_FLAGS_CONN_RESET)))
  4408. #endif
  4409. {
  4410. PCONN_ENTRY ep;
  4411. /* If this host does not own the bucket and the packet is not a connection
  4412. down or connection reset for a non-idle bin, then we don't own the packet. */
  4413. if (((bp->cmap & (((MAP_T) 1) << bin)) == 0) && (!(((flags & NLB_FILTER_FLAGS_CONN_DOWN) || (flags & NLB_FILTER_FLAGS_CONN_RESET)) && (bp->nconn[bin] > 0)))) {
  4414. pQuery->Accept = NLB_REJECT_OWNED_ELSEWHERE;
  4415. return;
  4416. }
  4417. /* At this point, we _might_ own the packet - if its a connection up, then
  4418. we definately do, because we own the bucket it maps to. */
  4419. if (flags & NLB_FILTER_FLAGS_CONN_UP) {
  4420. pQuery->Accept = NLB_ACCEPT_UNCONDITIONAL_OWNERSHIP;
  4421. return;
  4422. }
  4423. /* Look for an existing matching connection descriptor. */
  4424. ep = Load_find_dscr(lp, index, pQuery->ServerIPAddress, pQuery->ServerPort, pQuery->ClientIPAddress, pQuery->ClientPort, pQuery->Protocol);
  4425. /* If we haven't found a matching connection descriptor, then this host
  4426. certainly does not own this packet. */
  4427. if (ep == NULL) {
  4428. pQuery->Accept = NLB_REJECT_OWNED_ELSEWHERE;
  4429. return;
  4430. }
  4431. UNIV_ASSERT(ep->code == CVY_ENTRCODE);
  4432. /* If we find a match in the static hash table, fill in some descriptor
  4433. information for the user, including whether or not the descriptor was
  4434. allocated or static (static is this case) and the observed FIN count. */
  4435. pQuery->DescriptorInfo.Valid = TRUE;
  4436. pQuery->DescriptorInfo.Alloc = (ep->flags & NLB_CONN_ENTRY_FLAGS_ALLOCATED) ? TRUE : FALSE;
  4437. pQuery->DescriptorInfo.Dirty = (ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY) ? TRUE : FALSE;
  4438. pQuery->DescriptorInfo.RefCount = ep->ref_count;
  4439. /* If the connection is dirty, we do not take the packet because TCP may
  4440. have stale information for this descriptor. */
  4441. if (ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY) {
  4442. pQuery->Accept = NLB_REJECT_CONNECTION_DIRTY;
  4443. return;
  4444. }
  4445. /* If the connection is not dirty, we'll take the packet, as it belongs
  4446. to an existing connection that we are servicing on this host. */
  4447. pQuery->Accept = NLB_ACCEPT_FOUND_MATCHING_DESCRIPTOR;
  4448. return;
  4449. /* Otherwise, if its not a control packet, then its just a data packet, which
  4450. requires that either we unconditionally own this connection (if all other
  4451. hosts are idle on the bucket this packet maps to), or that we have an active
  4452. connection descriptor for this connection. Mimics load_packet_check(). */
  4453. } else {
  4454. /* If we currently own the "bucket" to which this connection maps and either NLB provides
  4455. no session support for this protocol, or all other hosts have no exisitng connections
  4456. on this "bucket" and we have no dirty connections, then we can safely take the packet
  4457. with no regard to the connection (session) descriptors. */
  4458. if (((bp->cmap & (((MAP_T) 1) << bin)) != 0) && (!is_session_pkt || (((bp->all_idle_map & (((MAP_T) 1) << bin)) != 0) && (!(lp->cln_waiting))))) {
  4459. pQuery->Accept = NLB_ACCEPT_UNCONDITIONAL_OWNERSHIP;
  4460. return;
  4461. /* Otherwise, if there are active connections on this "bucket" or if we own the
  4462. "bucket" and there are dirty connections on it, then we'll walk our descriptor
  4463. lists to determine whether or not we should take the packet or not. */
  4464. } else if (bp->nconn[bin] > 0 || (lp->cln_waiting && lp->dirty_bin[bin] && ((bp->cmap & (((MAP_T) 1) << bin)) != 0))) {
  4465. PCONN_ENTRY ep;
  4466. /* Look for an existing matching connection descriptor. */
  4467. ep = Load_find_dscr(lp, index, pQuery->ServerIPAddress, pQuery->ServerPort, pQuery->ClientIPAddress, pQuery->ClientPort, pQuery->Protocol);
  4468. /* If we haven't found a matching connection descriptor, then this host
  4469. certainly does not own this packet. */
  4470. if (ep == NULL) {
  4471. pQuery->Accept = NLB_REJECT_OWNED_ELSEWHERE;
  4472. return;
  4473. }
  4474. UNIV_ASSERT(ep->code == CVY_ENTRCODE);
  4475. /* If we find a match in the static hash table, fill in some descriptor
  4476. information for the user, including whether or not the descriptor was
  4477. allocated or static (static is this case) and the observed FIN count. */
  4478. pQuery->DescriptorInfo.Valid = TRUE;
  4479. pQuery->DescriptorInfo.Alloc = (ep->flags & NLB_CONN_ENTRY_FLAGS_ALLOCATED) ? TRUE : FALSE;
  4480. pQuery->DescriptorInfo.Dirty = (ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY) ? TRUE : FALSE;
  4481. pQuery->DescriptorInfo.RefCount = ep->ref_count;
  4482. /* If the connection is dirty, we do not take the packet because TCP may
  4483. have stale information for this descriptor. */
  4484. if (ep->flags & NLB_CONN_ENTRY_FLAGS_DIRTY) {
  4485. pQuery->Accept = NLB_REJECT_CONNECTION_DIRTY;
  4486. return;
  4487. }
  4488. /* If the connection is not dirty, we'll take the packet, as it belongs
  4489. to an existing connection that we are servicing on this host. */
  4490. pQuery->Accept = NLB_ACCEPT_FOUND_MATCHING_DESCRIPTOR;
  4491. return;
  4492. }
  4493. }
  4494. /* If we get all the way down here, then we aren't going to accept the packet
  4495. because we do not own the "bucket" to which the packet maps and we have no
  4496. existing connection (session) state to allow us to service the packet. */
  4497. pQuery->Accept = NLB_REJECT_OWNED_ELSEWHERE;
  4498. return;
  4499. }
  4500. /*
  4501. * Function: Load_query_port_state
  4502. * Description: This function returns the state (enabled, disabled, draining) of a particular
  4503. * port rule and, if found, returns some packet handling statistics for the port
  4504. * rule, such as the number of packets and bytes accepted and dropped. These
  4505. * counters are reset whenever a load weight change is made on the port rule, or
  4506. * whenever the load module is stopped/started. This function is just a query
  4507. * and therefore makes NO changes to the actual state of any port rule.
  4508. * Parameters: lp - a pointer to the load module.
  4509. * pQuery - a pointer to a buffer into which the results are placed.
  4510. * ipaddr - the VIP for the port rule that we are looking for. When per-VIP rules
  4511. * are not used, this IP address is 255.255.255.255 (0xffffffff).
  4512. * port - the port we are looking for. This function (and all other port rule
  4513. * operation functions, for that matter) identify a port rule by a port
  4514. * number within the range of a rule. Therefore, 80 identifies the port
  4515. * rule whose start port is 0 and whose end port is 1024, for instance.
  4516. * Returns: Nothing.
  4517. * Author: shouse, 5.18.01
  4518. * Notes: It is very important that this function operates completely unobtrusively.
  4519. */
  4520. VOID Load_query_port_state (
  4521. PLOAD_CTXT lp,
  4522. PNLB_OPTIONS_PORT_RULE_STATE pQuery,
  4523. ULONG ipaddr,
  4524. USHORT port)
  4525. {
  4526. PCVY_RULE rp; /* Pointer to configured port rules. */
  4527. PBIN_STATE bp; /* Pointer to load module port rule state. */
  4528. ULONG nrules; /* Number of configured port rules. */
  4529. ULONG i;
  4530. UNIV_ASSERT(lp);
  4531. UNIV_ASSERT(pQuery);
  4532. UNIV_ASSERT(lp->code == CVY_LOADCODE);
  4533. /* If the load module is inactive, all rules are in a default state, so
  4534. since there is nothing interesting to report, bail out and report that
  4535. the port rule could not be found. */
  4536. if (!lp->active) {
  4537. pQuery->Status = NLB_PORT_RULE_NOT_FOUND;
  4538. return;
  4539. }
  4540. /* Begin by assuming that we won't find a corresponding rule. */
  4541. pQuery->Status = NLB_PORT_RULE_NOT_FOUND;
  4542. /* Grab a pointer to the beginning of the port rules array. These are the port
  4543. rules are read from the registry, so no state is associated with them. */
  4544. rp = (*(lp->params)).port_rules;
  4545. /* Find out how many port rules to loop through. */
  4546. nrules = (*(lp->params)).num_rules;
  4547. /* Loop through all port rules looking for a match. */
  4548. for (i = 0; i < nrules; i++, rp++) {
  4549. /* If the VIP matches (this check includes the check for ALL VIP, which is coded as
  4550. 0xffffffff by both the user-level software and the load module) and the port number
  4551. is within the range of this port rule, we have a winner. */
  4552. if ((ipaddr == rp->virtual_ip_addr) && ((port >= rp->start_port) && (port <= rp->end_port))) {
  4553. /* Get a pointer to the load module port rule state for this rule. The load
  4554. module stores the port rules in the same order as they are read from the
  4555. registry and stored in the NLB params, so we can use the index of the loop
  4556. to directly index into the corresponding load module state for this rule. */
  4557. bp = &(lp->pg_state[i]);
  4558. UNIV_ASSERT(bp->code == CVY_BINCODE);
  4559. /* If the load weight is zero, this could be because either the rule is
  4560. disabled or because it is in the process of draining. */
  4561. if (bp->load_amt[lp->my_host_id] == 0) {
  4562. /* If the current number of connections being served on this port
  4563. rule is non-zero, then this port rule is being drained - the
  4564. count is decremented by every completed connection and goes to
  4565. zero when the rule is finished draining. */
  4566. if (bp->tconn) {
  4567. pQuery->Status = NLB_PORT_RULE_DRAINING;
  4568. } else {
  4569. pQuery->Status = NLB_PORT_RULE_DISABLED;
  4570. }
  4571. /* If the port rule has a non-zero load weight, then it is enabled. */
  4572. } else {
  4573. pQuery->Status = NLB_PORT_RULE_ENABLED;
  4574. }
  4575. /* Fill in some statistics for this port rule, including the number
  4576. of packets and bytes accepted and dropped, which can be used to
  4577. create an estimate of actual load balancing performance. */
  4578. pQuery->Statistics.Packets.Accepted = bp->packets_accepted;
  4579. pQuery->Statistics.Packets.Dropped = bp->packets_dropped;
  4580. pQuery->Statistics.Bytes.Accepted = bp->bytes_accepted;
  4581. pQuery->Statistics.Bytes.Dropped = bp->bytes_dropped;
  4582. break;
  4583. }
  4584. }
  4585. }
  4586. /*
  4587. * Function: Load_query_convergence_info
  4588. * Description: Queries the load module for the convergence statistics
  4589. * Parameters: lp - a pointer to the load module context.
  4590. * OUT num_cvgs - a pointer to a ULONG to hold the total number of convergences on this host.
  4591. * OUT last_cvg - a pointer to a ULONG to hold the time since the last convergence completed.
  4592. * Returns: BOOLEAN - whether or not the load module is active. If TRUE, then the OUT params were filled in.
  4593. * Author: shouse, 10.30.01
  4594. * Notes:
  4595. */
  4596. BOOLEAN Load_query_convergence_info (PLOAD_CTXT lp, PULONG num_cvgs, PULONG last_cvg)
  4597. {
  4598. PPING_MSG sendp;
  4599. UNIV_ASSERT(lp->code == CVY_LOADCODE);
  4600. /* If the load module is inactive, return failure. */
  4601. if (!lp->active)
  4602. return FALSE;
  4603. /* Get a pointer to our heartbeat. */
  4604. sendp = &(lp->send_msg);
  4605. /* Otherwise, fill in the total number of convergences since this host has joined
  4606. the cluster and the time, in seconds, since the last convergence completed. */
  4607. *num_cvgs = lp->num_convergences;
  4608. /* If the host is converged, then the time since the last convergence is the
  4609. current time minus the timestamp of the last convergence. Otherwise, the
  4610. last convergence has not yet completed, so return zero (in progress). */
  4611. if (sendp->state == HST_NORMAL)
  4612. *last_cvg = lp->clock_sec - lp->last_convergence;
  4613. else
  4614. *last_cvg = NLB_QUERY_TIME_INVALID;
  4615. return TRUE;
  4616. }
  4617. /*
  4618. * Function: Load_query_statistics
  4619. * Description: Queries the load module for some relevant statisitics
  4620. * Parameters: lp - a pointer to the load module context.
  4621. * OUT num_cvgs - a pointer to a ULONG to hold the current number of active connections
  4622. * OUT last_cvg - a pointer to a ULONG to hold the total number of descriptors allocated thusfar
  4623. * Returns: BOOLEAN - whether or not the load module is active. If TRUE, then the OUT params were filled in.
  4624. * Author: shouse, 4.19.02
  4625. * Notes:
  4626. */
  4627. BOOLEAN Load_query_statistics (PLOAD_CTXT lp, PULONG num_conn, PULONG num_dscr)
  4628. {
  4629. UNIV_ASSERT(lp->code == CVY_LOADCODE);
  4630. /* If the load module is inactive, return failure. */
  4631. if (!lp->active)
  4632. return FALSE;
  4633. /* The total number of ACTIVE connections across all port rules. */
  4634. *num_conn = lp->nconn;
  4635. /* The number of descriptors allocated thusfar. */
  4636. *num_dscr = lp->num_dscr_out;
  4637. return TRUE;
  4638. }
  4639. #if defined (NLB_TCP_NOTIFICATION)
  4640. /*
  4641. * Function: Load_conn_up
  4642. * Description: This function is called to create state to track a connection (usually TCP
  4643. * or IPSec/L2TP). This is not a function to ask the load module whether or
  4644. * not to accept a packet, rather it is a request to create state to track a
  4645. * connection that is being established.
  4646. * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
  4647. * svr_ipaddr - the server IP address in network byte order
  4648. * svr_port - the server port in host byte order
  4649. * client_ipaddr - the client IP address in network byte order
  4650. * client_port - the client port in host byte order
  4651. * protocol - the protocol of this connection
  4652. * limit_map_fn - whether or not to include server-side parameters in hashing
  4653. * reverse_hash - whether or not to reverse client and server during hashing
  4654. * Returns: BOOLEAN - whether or not state was successfully created to track this connection.
  4655. * Author: shouse, 4.15.02
  4656. * Notes: DO NOT CALL THIS FUNCTION WITH THE LOAD LOCK HELD.
  4657. */
  4658. BOOLEAN Load_conn_up (
  4659. PLOAD_CTXT lp,
  4660. ULONG svr_ipaddr,
  4661. ULONG svr_port,
  4662. ULONG client_ipaddr,
  4663. ULONG client_port,
  4664. USHORT protocol,
  4665. BOOLEAN limit_map_fn,
  4666. BOOLEAN reverse_hash)
  4667. {
  4668. ULONG hash;
  4669. ULONG vindex;
  4670. ULONG index;
  4671. ULONG bin;
  4672. PBIN_STATE bp;
  4673. PCONN_ENTRY ep;
  4674. IRQLEVEL irql;
  4675. PNDIS_SPIN_LOCK lockp = GET_LOAD_LOCK(lp);
  4676. BOOLEAN is_tcp_pkt = IS_TCP_PKT(protocol);
  4677. BOOLEAN acpt = TRUE;
  4678. UNIV_ASSERT(lp->code == CVY_LOADCODE);
  4679. TRACE_FILTER("%!FUNC! Enter: lp = %p, server IP = %u.%u.%u.%u, server port = %u, client IP = %u.%u.%u.%u, client port = %u, protocol = %u, limit map = %u, reverse hash = %u",
  4680. lp, IP_GET_OCTET(svr_ipaddr, 0), IP_GET_OCTET(svr_ipaddr, 1), IP_GET_OCTET(svr_ipaddr, 2), IP_GET_OCTET(svr_ipaddr, 3), svr_port,
  4681. IP_GET_OCTET(client_ipaddr, 0), IP_GET_OCTET(client_ipaddr, 1), IP_GET_OCTET(client_ipaddr, 2), IP_GET_OCTET(client_ipaddr, 3), client_port,
  4682. protocol, limit_map_fn, reverse_hash);
  4683. /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
  4684. hash = Load_simple_hash(svr_ipaddr, svr_port, client_ipaddr, client_port);
  4685. /* Our index in all connection arrays is this hash, modulo the array size. */
  4686. index = hash % CVY_MAX_CHASH;
  4687. /* ALWAYS lock the global queues BEFORE locking the load module itself. */
  4688. NdisAcquireSpinLock(&g_conn_estabq[index].lock);
  4689. /* Lock the particular load module instance. */
  4690. NdisAcquireSpinLock(lockp);
  4691. /* If the load module is inactive, drop the packet and return here. */
  4692. if (!lp->active) {
  4693. TRACE_FILTER("%!FUNC! Drop packet - load module is inactive");
  4694. acpt = FALSE;
  4695. goto exit;
  4696. }
  4697. /* Find the port rule for this connection. */
  4698. bp = Load_pg_lookup(lp, svr_ipaddr, svr_port, is_tcp_pkt);
  4699. /* Handle CVY_NEVER immediately. */
  4700. if (bp->mode == CVY_NEVER) {
  4701. TRACE_FILTER("%!FUNC! Drop packet - port rule %u is disabled\n", bp->index);
  4702. acpt = FALSE;
  4703. goto exit;
  4704. }
  4705. /* Compute the hash. */
  4706. hash = Load_complex_hash(svr_ipaddr, svr_port, client_ipaddr, client_port, bp->affinity, reverse_hash, limit_map_fn);
  4707. /* Now hash client address to bin id. */
  4708. bin = hash % CVY_MAXBINS;
  4709. LOCK_ENTER(&(lp->lock), &irql);
  4710. /* Look for an existing matching connection descriptor. */
  4711. ep = Load_find_dscr(lp, index, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);
  4712. /* If there is no existing descriptor using this tuple, or if there is one, but its reference
  4713. count is zero, then the descriptor is NOT on the global connection queue; otherwise it is. */
  4714. if ((ep != NULL) && (ep->ref_count != 0)) {
  4715. /* Temporarily pull this descriptor off of the global connection queue. We'll end up putting
  4716. it back on later, but this way we can UNCONDITIONALLY link to the queue when the time comes. */
  4717. g_conn_estabq[index].length--;
  4718. Link_unlink(&ep->glink);
  4719. }
  4720. /* Create a new connection descriptor to track this connection. */
  4721. ep = Load_create_dscr(lp, bp, ep, index, bin);
  4722. /* If, for some reason, we were unable to create state for this connection, bail out here. */
  4723. if (ep == NULL) {
  4724. TRACE_FILTER("%!FUNC! Drop packet - no available descriptors: Port rule = %u, Index = %u, Bin = %u, Current map = 0x%015I64x, "
  4725. "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
  4726. bp->index, index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);
  4727. acpt = FALSE;
  4728. goto unlock;
  4729. }
  4730. /* Set the connection information in the descriptor. */
  4731. CVY_CONN_SET(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);
  4732. /* Insert the descriptor into the global connection queue. */
  4733. g_conn_estabq[index].length++;
  4734. Queue_enq(&g_conn_estabq[index].queue, &ep->glink);
  4735. /* If this is a new PPTP tunnel, create or update a virtual descriptor to track the GRE data packets. */
  4736. if (protocol == TCPIP_PROTOCOL_PPTP) {
  4737. /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
  4738. hash = Load_simple_hash(svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT);
  4739. /* Our index in all connection arrays is this hash, modulo the array size. */
  4740. vindex = hash % CVY_MAX_CHASH;
  4741. /* Look for an existing matching virtual connection descriptor. */
  4742. ep = Load_find_dscr(lp, vindex, svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT, TCPIP_PROTOCOL_GRE);
  4743. /* Create or update a virtual descriptor for the GRE traffic. */
  4744. ep = Load_create_dscr(lp, bp, ep, vindex, bin);
  4745. /* If we can't allocate the virtual descriptor, bail out, but don't fail. */
  4746. if (ep == NULL) goto unlock;
  4747. /* Set the connection information in the descriptor. */
  4748. CVY_CONN_SET(ep, svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT, TCPIP_PROTOCOL_GRE);
  4749. /* Set the virtual descriptor flag. */
  4750. ep->flags |= NLB_CONN_ENTRY_FLAGS_VIRTUAL;
  4751. }
  4752. /* If this is a new IPSEC tunnel, create or update a virtual descriptor to track the UDP subsequent data fragments. */
  4753. else if (protocol == TCPIP_PROTOCOL_IPSEC1) {
  4754. /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
  4755. hash = Load_simple_hash(svr_ipaddr, IPSEC_CTRL_PORT, client_ipaddr, IPSEC_CTRL_PORT);
  4756. /* Our index in all connection arrays is this hash, modulo the array size. */
  4757. vindex = hash % CVY_MAX_CHASH;
  4758. /* Look for an existing matching virtual connection descriptor. */
  4759. ep = Load_find_dscr(lp, vindex, svr_ipaddr, IPSEC_CTRL_PORT, client_ipaddr, IPSEC_CTRL_PORT, TCPIP_PROTOCOL_IPSEC_UDP);
  4760. /* Create or update a virtual descriptor for the UDP subsequent fragment traffic. */
  4761. ep = Load_create_dscr(lp, bp, ep, vindex, bin);
  4762. /* If we can't allocate the virtual descriptor, bail out, but don't fail. */
  4763. if (ep == NULL) goto unlock;
  4764. /* Set the connection information in the descriptor. */
  4765. CVY_CONN_SET(ep, svr_ipaddr, IPSEC_CTRL_PORT, client_ipaddr, IPSEC_CTRL_PORT, TCPIP_PROTOCOL_IPSEC_UDP);
  4766. /* Set the virtual descriptor flag. */
  4767. ep->flags |= NLB_CONN_ENTRY_FLAGS_VIRTUAL;
  4768. }
  4769. TRACE_FILTER("%!FUNC! Accept packet - connection state created: Port rule = %u, Index = %u, Bin = %u, Current map = 0x%015I64x, "
  4770. "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
  4771. bp->index, index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);
  4772. acpt = TRUE;
  4773. unlock:
  4774. LOCK_EXIT(&(lp->lock), irql);
  4775. exit:
  4776. /* Unlock the load module. */
  4777. NdisReleaseSpinLock(lockp);
  4778. /* Unlock the global established connection queue. */
  4779. NdisReleaseSpinLock(&g_conn_estabq[index].lock);
  4780. TRACE_FILTER("%!FUNC! Exit: acpt = %u", acpt);
  4781. return acpt;
  4782. }
  4783. /*
  4784. * Function: Load_conn_down
  4785. * Description: This function is called to destroy the state being used to track an existing
  4786. * connection (usually TCP or IPSec/L2TP). If state for the given 5-tuple is
  4787. * found, it is de-referenced and destroyed if appropriate (based partially on
4788. the conn_status). If state is not found, FALSE is returned, but it is not
  4789. * considered a catastrophic error. In the case of TCP notifications, perhaps
  4790. * the connection was not even established across a NLB NIC.
  4791. * Parameters: svr_ipaddr - the server IP address in network byte order
  4792. * svr_port - the server port in host byte order
  4793. * client_ipaddr - the client IP address in network byte order
  4794. * client_port - the client port in host byte order
  4795. * protocol - the protocol of this connection
  4796. * conn_status - whether the connection is going DOWN or being RESET
  4797. * Returns: BOOLEAN - whether or not the connection state was found and updated.
  4798. * Author: shouse, 4.15.02
  4799. * Notes: DO NOT CALL THIS FUNCTION WITH THE LOAD LOCK HELD.
  4800. */
/*
 * Tear down connection-tracking state for the given IP 5-tuple.  The tuple is
 * searched for first in the global PENDING connection queue (outgoing
 * connections initiated but not yet established); a match there is simply
 * unlinked and freed.  Otherwise the global ESTABLISHED connection queue is
 * searched and a matching descriptor is de-referenced/destroyed via
 * Load_destroy_dscr (conn_status says whether this is a graceful DOWN or a
 * RESET).  For PPTP and IPSec tunnels, the shared virtual descriptor (GRE or
 * IPSec/UDP respectively) is de-referenced as well.  Returns FALSE when no
 * state is found, which is not a catastrophic error (the connection may not
 * have been established over an NLB NIC at all).
 * Lock order is ALWAYS: global queue lock first, then the load-module lock.
 * DO NOT call this function with the load lock held.
 */
BOOLEAN Load_conn_down (
ULONG svr_ipaddr,      /* Server IP address, network byte order. */
ULONG svr_port,        /* Server port, host byte order. */
ULONG client_ipaddr,   /* Client IP address, network byte order. */
ULONG client_port,     /* Client port, host byte order. */
USHORT protocol,       /* Protocol of this connection (TCP, PPTP, IPSec, ...). */
ULONG conn_status)     /* Whether the connection is going DOWN or being RESET. */
{
    PLOAD_CTXT lp;
    ULONG hash;
    ULONG vindex;
    ULONG index;
    ULONG bin;
    LINK * linkp;
    PBIN_STATE bp;
    PCONN_ENTRY ep;
    PPENDING_ENTRY pp;
    PNDIS_SPIN_LOCK lockp;
    BOOLEAN match = FALSE;
    BOOLEAN acpt = TRUE;
    PMAIN_CTXT ctxtp;      /* NOTE(review): unused in this function. */

    TRACE_FILTER("%!FUNC! Enter: server IP = %u.%u.%u.%u, server port = %u, client IP = %u.%u.%u.%u, client port = %u, protocol = %u, status = %u",
                 IP_GET_OCTET(svr_ipaddr, 0), IP_GET_OCTET(svr_ipaddr, 1), IP_GET_OCTET(svr_ipaddr, 2), IP_GET_OCTET(svr_ipaddr, 3), svr_port,
                 IP_GET_OCTET(client_ipaddr, 0), IP_GET_OCTET(client_ipaddr, 1), IP_GET_OCTET(client_ipaddr, 2), IP_GET_OCTET(client_ipaddr, 3), client_port, protocol, conn_status);

    /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
    hash = Load_simple_hash(svr_ipaddr, svr_port, client_ipaddr, client_port);

    /* Our index in all connection arrays is this hash, modulo the array size. */
    index = hash % CVY_MAX_CHASH;

    /* ALWAYS lock the global queues BEFORE locking the load module itself. */
    NdisAcquireSpinLock(&g_conn_pendingq[index].lock);

    /* Grab the entry at the front of this pending connection queue. */
    pp = (PPENDING_ENTRY)Queue_front(&g_conn_pendingq[index].queue);

    /* Walk the pending queue looking for a descriptor matching the 5-tuple. */
    while (pp != NULL) {
        UNIV_ASSERT(pp->code == CVY_PENDINGCODE);

        /* Look for a matching descriptor. */
        if (CVY_PENDING_MATCH(pp, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol)) {
            match = TRUE;
            break;
        }

        /* Get the next item in the queue. */
        pp = (PPENDING_ENTRY)Queue_next(&g_conn_pendingq[index].queue, &(pp->link));
    }

    /* If we found this connection in the pending connection queue, remove it from
       the queue, destroy the pending connection state and exit.  Otherwise, fall
       through and continue looking in the established connection queue. */
    if (match) {
        UNIV_ASSERT(pp);

        /* Remove the pending connection entry from the pending queue. */
        g_conn_pendingq[index].length--;
        Link_unlink(&pp->link);

        /* Free the descriptor back to the fixed-size block pool. */
        NdisFreeToBlockPool((PUCHAR)pp);

        /* Unlock the global pending connection queue. */
        NdisReleaseSpinLock(&g_conn_pendingq[index].lock);

        acpt = TRUE;
        goto exit;
    }

    /* No pending match - unlock the global pending connection queue. */
    NdisReleaseSpinLock(&g_conn_pendingq[index].lock);

    /* ALWAYS lock the global queues BEFORE locking the load module itself. */
    NdisAcquireSpinLock(&g_conn_estabq[index].lock);

    /* Grab the entry at the front of this established connection queue. */
    linkp = (LINK *)Queue_front(&g_conn_estabq[index].queue);

    /* Walk the established queue looking for a descriptor matching the 5-tuple. */
    while (linkp != NULL) {
        /* Get the CONN_ENTRY pointer from the link pointer. */
        ep = STRUCT_PTR(linkp, CONN_ENTRY, glink);

        UNIV_ASSERT(ep->code == CVY_ENTRCODE);

        /* Look for a matching descriptor. */
        if (CVY_CONN_MATCH(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol)) {
            match = TRUE;
            break;
        }

        /* Get the next item in the queue. */
        linkp = (LINK *)Queue_next(&g_conn_estabq[index].queue, &(ep->glink));
    }

    /* If no matching descriptor was found, bail out. */
    if (!match) {
        TRACE_FILTER("%!FUNC! Drop packet - no matching descriptor for RST/FIN: Index = %u", index);
        acpt = FALSE;
        goto unlock;
    }

    UNIV_ASSERT(ep);

    /* Unlink this descriptor here.  We have to do this here because if Load_destroy_dscr does in fact
       destroy the descriptor, we can't touch it once the function call returns.  So, we'll pull it off
       here unconditionally and if it turns out that there are still references on the descriptor, we'll
       put it back on when Load_destroy_dscr returns. */
    g_conn_estabq[index].length--;
    Link_unlink(&ep->glink);

    /* Grab a pointer to the load module on which the descriptor resides. */
    lp = ep->load;

    UNIV_ASSERT(lp->code == CVY_LOADCODE);

    /* Get a pointer to the load lock from the load context. */
    lockp = GET_LOAD_LOCK(lp);

    /* Lock the load module on which the connection resides. */
    NdisAcquireSpinLock(lockp);

    /* NOTE(review): irql is not declared in the visible locals - presumably
       supplied by the LOCK_ENTER/LOCK_EXIT macros; confirm against univ.h. */
    LOCK_ENTER(&(lp->lock), &irql);

    /* If we found state for this connection, the bin is the bin from the descriptor,
       not the calculated bin, which may not even been accurate if the port rules have
       been modified since this connection was established. */
    bin = ep->bin;

    /* Lookup the port rule so we can update the port rule info. */
    bp = Load_pg_lookup(lp, ep->svr_ipaddr, ep->svr_port, IS_TCP_PKT(ep->protocol));

    /* If references still remain on the descriptor, then put it back on the global connection queue. */
    if (Load_destroy_dscr(lp, bp, ep, conn_status)) {
        /* Insert the descriptor into the global connection queue. */
        g_conn_estabq[index].length++;
        Queue_enq(&g_conn_estabq[index].queue, &ep->glink);
    }

    /* If this is a PPTP tunnel going down, update the virtual GRE descriptor.  Virtual descriptors
       are ALWAYS de-referenced, not destroyed, even if the notification is a RST because these
       descriptors are potentially shared by multiple PPTP tunnels. */
    if (protocol == TCPIP_PROTOCOL_PPTP) {
        /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
        hash = Load_simple_hash(svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT);

        /* Our index in all connection arrays is this hash, modulo the array size. */
        vindex = hash % CVY_MAX_CHASH;

        /* Look for an existing matching connection descriptor.  Now that we have the load module pointer
           from finding the first descriptor, we can narrow our search and look only for virtual descriptors
           that reside on our load module. */
        ep = Load_find_dscr(lp, vindex, svr_ipaddr, PPTP_CTRL_PORT, client_ipaddr, PPTP_CTRL_PORT, TCPIP_PROTOCOL_GRE);

        /* Dereference the virtual GRE descriptor. */
        (VOID)Load_destroy_dscr(lp, bp, ep, conn_status);
    }
    /* If this is an IPSEC tunnel going down, update the virtual IPSEC_UDP descriptor.  Virtual descriptors
       are ALWAYS de-referenced, not destroyed, even if the notification is a RST because these
       descriptors are potentially shared by multiple IPSEC tunnels. */
    else if (protocol == TCPIP_PROTOCOL_IPSEC1) {
        /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
        hash = Load_simple_hash(svr_ipaddr, IPSEC_CTRL_PORT, client_ipaddr, IPSEC_CTRL_PORT);

        /* Our index in all connection arrays is this hash, modulo the array size. */
        vindex = hash % CVY_MAX_CHASH;

        /* Look for an existing matching virtual connection descriptor.  Now that we have the load module pointer
           from finding the first descriptor, we can narrow our search and look only for virtual descriptors
           that reside on our load module. */
        ep = Load_find_dscr(lp, vindex, svr_ipaddr, IPSEC_CTRL_PORT, client_ipaddr, IPSEC_CTRL_PORT, TCPIP_PROTOCOL_IPSEC_UDP);

        /* Dereference the virtual IPSec/UDP descriptor. */
        (VOID)Load_destroy_dscr(lp, bp, ep, conn_status);
    }

    TRACE_FILTER("%!FUNC! Accept packet - state found: Port rule = %u, Index = %u, Bin = %u, Current map = 0x%015I64x, "
                 "All idle map = 0x%015I64x, Connections = %u, Cleanup waiting = %u, Dirty %u",
                 bp->index, index, bin, bp->cmap, bp->all_idle_map, bp->nconn[bin], lp->cln_waiting, lp->dirty_bin[bin]);

    acpt = TRUE;

    LOCK_EXIT(&(lp->lock), irql);

    /* Unlock the load module. */
    NdisReleaseSpinLock(lockp);

 unlock:

    /* Unlock the global established connection queue. */
    NdisReleaseSpinLock(&g_conn_estabq[index].lock);

 exit:

    TRACE_FILTER("%!FUNC! Exit: acpt = %u", acpt);

    return acpt;
}
  4953. /*
  4954. * Function: Load_conn_pending
  4955. * Description: This function is called to create state for a pending OUTGOING connection on
  4956. * the server. Because at this time, it is unknown on what interface the connection
  4957. * will ultimately be established, NLB creates global state to track the connection
  4958. * only until it is established. For TCP, when the SYN+ACK arrives from the peer,
  4959. * we only accept it if we find a match in our pending connection queues. When the
  4960. * connection is established, this state is destroyed and new state is created to
4961. track the connection as appropriate.
  4962. * Parameters: svr_ipaddr - the server IP address in network byte order
  4963. * svr_port - the server port in host byte order
  4964. * client_ipaddr - the client IP address in network byte order
  4965. * client_port - the client port in host byte order
  4966. * protocol - the protocol of this connection
  4967. * Returns: BOOLEAN - whether or not state was successfully created to track this pending connection.
  4968. * Author: shouse, 4.15.02
  4969. * Notes: DO NOT CALL THIS FUNCTION WITH THE LOAD LOCK HELD.
  4970. */
  4971. BOOLEAN Load_conn_pending (
  4972. ULONG svr_ipaddr,
  4973. ULONG svr_port,
  4974. ULONG client_ipaddr,
  4975. ULONG client_port,
  4976. USHORT protocol)
  4977. {
  4978. ULONG hash;
  4979. ULONG index;
  4980. PPENDING_ENTRY pp = NULL;
  4981. BOOLEAN acpt = TRUE;
  4982. TRACE_FILTER("%!FUNC! Enter: server IP = %u.%u.%u.%u, server port = %u, client IP = %u.%u.%u.%u, client port = %u, protocol = %u",
  4983. IP_GET_OCTET(svr_ipaddr, 0), IP_GET_OCTET(svr_ipaddr, 1), IP_GET_OCTET(svr_ipaddr, 2), IP_GET_OCTET(svr_ipaddr, 3), svr_port,
  4984. IP_GET_OCTET(client_ipaddr, 0), IP_GET_OCTET(client_ipaddr, 1), IP_GET_OCTET(client_ipaddr, 2), IP_GET_OCTET(client_ipaddr, 3), client_port, protocol);
  4985. /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
  4986. hash = Load_simple_hash(svr_ipaddr, svr_port, client_ipaddr, client_port);
  4987. /* Our index in all connection arrays is this hash, modulo the array size. */
  4988. index = hash % CVY_MAX_CHASH;
  4989. /* If we falied to allocate the pending connection descriptor pool, bail out. */
  4990. if (g_pending_conn_pool == NULL)
  4991. {
  4992. /* Creation of the global pending connection state pool failed. */
  4993. TRACE_FILTER("%!FUNC! Drop packet - no global connection pending pool: Index = %u", index);
  4994. acpt = FALSE;
  4995. goto exit;
  4996. }
  4997. /* Allocate a descriptor from the fixed-size block pool. */
  4998. pp = (PPENDING_ENTRY)NdisAllocateFromBlockPool(g_pending_conn_pool);
  4999. if (pp == NULL) {
  5000. /* Allocation failed, bail out. */
  5001. TRACE_FILTER("%!FUNC! Drop packet - unable to allocate a pending connection entry: Index = %u", index);
  5002. acpt = FALSE;
  5003. goto exit;
  5004. }
  5005. /* Initialize the link. */
  5006. Link_init(&pp->link);
  5007. /* Fill in the "magic number". */
  5008. pp->code = CVY_PENDINGCODE;
  5009. /* Fill in the IP tuple. */
  5010. CVY_PENDING_SET(pp, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);
  5011. /* ALWAYS lock the global queues BEFORE locking the load module itself. */
  5012. NdisAcquireSpinLock(&g_conn_pendingq[index].lock);
  5013. /* Insert the descriptor into the global connection queue. */
  5014. g_conn_pendingq[index].length++;
  5015. Queue_enq(&g_conn_pendingq[index].queue, &pp->link);
  5016. /* Unlock the global pending connection queue. */
  5017. NdisReleaseSpinLock(&g_conn_pendingq[index].lock);
  5018. TRACE_FILTER("%!FUNC! Accept packet - pending connection state created: Index = %u", index);
  5019. acpt = TRUE;
  5020. exit:
  5021. TRACE_FILTER("%!FUNC! Exit: acpt = %u", acpt);
  5022. return acpt;
  5023. }
  5024. /*
  5025. * Function: Load_pending_check
  5026. * Description: This function is called to determine whether or not state exists in the pending
  5027. * connection queues for this connection. If it does, the packet should be accepted.
  5028. * If no state exists, the packet should be dropped.
  5029. * Parameters: svr_ipaddr - the server IP address in network byte order
  5030. * svr_port - the server port in host byte order
  5031. * client_ipaddr - the client IP address in network byte order
  5032. * client_port - the client port in host byte order
  5033. * protocol - the protocol of this connection
  5034. * Returns: BOOLEAN - whether or not to accept the packet.
  5035. * Author: shouse, 4.15.02
  5036. * Notes: DO NOT CALL THIS FUNCTION WITH THE LOAD LOCK HELD.
  5037. */
  5038. BOOLEAN Load_pending_check (
  5039. ULONG svr_ipaddr,
  5040. ULONG svr_port,
  5041. ULONG client_ipaddr,
  5042. ULONG client_port,
  5043. USHORT protocol)
  5044. {
  5045. ULONG hash;
  5046. ULONG index;
  5047. PPENDING_ENTRY pp = NULL;
  5048. BOOLEAN match = FALSE;
  5049. BOOLEAN acpt = TRUE;
  5050. TRACE_FILTER("%!FUNC! Enter: server IP = %u.%u.%u.%u, server port = %u, client IP = %u.%u.%u.%u, client port = %u, protocol = %u",
  5051. IP_GET_OCTET(svr_ipaddr, 0), IP_GET_OCTET(svr_ipaddr, 1), IP_GET_OCTET(svr_ipaddr, 2), IP_GET_OCTET(svr_ipaddr, 3), svr_port,
  5052. IP_GET_OCTET(client_ipaddr, 0), IP_GET_OCTET(client_ipaddr, 1), IP_GET_OCTET(client_ipaddr, 2), IP_GET_OCTET(client_ipaddr, 3), client_port, protocol);
  5053. /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
  5054. hash = Load_simple_hash(svr_ipaddr, svr_port, client_ipaddr, client_port);
  5055. /* Our index in all connection arrays is this hash, modulo the array size. */
  5056. index = hash % CVY_MAX_CHASH;
  5057. /* ALWAYS lock the global queues BEFORE locking the load module itself. */
  5058. NdisAcquireSpinLock(&g_conn_pendingq[index].lock);
  5059. /* Grab the entry at the front of this pending connection queue. */
  5060. pp = (PPENDING_ENTRY)Queue_front(&g_conn_pendingq[index].queue);
  5061. while (pp != NULL) {
  5062. UNIV_ASSERT(pp->code == CVY_PENDINGCODE);
  5063. /* Look for a matching descriptor. */
  5064. if (CVY_PENDING_MATCH(pp, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol)) {
  5065. match = TRUE;
  5066. break;
  5067. }
  5068. /* Get the next item in the queue. */
  5069. pp = (PPENDING_ENTRY)Queue_next(&g_conn_pendingq[index].queue, &(pp->link));
  5070. }
  5071. /* If no matching descriptor was found, bail out. */
  5072. if (!match) {
  5073. TRACE_FILTER("%!FUNC! Drop packet - no matching pending connection state for SYN+ACK: Index = %u", index);
  5074. acpt = FALSE;
  5075. goto exit;
  5076. }
  5077. TRACE_FILTER("%!FUNC! Accept packet - pending connection state found: Index = %u", index);
  5078. acpt = TRUE;
  5079. exit:
  5080. /* Unlock the global pending connection queue. */
  5081. NdisReleaseSpinLock(&g_conn_pendingq[index].lock);
  5082. TRACE_FILTER("%!FUNC! Exit: acpt = %u", acpt);
  5083. return acpt;
  5084. }
  5085. /*
  5086. * Function: Load_conn_establish
  5087. * Description: This function is invoked when a pending connection has become established.
  5088. * When the pending connection is established, its state in the pending
  5089. * connection queues is destroyed. If the connection was ultimately established
  5090. * on an NLB adapter (if lp != NULL), then state will be created to track this
  5091. * new connection. Otherwise, the operation consists only of destroying the
  5092. * pending connection state.
  5093. * Parameters: lp - a pointer to the load module context (LOAD_CTXT)
  5094. * svr_ipaddr - the server IP address in network byte order
  5095. * svr_port - the server port in host byte order
  5096. * client_ipaddr - the client IP address in network byte order
  5097. * client_port - the client port in host byte order
  5098. * protocol - the protocol of this connection
  5099. * limit_map_fn - whether or not to include server-side parameters in hashing
  5100. * reverse_hash - whether or not to reverse client and server during hashing
  5101. * Returns: BOOLEAN - whether or not the operation was successfully completed.
  5102. * Author: shouse, 4.15.02
  5103. * Notes: DO NOT CALL THIS FUNCTION WITH THE LOAD LOCK HELD.
  5104. */
  5105. BOOLEAN Load_conn_establish (
  5106. PLOAD_CTXT lp,
  5107. ULONG svr_ipaddr,
  5108. ULONG svr_port,
  5109. ULONG client_ipaddr,
  5110. ULONG client_port,
  5111. USHORT protocol,
  5112. BOOLEAN limit_map_fn,
  5113. BOOLEAN reverse_hash)
  5114. {
  5115. ULONG hash;
  5116. ULONG index;
  5117. PPENDING_ENTRY pp = NULL;
  5118. BOOLEAN match = FALSE;
  5119. BOOLEAN acpt = TRUE;
  5120. TRACE_FILTER("%!FUNC! Enter: lp = %p, server IP = %u.%u.%u.%u, server port = %u, client IP = %u.%u.%u.%u, client port = %u, protocol = %u, limit map = %u, reverse hash = %u",
  5121. lp, IP_GET_OCTET(svr_ipaddr, 0), IP_GET_OCTET(svr_ipaddr, 1), IP_GET_OCTET(svr_ipaddr, 2), IP_GET_OCTET(svr_ipaddr, 3), svr_port,
  5122. IP_GET_OCTET(client_ipaddr, 0), IP_GET_OCTET(client_ipaddr, 1), IP_GET_OCTET(client_ipaddr, 2), IP_GET_OCTET(client_ipaddr, 3), client_port,
  5123. protocol, limit_map_fn, reverse_hash);
  5124. /* Compute a simple and inexpensive hash on all parts of the IP tuple except the protocol. */
  5125. hash = Load_simple_hash(svr_ipaddr, svr_port, client_ipaddr, client_port);
  5126. /* Our index in all connection arrays is this hash, modulo the array size. */
  5127. index = hash % CVY_MAX_CHASH;
  5128. /* ALWAYS lock the global queues BEFORE locking the load module itself. */
  5129. NdisAcquireSpinLock(&g_conn_pendingq[index].lock);
  5130. /* Grab the entry at the front of this pending connection queue. */
  5131. pp = (PPENDING_ENTRY)Queue_front(&g_conn_pendingq[index].queue);
  5132. while (pp != NULL) {
  5133. UNIV_ASSERT(pp->code == CVY_PENDINGCODE);
  5134. /* Look for a matching descriptor. */
  5135. if (CVY_PENDING_MATCH(pp, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol)) {
  5136. match = TRUE;
  5137. break;
  5138. }
  5139. /* Get the next item in the queue. */
  5140. pp = (PPENDING_ENTRY)Queue_next(&g_conn_pendingq[index].queue, &(pp->link));
  5141. }
  5142. /* If no matching descriptor was found, bail out. */
  5143. if (!match) {
  5144. TRACE_FILTER("%!FUNC! Drop packet - no matching pending connection state: Index = %u", index);
  5145. /* Unlock the global pending connection queue. */
  5146. NdisReleaseSpinLock(&g_conn_pendingq[index].lock);
  5147. acpt = FALSE;
  5148. goto exit;
  5149. }
  5150. UNIV_ASSERT(pp);
  5151. /* Remove the pending connection entry from the pending queue. */
  5152. g_conn_pendingq[index].length--;
  5153. Link_unlink(&pp->link);
  5154. /* Unlock the global pending connection queue. */
  5155. NdisReleaseSpinLock(&g_conn_pendingq[index].lock);
  5156. /* Free the descriptor back to the fixed-size block pool. */
  5157. NdisFreeToBlockPool((PUCHAR)pp);
  5158. /* If the load module pointer is non-NULL, then this connection is being established on
  5159. an NLB adapter. If so, call Load_conn_up to create state to track the connection. */
  5160. if (lp != NULL) {
  5161. UNIV_ASSERT(lp->code == CVY_LOADCODE);
  5162. /* Create state for the connection. */
  5163. acpt = Load_conn_up(lp, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol, limit_map_fn, reverse_hash);
  5164. }
  5165. exit:
  5166. TRACE_FILTER("%!FUNC! Exit: acpt = %u", acpt);
  5167. return acpt;
  5168. }
  5169. #endif