Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

3952 lines
134 KiB

/*++
Copyright(c) 1998,99 Microsoft Corporation
Module Name:
load.c
Abstract:
Windows Load Balancing Service (WLBS)
Driver - load balancing algorithm
Author:
bbain
ToDo:
Kernel mode queue mgt
Fail safe mode (single server for everything)
--*/
#ifdef KERNEL_MODE
#include <ntddk.h>
#include "log.h"
#include "univ.h"
#include "main.h" // added for multiple nic
static ULONG log_module_id = LOG_MODULE_LOAD;
#else
#include <stdlib.h>
#include <windows.h>
#endif
#include <stdio.h>
#include "wlbsparm.h"
#include "params.h"
#include "wlbsiocl.h"
#include "wlbsip.h"
#include "load.h"
//
// For WPP Event Tracing
//
#include "trace.h" // for event tracing
#include "load.tmh" // for event tracing
/*
 * Build-specific shims: the user-mode build stubs out the driver's print,
 * string-conversion and event-log macros, while the kernel-mode build maps
 * malloc/free onto the kernel pool allocator.
 */
#ifndef KERNEL_MODE
/* Takes a parenthesized printf argument list; prints it, then a newline. */
#define UNIV_PRINT(s) { printf s ; printf ("\n"); }
/* User-mode stand-in: performs no conversion and simply yields the buffer argument. */
#define Univ_ulong_to_str(x, y, z) (y)
/* Event-log macros expand to nothing in the user-mode build. */
#define LOG_MSG(c,s)
#define LOG_MSG1(c,s,d1)
#define LOG_MSG2(c,s,d1,d2)
#define LOG_MSG3(c,s,d1,d2,d3)
#define LOG_MSG4(c,s,d1,d2,d3,d4)
#else
/* Kernel-mode build: allocate from non-paged pool, tagged with UNIV_POOL_TAG. */
#define malloc(s) ExAllocatePoolWithTag (NonPagedPool, s, UNIV_POOL_TAG)
#define free(s) ExFreePool (s)
#endif
/* Disabled access to a global parameter block, kept for reference: */
//extern CVY_PARAMS univ_params;
//#define univ_params ( * (lp -> params))
/* Forward declarations for routines that are used before they are defined. */
void Bin_state_print(PBIN_STATE bp, ULONG my_host_id);
void Load_conn_kill(PLOAD_CTXT lp, PBIN_STATE bp); /* v1.32B */
// static WCHAR buf [256]; /* string buffer (V1.1.2) */
/* CONSTANTS */
/* Superseded definition with all 64 bits set; compiled out. */
#if 0 /* v2.06 */
#define BIN_ALL_ONES ((MAP_T)-1) /* bin map state for 64 ones (v2.04) */
#endif
/* Active definition: fifteen hex F digits, i.e. the low 60 bits set.
   NOTE(review): presumably one bit per bin with CVY_MAXBINS == 60 - confirm. */
#define BIN_ALL_ONES ((MAP_T)(0xFFFFFFFFFFFFFFF)) /* bin map state for 60 ones (v2.04) */
/* FUNCTIONS */
/* Byte offset of a field in a structure of the specified type
   (computed by taking the field's address relative to a null base pointer): */
#define CVY_FIELD_OFFSET(type, field) ((LONG_PTR)&(((type *)0)->field))
/*
* Address of the base of the structure given its type, field name, and the
* address of that field within the structure. The field's byte offset is
* subtracted from the supplied address and the result is cast to (type *).
*/
#define STRUCT_PTR(address, type, field) ((type *)( \
(PCHAR)(address) - \
(PCHAR)CVY_FIELD_OFFSET(type, field)))
/*
 * Function: Load_teaming_consistency_notify
 * Description: Tells the BDA team that this adapter may belong to whether the
 *              teaming configuration seen in heartbeats is consistent. A
 *              consistent report sets this member's bit in the team's
 *              consistency map; an inconsistent report clears the bit and
 *              marks the whole team inactive.
 * Parameters: member - pointer to the team membership information for this adapter.
 *             consistent - TRUE if the configuration is consistent, FALSE otherwise.
 * Returns: Nothing.
 * Author: shouse, 3.29.01
 * Notes: The member's active flag is first read without the teaming lock so
 *        that the common (non-teamed) case stays cheap. Only when that read
 *        says we are teamed is the lock taken and the flag re-checked. If we
 *        are in the middle of joining a team and miss the update, the next
 *        call through here will deliver it.
 */
VOID Load_teaming_consistency_notify (IN PBDA_MEMBER member, IN BOOL consistent) {
    /* The membership information must point to something. */
    ASSERT(member);

    /* Unlocked fast path: nothing to do unless we appear to be teamed. */
    if (member->active) {
        NdisAcquireSpinLock(&univ_bda_teaming_lock);

        /* Re-check under the lock; membership may have changed meanwhile. */
        if (!member->active) {
            NdisReleaseSpinLock(&univ_bda_teaming_lock);
            return;
        }

        /* An active member must reference a team and carry a valid member ID. */
        ASSERT(member->bda_team);
        ASSERT(member->member_id <= CVY_BDA_MAXIMUM_MEMBER_ID);

        if (!consistent) {
            UNIV_PRINT(("Load_teaming_consistency_notify: Inconsistent configuration detected."));

            /* Clear this member's bit and take the whole team out of service. */
            member->bda_team->consistency_map &= ~(1 << member->member_id);
            member->bda_team->active = FALSE;
        } else {
            UNIV_PRINT(("Load_teaming_consistency_notify: Consistent configuration detected."));

            /* Record this member as consistent. */
            member->bda_team->consistency_map |= (1 << member->member_id);
        }

        NdisReleaseSpinLock(&univ_bda_teaming_lock);
    }
}
/*
 * Function: Load_teaming_consistency_check
 * Description: Compares our teaming configuration code with the code received
 *              in a remote heartbeat. The check itself is an equality test of
 *              two ULONGs; the first time a mismatch is seen, the differing
 *              sub-fields are also printed for debugging.
 * Parameters: bAlreadyKnown - TRUE if the misconfiguration has already been detected,
 *                             in which case no additional output is produced.
 *             member - a pointer to the team member structure for this adapter.
 *             myConfig - the teaming "code" for this host.
 *             theirConfig - the teaming "code" received in the heartbeat from the remote host.
 * Returns: BOOLEAN (as ULONG) - TRUE means the configuration is consistent (or this
 *          adapter is not an active team member), FALSE indicates that it is not.
 * Author: shouse, 3.29.01
 * Notes: The member's active flag is first read without the teaming lock so
 *        that the common (non-teamed) case stays cheap. Only when that read
 *        says we are teamed is the lock taken and the flag re-checked; the
 *        lock is dropped before the codes are compared. If we are in the
 *        middle of joining a team and miss this check, it is repeated on the
 *        next heartbeat.
 */
ULONG Load_teaming_consistency_check (IN BOOLEAN bAlreadyKnown, IN PBDA_MEMBER member, IN ULONG myConfig, IN ULONG theirConfig) {
    /* Make sure that the membership information points to something, as the
       other teaming routines in this file do before dereferencing it. */
    ASSERT(member);

    /* Unlocked fast path: not (apparently) teamed means nothing to compare. */
    if (!member->active) return TRUE;

    NdisAcquireSpinLock(&univ_bda_teaming_lock);

    /* Re-check under the lock; if we are not in a team after all, we are done. */
    if (!member->active) {
        NdisReleaseSpinLock(&univ_bda_teaming_lock);
        return TRUE;
    }

    /* The comparison below only touches the by-value arguments, so the lock
       is not needed any longer. */
    NdisReleaseSpinLock(&univ_bda_teaming_lock);

    /* Matching codes mean the bi-directional affinity teaming configuration is consistent. */
    if (myConfig == theirConfig) return TRUE;

    /* Only report the details the first time the mismatch is noticed. */
    if (!bAlreadyKnown) {
        UNIV_PRINT(("Bad teaming configuration detected: Mine=0x%08x, Theirs=0x%08x", myConfig, theirConfig));

        /* Report whether or not the teaming active flags are consistent. */
        if ((myConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK)) {
            UNIV_PRINT(("Teaming active flags do not match: Mine=%d, Theirs=%d",
                        (myConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK) >> CVY_BDA_TEAMING_CODE_ACTIVE_OFFSET,
                        (theirConfig & CVY_BDA_TEAMING_CODE_ACTIVE_MASK) >> CVY_BDA_TEAMING_CODE_ACTIVE_OFFSET));
        }

        /* Report whether or not the master flags are consistent. */
        if ((myConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK)) {
            UNIV_PRINT(("Master/slave settings do not match: Mine=%d, Theirs=%d",
                        (myConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK) >> CVY_BDA_TEAMING_CODE_MASTER_OFFSET,
                        (theirConfig & CVY_BDA_TEAMING_CODE_MASTER_MASK) >> CVY_BDA_TEAMING_CODE_MASTER_OFFSET));
        }

        /* Report whether or not the reverse hashing flags are consistent. */
        if ((myConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK)) {
            UNIV_PRINT(("Reverse hashing flags do not match: Mine=%d, Theirs=%d",
                        (myConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK) >> CVY_BDA_TEAMING_CODE_HASHING_OFFSET,
                        (theirConfig & CVY_BDA_TEAMING_CODE_HASHING_MASK) >> CVY_BDA_TEAMING_CODE_HASHING_OFFSET));
        }

        /* Report whether or not the number of team members is consistent. */
        if ((myConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK)) {
            UNIV_PRINT(("Numbers of team members do not match: Mine=%d, Theirs=%d",
                        (myConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_NUM_MEMBERS_OFFSET,
                        (theirConfig & CVY_BDA_TEAMING_CODE_NUM_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_NUM_MEMBERS_OFFSET));
        }

        /* Report whether or not the team membership lists are consistent. */
        if ((myConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK) != (theirConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK)) {
            UNIV_PRINT(("Participating members lists do not match: Mine=0x%04x, Theirs=0x%04x",
                        (myConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_MEMBERS_OFFSET,
                        (theirConfig & CVY_BDA_TEAMING_CODE_MEMBERS_MASK) >> CVY_BDA_TEAMING_CODE_MEMBERS_OFFSET));
        }
    }

    return FALSE;
}
/*
 * Function: Load_teaming_code_create
 * Description: Builds the ULONG code word that represents this adapter's
 *              bi-directional affinity teaming configuration. If the adapter
 *              is not part of a team, the code is zero.
 * Parameters: code - a pointer to a ULONG that will receive the 32-bit code word.
 *             member - a pointer to the team member structure for this adapter.
 * Returns: Nothing.
 * Author: shouse, 3.29.01
 * Notes: The member's active flag is first read without the teaming lock so
 *        that the common (non-teamed) case stays cheap. Only when that read
 *        says we are teamed is the lock taken and the flag re-checked. If we
 *        are in the middle of joining a team and miss this pass, the code is
 *        generated the next time we send a heartbeat.
 */
VOID Load_teaming_code_create (OUT PULONG code, IN PBDA_MEMBER member) {
    /* Both the output location and the membership information must be valid. */
    ASSERT(code);
    ASSERT(member);

    /* Start from "not teamed"; the code stays zero unless we turn out to be in a team. */
    *code = 0;

    /* Unlocked fast path: only take the lock if we appear to be teamed. */
    if (member->active) {
        NdisAcquireSpinLock(&univ_bda_teaming_lock);

        /* Confirm under the lock before reading the team's state. */
        if (member->active) {
            /* An active member must reference a team. */
            ASSERT(member->bda_team);

            /* Fold this member's flags and the team's membership data into the code word. */
            CVY_BDA_TEAMING_CODE_CREATE(*code,
                                        member->active,
                                        member->master,
                                        member->reverse_hash,
                                        member->bda_team->membership_count,
                                        member->bda_team->membership_fingerprint);
        }

        NdisReleaseSpinLock(&univ_bda_teaming_lock);
    }
}
/*
 * Function: Load_add_reference
 * Description: Takes one reference on the load module of a given adapter.
 * Parameters: pLoad - a pointer to the load module to reference.
 * Returns: ULONG - the reference count after the increment.
 * Author: shouse, 3.29.01
 * Notes: The increment is performed with NdisInterlockedIncrement.
 */
ULONG Load_add_reference (IN PLOAD_CTXT pLoad) {
    ULONG updated;

    /* The load pointer must point to something. */
    ASSERT(pLoad);

    /* Bump the count and hand the new value back to the caller. */
    updated = NdisInterlockedIncrement(&pLoad->ref_count);

    return updated;
}
/*
 * Function: Load_release_reference
 * Description: Drops one reference on the load module of a given adapter.
 * Parameters: pLoad - a pointer to the load module to dereference.
 * Returns: ULONG - the reference count after the decrement.
 * Author: shouse, 3.29.01
 * Notes: The decrement is performed with NdisInterlockedDecrement.
 */
ULONG Load_release_reference (IN PLOAD_CTXT pLoad) {
    ULONG updated;

    /* The load pointer must point to something. */
    ASSERT(pLoad);

    /* Lower the count and hand the new value back to the caller. */
    updated = NdisInterlockedDecrement(&pLoad->ref_count);

    return updated;
}
/*
 * Function: Load_get_reference_count
 * Description: Reports the current reference count on the load module of a
 *              given adapter.
 * Parameters: pLoad - a pointer to the load module to check.
 * Returns: ULONG - the current reference count.
 * Author: shouse, 3.29.01
 * Notes: This is a plain read of the counter; no interlocked operation is used.
 */
ULONG Load_get_reference_count (IN PLOAD_CTXT pLoad) {
    ULONG current;

    /* The load pointer must point to something. */
    ASSERT(pLoad);

    /* Snapshot the counter and return it. */
    current = pLoad->ref_count;

    return current;
}
/* Hash routine based on the public-domain Tiny Encryption Algorithm (TEA) by
   David Wheeler and Roger Needham at the Computer Laboratory of Cambridge
   University. For reference, please consult
   http://vader.brad.ac.uk/tea/tea.shtml

   Mixes the two input words through eight TEA-style rounds with a fixed
   four-word key and returns the XOR of the two resulting words. */
ULONG Map (
    ULONG v1,
    ULONG v2) /* v2.06: removed range parameter */
{
    /* Fixed key words and the per-round increment added to the running sum. */
    const ULONG key0 = 0x67;
    const ULONG key1 = 0xdf;
    const ULONG key2 = 0x40;
    const ULONG key3 = 0xd3;
    const ULONG delta = 0x9E3779B9;

    ULONG left = v1;
    ULONG right = v2;
    ULONG sum = 0;
    ULONG pass;

    /* Eight identical rounds; each half is updated from the other half. The
       parentheses spell out the grouping that C precedence applies anyway:
       every '+' is evaluated before the '^' operators combine the terms. */
    for (pass = 0; pass < 8; pass++)
    {
        sum += delta;
        left += ((right << 4) + key0) ^ (right + sum) ^ ((right >> 5) + key1);
        right += ((left << 4) + key2) ^ (left + sum) ^ ((left >> 5) + key3);
    }

    return left ^ right;
} /* end Map */
BOOLEAN Bin_targ_map_get(
PLOAD_CTXT lp,
PBIN_STATE binp, /* ptr. to bin state */
ULONG my_host_id,
PMAP_T pmap) /* ptr. to target map */
/*
Get target map for this host: the set of bins (one bit per bin) that this
host should own once convergence completes.
Single-server mode (binp->mode == CVY_SINGLE): the host whose load_amt entry
is the smallest non-zero value gets every bin; all other hosts get none.
Load-balanced mode: bins are split in proportion to each host's load_amt,
starting from the current ownership in binp->cur_map and moving bins from
over-allocated hosts to under-allocated ones.
Side effects: binp->tot_load is set to the sum of all load_amt entries (not on
the duplicate-priority error path); lp->dup_sspri is set when duplicate
single-server priorities are first seen.
returns BOOLEAN:
TRUE => valid target map is returned via pmap
FALSE => error occurred (duplicate single-server priority); no target map returned
*/
{
ULONG remsz, /* remainder size: bins left over after the proportional split */
loadsz, /* size of a load partition (assigned in single-server mode only, never read here) */
first_bit; /* first bit position of load partition (assigned, never read here) */
MAP_T targ_map; /* bit map of load bins for this host */
ULONG tot_load = 0; /* total of load percentages */
ULONG * pload_list; /* ptr. to list of load balance percentages */
WCHAR num [20]; /* string buffer handed to Univ_ulong_to_str for the event log */
/* NOTE(review): ctxtp is not referenced directly below; presumably the kernel-mode
   LOG_MSG macros use it - confirm before removing. */
PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
pload_list = binp->load_amt;
if (binp->mode == CVY_SINGLE)
{
ULONG max_pri; /* highest priority, i.e. the numerically smallest non-zero entry */
ULONG i;
first_bit = 0;
/* compute max priority; start above any value the loop is expected to see */
max_pri = CVY_MAX_HOSTS + 1;
for (i=0; i<CVY_MAX_HOSTS; i++)
{
tot_load += pload_list[i]; /* v2.1 */
/* a zero entry is skipped: it takes no part in the priority comparison */
if (pload_list[i] != 0)
{
//
// If another host has the same priority as this host, do not converge
//
if (i!= my_host_id && pload_list[i] == pload_list[my_host_id])
{
/* report the duplicate only once; dup_sspri latches the condition */
if (!(lp->dup_sspri))
{
UNIV_PRINT(("Host %d: duplicate single svr priorities detected", my_host_id));
Univ_ulong_to_str (pload_list[my_host_id], num, 10);
LOG_MSG(MSG_ERROR_SINGLE_DUP, num);
lp->dup_sspri = TRUE;
}
/* 1.03: return error, which inhibits convergence; note that
rule will be automatically reinstated when duplicate server
priorities are eliminated */
return FALSE;
}
/* track the smallest non-zero value seen so far */
if ( pload_list[i] <= max_pri )
{
max_pri = pload_list[i];
}
}
}
binp->tot_load = tot_load; /* v2.1 */
/* now determine if we are the highest priority host */
if (pload_list[my_host_id] == max_pri)
{
/* we win: take every bin */
loadsz = CVY_MAXBINS;
targ_map = BIN_ALL_ONES; /* v2.05 */
}
else
{
/* another host wins: take no bins */
loadsz = 0;
targ_map = 0; /* v2.05 */
}
}
else /* load balanced */
{
ULONG i, j;
ULONG partsz[CVY_MAX_HOSTS+1];
/* new partition size per host; the extra slot is the orphan pseudo-host */
ULONG cur_partsz[CVY_MAX_HOSTS+1];
/* current partition size per host (v2.05) */
ULONG cur_host[CVY_MAXBINS];
/* current host for each bin (v2.05); CVY_MAX_HOSTS marks an unowned bin */
ULONG tot_partsz; /* sum of partition sizes */
ULONG donor; /* current donor host (v2.05) */
ULONG cur_nbins; /* current # bins (v2.05) */
/* setup current partition sizes and bin to host mapping from current map (v2.05) */
cur_nbins = 0;
for (j=0; j<CVY_MAXBINS; j++)
cur_host[j] = CVY_MAX_HOSTS; /* all bins are initially orphans */
for (i=0; i<CVY_MAX_HOSTS; i++)
{
ULONG count = 0L;
MAP_T cmap = binp->cur_map[i];
tot_load += pload_list[i]; /* folded into this loop v2.1 */
/* walk host i's map from bit 0 upward, stopping early once no set bits remain */
for (j=0; j<CVY_MAXBINS && cmap != ((MAP_T)0); j++)
{
/* if host i has bin j and it's not a duplicate, set up the mapping;
when several hosts claim a bin, the lowest-numbered host keeps it */
if ((cmap & ((MAP_T)0x1)) != ((MAP_T)0) && cur_host[j] == CVY_MAX_HOSTS)
{
count++;
cur_host[j] = i;
}
cmap >>= 1;
}
cur_partsz[i] = count;
cur_nbins += count;
}
/* sanity check: log and clamp if more bins were counted than exist */
if (cur_nbins > CVY_MAXBINS)
{
UNIV_PRINT(("Bin_targ_map_get: error - too many bins found"));
LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
cur_nbins = CVY_MAXBINS;
}
/* if there are orphan bins, give them to pseudo-host CVY_MAX_HOSTS for now (v2.05) */
if (cur_nbins < CVY_MAXBINS)
cur_partsz[CVY_MAX_HOSTS] = CVY_MAXBINS - cur_nbins;
else
cur_partsz[CVY_MAX_HOSTS] = 0;
/* record the total load computed in the loop above */
binp->tot_load = tot_load; /* v2.06 */
/* now compute tentative partition sizes and remainder after initially
dividing up partitions among hosts; the integer division rounds down, so
the sizes may sum to fewer than CVY_MAXBINS */
tot_partsz = 0;
first_bit = 0;
for (i=0; i<CVY_MAX_HOSTS; i++)
{
if (tot_load > 0)
partsz[i] = CVY_MAXBINS * pload_list[i] / tot_load;
else
partsz[i] = 0;
tot_partsz += partsz[i];
}
remsz = CVY_MAXBINS - tot_partsz;
/* check for zero total load: nobody gets any bins, which is still a valid result */
if (tot_partsz == 0)
{
* pmap = 0;
return TRUE;
}
/* first dole out remainder bits to hosts that currently have bins (this
minimizes the number of bins that have to move) v2.05 */
if (remsz > 0)
{
for (i=0; i<CVY_MAX_HOSTS && remsz > 0; i++)
if (cur_partsz[i] > 0 && pload_list[i] > 0)
{
partsz[i]++;
remsz--;
}
}
/* now dole out remainder bits to hosts that currently have no bins (to maintain
the target load balance) v2.05 */
if (remsz > 0)
{
for (i=0; i<CVY_MAX_HOSTS && remsz > 0; i++)
if (cur_partsz[i] == 0 && pload_list[i] > 0)
{
partsz[i]++;
remsz--;
}
}
/* now dole out remainder bits among non-zero partitions round robin; this
terminates because tot_partsz != 0 here, so at least one host has a
non-zero load */
i = 0;
while (remsz > 0)
{
if (pload_list[i] > 0)
{
partsz[i]++;
remsz--;
}
i++;
if (i == CVY_MAX_HOSTS)
i = 0;
}
/* reallocate bins to target hosts to match new partition sizes (v2.05);
donor only ever advances, so each over-allocated host is visited once */
donor = 0;
partsz[CVY_MAX_HOSTS] = 0; /* pseudo-host needs no bins */
for (i=0; i<CVY_MAX_HOSTS; i++)
{
ULONG rcvrsz; /* current receiver's target partition */
ULONG donorsz; /* current donor's target partition size */
/* find and give this host some bins */
rcvrsz = partsz[i];
while (rcvrsz > cur_partsz[i])
{
/* find a host with too many bins; if no real host qualifies, donor ends
up as CVY_MAX_HOSTS, the orphan pseudo-host */
for (; donor < CVY_MAX_HOSTS; donor++)
if (partsz[donor] < cur_partsz[donor])
break;
/* if donor is pseudo-host and it's out of bins, give it more bins
to keep algorithm from looping; this should never happen */
if (donor >= CVY_MAX_HOSTS && cur_partsz[donor] == 0)
{
UNIV_PRINT(("Bin_targ_map_get: error - no donor bins"));
LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
cur_partsz[donor] = CVY_MAXBINS;
}
/* now find the donor's bins and give them to the target host */
donorsz = partsz[donor]; /* donor's target bin count */
for (j=0; j<CVY_MAXBINS; j++)
{
if (cur_host[j] == donor)
{
/* move bin j from the donor to receiver i */
cur_host[j] = i;
cur_partsz[donor]--;
cur_partsz[i]++;
/* if this donor has no more to give, go find the next donor;
if this receiver needs no more, go on to next receiver */
if (donorsz == cur_partsz[donor] || rcvrsz == cur_partsz[i])
break;
}
}
/* if no bin was found (the scan ran off the end), log an error and stop
reallocating for this receiver */
if (j == CVY_MAXBINS)
{
UNIV_PRINT(("Bin_targ_map_get: error - no bin found"));
LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
break;
}
}
}
/* finally, compute bit mask for this host (v2.05) */
targ_map = 0;
for (j=0; j<CVY_MAXBINS; j++)
{
/* any bin still unowned is an internal error; it is handed to host 0 */
if (cur_host[j] == CVY_MAX_HOSTS)
{
UNIV_PRINT(("Bin_targ_map_get: error - incomplete mapping"));
LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
cur_host[j] = 0;
}
if (cur_host[j] == my_host_id)
targ_map |= ((MAP_T)1) << j;
}
}
/* hand the computed map back to the caller */
* pmap = targ_map;
return TRUE;
} /* end Bin_targ_map_get */
BOOLEAN Bin_map_check(
    ULONG tot_load, /* total load percentage (v2.06) */
    PMAP_T pbin_map) /* bin map for all hosts */
/*
  Check that the per-host bin maps are consistent and covering
  returns BOOLEAN:
    TRUE  => total load is zero, or the maps together equal BIN_ALL_ONES and
             no bin appears in more than one host's map
    FALSE => some bin is missing or claimed by more than one host
*/
{
    MAP_T claimed = 0; /* union of the maps visited so far */
    MAP_T clashes = 0; /* bins seen in more than one host's map */
    ULONG host;

    /* with no load at all there is nothing to cover (v2.06) */
    if (tot_load == 0)
        return TRUE;

    for (host = 0; host < CVY_MAX_HOSTS; host++)
    {
        /* a bin already claimed by an earlier host is an overlap */
        clashes |= (pbin_map[host] & claimed);
        claimed |= pbin_map[host];
    }

    /* every bin exactly once: full coverage and no overlap */
    if (clashes != 0 || claimed != BIN_ALL_ONES)
        return FALSE;

    return TRUE;
} /* end Bin_map_check */
BOOLEAN Bin_map_covering(
    ULONG tot_load, /* total load percentage (v2.06) */
    PMAP_T pbin_map) /* bin map for all hosts */
/*
  Check that the per-host bin maps together cover every bin; overlap between
  hosts is not examined here
  returns BOOLEAN:
    TRUE  => total load is zero, or the union of all maps equals BIN_ALL_ONES
    FALSE => at least one bin is in no host's map
*/
{
    MAP_T claimed = 0; /* union of all hosts' maps */
    ULONG host;

    /* with no load at all there is nothing to cover (v2.06) */
    if (tot_load == 0)
        return TRUE;

    /* accumulate the union of every host's map */
    for (host = 0; host < CVY_MAX_HOSTS; host++)
        claimed |= pbin_map[host];

    if (claimed != BIN_ALL_ONES)
        return FALSE;

    return TRUE;
} /* end Bin_map_covering */
void Bin_state_init(
    PLOAD_CTXT lp,
    PBIN_STATE binp, /* ptr. to bin state */
    ULONG index, /* index of bin state */
    ULONG my_host_id,
    ULONG mode,
    ULONG prot,
    BOOLEAN equal_bal, /* TRUE => balance equally across hosts */
    USHORT affinity,
    ULONG load_amt) /* this host's load percentage if unequal */
/*
  Initialize bin state for a port group

  Resets the maps, load percentages and bin-transfer state. The connection
  counts and connection queue are set up only on the first call for a given
  bin state, guarded by binp->initialized.
*/
{
    ULONG host; /* host loop variable */
    ULONG bin; /* bin loop variable */
    PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    /* parameter sanity: single-server mode cannot be equally balanced or carry
       a value above CVY_MAX_HOSTS, and the index must be below CVY_MAXBINS */
    if ((mode == CVY_SINGLE && (equal_bal || load_amt > CVY_MAX_HOSTS)) ||
        index >= CVY_MAXBINS)
    {
        UNIV_ASSERT(FALSE); // This should never happen
    }

    /* identity and configuration of this port group */
    binp->code = CVY_BINCODE; /* (bbain 8/19/99) */
    binp->index = index;
    binp->mode = mode;
    binp->prot = prot;
    binp->affinity = affinity;
    binp->equal_bal = equal_bal;
    binp->compatible = TRUE;

    /* this host owns nothing yet and every bin is considered idle */
    binp->targ_map = 0;
    binp->cmap = 0; /* v2.1 */
    binp->all_idle_map = BIN_ALL_ONES;

    /* equal balancing overrides the caller's percentage */
    if (equal_bal)
    {
        load_amt = CVY_EQUAL_LOAD;
    }

    binp->tot_load = load_amt;

    /* per-host state: empty maps, all bins idle, and no load for anyone but us */
    for (host = 0; host < CVY_MAX_HOSTS; host++)
    {
        binp->new_map[host] = 0;
        binp->cur_map[host] = 0;
        binp->chk_map[host] = 0;
        binp->idle_map[host] = BIN_ALL_ONES;

        if (host != my_host_id)
        {
            binp->load_amt[host] = 0;
        }
        else
        {
            binp->load_amt[host] = load_amt;
            binp->orig_load_amt = binp->load_amt[host];
        }
    }

    /* no bin transfers are pending in either direction */
    binp->snd_bins = 0;
    binp->rcv_bins = 0;
    binp->rdy_bins = 0;
    binp->idle_bins = BIN_ALL_ONES; /* we are initially idle */

    /* connection tracking is set up only on the first initialization (v2.06) */
    if (!(binp->initialized))
    {
        binp->tconn = 0;

        for (bin = 0; bin < CVY_MAXBINS; bin++)
        {
            binp->nconn[bin] = 0;
        }

        Queue_init(&(binp->connq));
        binp->initialized = TRUE;
    }
} /* end Bin_state_init */
BOOLEAN Bin_converge(
    PLOAD_CTXT lp,
    PBIN_STATE binp, /* ptr. to bin state */
    ULONG my_host_id)
/*
  Explicitly attempt to converge new port group state

  Recomputes this host's target map, proposes a new map made of the bins it
  currently owns plus any unowned target bins, and then checks the proposed
  maps of all hosts.

  returns BOOL:
    TRUE  => all hosts have consistent new state for converging
    FALSE => no target map could be generated, or the new maps are not
             consistent and covering
*/
{
    MAP_T owned_map = 0; /* bins present in any host's current map */
    MAP_T orphan_map; /* bins that no host currently owns */
    ULONG host;

    /* determine new target load map; 1.03: return in error if no map generated */
    if (!Bin_targ_map_get(lp, binp, my_host_id, &(binp->targ_map)))
    {
        return FALSE;
    }

    /* orphans are the complement of everything found in the current maps */
    for (host = 0; host < CVY_MAX_HOSTS; host++)
    {
        owned_map |= binp->cur_map[host];
    }
    orphan_map = ~owned_map;

    /* propose keeping our current bins and adopting the orphans we are targeted for */
    binp->new_map[my_host_id] = binp->cmap | /* v2.1 */
                                (binp->targ_map & orphan_map); /* 1.03 */

    /* check that new load maps are consistent and covering */
    return Bin_map_check(binp->tot_load, binp->new_map); /* v2.06 */
} /* end Bin_converge */
void Bin_converge_commit(
    PLOAD_CTXT lp,
    PBIN_STATE binp, /* ptr. to bin state */
    ULONG my_host_id)
/*
  Commit to new port group state

  Copies every host's proposed map into its current and check maps, then
  derives which bins this host must ship away (rdy_bins) and which it is
  still waiting to receive (rcv_bins). A bad proposed map is reported once
  via lp->bad_map, but the commit goes ahead regardless.
*/
{
    ULONG host;
    PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    /* check that new load maps are consistent and covering; log the first failure only */
    if (!Bin_map_check(binp->tot_load, binp->new_map) && !(lp->bad_map)) /* v2.06 */
    {
        UNIV_PRINT(("Bin_converge_commit: bad new map"));
        LOG_MSG1(MSG_ERROR_INTERNAL, MSG_NONE, (ULONG_PTR)binp->new_map);
        lp->bad_map = TRUE;
    }

    /* the proposed maps become both the current and the check maps */
    for (host = 0; host < CVY_MAX_HOSTS; host++)
    {
        binp->cur_map[host] = binp->new_map[host];
        binp->chk_map[host] = binp->cur_map[host];
    }

    /* bins we hold but are not targeted for are ready to ship (1.03) ... */
    binp->rdy_bins = binp->cur_map[my_host_id] & ~(binp->targ_map);

    /* ... and are cleared from our current map */
    binp->cur_map[my_host_id] &= ~(binp->rdy_bins);

    /* bins we are targeted for but do not hold yet must be received */
    binp->rcv_bins = binp->targ_map & ~(binp->cur_map[my_host_id]);

    /* keep the cached copy of our own map in step (v2.1) */
    binp->cmap = binp->cur_map[my_host_id];

#if 0
    /* simulation output generator (2.05) */
    {
        ULONG i;
        ULONG lcount = 0L;
        ULONG ncount = 0L;
        MAP_T bins = binp->rdy_bins;

        for (i=0; i<CVY_MAXBINS && bins != 0; i++, bins >>= 1)
            if ((bins & ((MAP_T)0x1)) != ((MAP_T)0))
                lcount++;

        bins = binp->targ_map;

        for (i=0; i<CVY_MAXBINS && bins != 0; i++, bins >>= 1)
            if ((bins & ((MAP_T)0x1)) != ((MAP_T)0))
                ncount++;

        printf("Connverge at host %d pg %d: losing %d, will have %d bins\n", my_host_id, binp->index,
               lcount, ncount);
    }
#endif
} /* end Bin_converge_commit */
BOOLEAN Bin_host_update(
PLOAD_CTXT lp,
PBIN_STATE binp, /* ptr. to bin state */
ULONG my_host_id, /* my host's id MINUS one */
BOOLEAN converging, /* TRUE => we are converging now */
BOOLEAN rem_converging, /* TRUE => remote host is converging */
ULONG rem_host, /* remote host's id MINUS one */
MAP_T cur_map, /* remote host's current map or 0 if host died */
MAP_T new_map, /* remote host's new map if converging */
MAP_T idle_map, /* remote host's idle map */
MAP_T rdy_bins, /* bins that host is ready to send; ignored
if converging to prevent bin transfers */
ULONG pkt_count, /* remote host's packet count */
ULONG load_amt) /* remote host's load percentage */
/*
Update a remote host's state for a port group
returns BOOL:
TRUE => if not converging, normal return
otherwise, all hosts have consistent state for converging
FALSE => parameter error, overlapping maps detected during normal operation,
or inconsistent convergence state
function:
Updates a host's state for a port group and attempts to converge new states if
in convergence mode. Called when a ping message is received or when a host
is considered to have died. Handles case of newly discovered hosts. Can be
called multiple times with the same information.
*/
{
ULONG i;
/* NOTE(review): ctxtp is not referenced directly below; presumably the kernel-mode
   LOG_MSG macros use it - confirm before removing. */
PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
/* the remote host id must be in range and must not be this host */
if (rem_host >= CVY_MAX_HOSTS || rem_host == my_host_id)
{
UNIV_PRINT(("Bin_host_update: parameter error"));
LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, rem_host+1, my_host_id+1);
return FALSE;
}
UNIV_ASSERT(binp->code == CVY_BINCODE); /* (bbain 8/19/99) */
#if 0 /* v2.06 */
/* update current load balance information */
if (binp->equal_bal && load_amt > 0)
{
load_amt = CVY_EQUAL_LOAD;
}
#endif
/* change load percentage if load changed */
if (load_amt != binp->load_amt[rem_host])
{
#if 0 /* v2.06 */
binp->tot_load += (load_amt - binp->load_amt[rem_host]);
#endif
binp->load_amt[rem_host] = load_amt;
}
/* check for non-overlapping maps: a bin in both our map and the remote map is a duplicate */
if ((binp->cmap & cur_map) != 0) /* v2.1 */
{
/* if we have received fewer packets than the other host or have a higher host id,
remove duplicates from current map; this uses a heuristic that a newly joining
host that was subnetted probably did not receive packets; we are trying to avoid
having two hosts answer to the same client while minimizing disruption of service
(v1.32B) */
if (lp->send_msg.pkt_count < pkt_count ||
(lp->send_msg.pkt_count == pkt_count && rem_host < my_host_id))
{
MAP_T dup_map;
dup_map = binp->cmap & cur_map; /* v2.1 */
/* give up the duplicated bins and kill the port group's connections */
binp->cur_map[my_host_id] &= ~dup_map;
binp->cmap = binp->cur_map[my_host_id]; /* v2.1 */
Load_conn_kill(lp, binp);
}
if (!converging && !rem_converging)
{
/* report the overlap only once; overlap_maps latches the condition */
if (!(lp->overlap_maps))
{
UNIV_PRINT(("Host %d: two hosts with overlapping maps detected %d.", my_host_id, binp->index));
LOG_MSG2(MSG_WARN_OVERLAP, MSG_NONE, my_host_id+1, binp->index);
lp->overlap_maps = TRUE;
}
/* force convergence if in normal operations; note that the remote host's
current map has not been stored yet at this point */
return FALSE;
}
}
/* now update remote host's current map */
binp->cur_map[rem_host] = cur_map;
/* update idle map and calculate new global idle map if it's changed */
if (binp->idle_map[rem_host] != idle_map)
{
MAP_T saved_map = binp->all_idle_map;
MAP_T new_idle_map = BIN_ALL_ONES;
MAP_T tmp_map;
binp->idle_map[rem_host] = idle_map;
/* compute new idle map for all other hosts: a bin is globally idle only if
every host except this one reports it idle */
for (i=0; i<CVY_MAX_HOSTS; i++)
if (i != my_host_id)
new_idle_map &= binp->idle_map[i];
binp->all_idle_map = new_idle_map;
/* see which locally owned bins have gone idle in all other hosts */
/* NOTE(review): the %08x specifiers below receive MAP_T values, which appear
   to be wider than 32 bits given BIN_ALL_ONES - confirm against UNIV_PRINT. */
tmp_map = new_idle_map & (~saved_map) & binp->cmap; /* v2.1 */
if (tmp_map != 0)
{
UNIV_PRINT(("Host %d pg %d: detected new all idle %08x for local bins",
my_host_id, binp->index, tmp_map));
}
/* and which locally owned bins are no longer idle everywhere else */
tmp_map = saved_map & (~new_idle_map) & binp->cmap; /* v2.1 */
if (tmp_map != 0)
{
UNIV_PRINT(("Host %d pg %d: detected new non-idle %08x for local bins",
my_host_id, binp->index, tmp_map));
}
}
/* 1.03: eliminated else clause */
/* if we are not converging AND other host not converging, exchange bins;
convergence must now be complete for both hosts */
if (!converging)
{
if (!rem_converging) { /* 1.03: reorganized code to exchange bins only when both
hosts are not converging to avoid using stale bins */
MAP_T new_bins; /* incoming bins from the remote host */
/* check to see if remote host has received some bins from us: anything now
in its current map no longer needs to be offered */
binp->rdy_bins &= (~cur_map);
/* check to see if we can receive some bins: those we are waiting for that
the remote host is ready to send */
new_bins = binp->rcv_bins & rdy_bins;
if (new_bins != 0)
{
/* receiving a bin we already own is an internal error; logged once */
if ((binp->cmap & new_bins) != 0) /* v2.1 */
{
if (!(lp->err_rcving_bins))
{
UNIV_PRINT(("Bin_host_update: receiving bins already own"));
LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, binp->cur_map[my_host_id], new_bins);
lp->err_rcving_bins = TRUE;
}
}
/* take ownership of the incoming bins and stop waiting for them */
binp->cur_map[my_host_id] |= new_bins;
binp->rcv_bins &= ~new_bins;
binp->cmap = binp->cur_map[my_host_id]; /* v2.1 */
UNIV_PRINT(("====== host %d pg %d: received %08x ; cur now %08x",
my_host_id, binp->index, new_bins, binp->cur_map[my_host_id]));
}
/* do consistency check that all bins are covered; bins in transit count as
owned by the host that is ready to send them */
binp->chk_map[rem_host] = cur_map | rdy_bins;
binp->chk_map[my_host_id] = binp->cmap | binp->rdy_bins; /* v2.1 */
if (!Bin_map_covering(binp->tot_load, binp->chk_map)) /* v2.06 */
{
/* orphans are only latched in err_orphans; the print and log are compiled out */
if (!(lp->err_orphans))
{
#if 0
UNIV_PRINT(("Host %d: orphan bins detected", my_host_id));
LOG_MSG1(MSG_ERROR_INTERNAL, MSG_NONE, my_host_id+1);
#endif
lp->err_orphans = TRUE;
}
}
}
/* not converging locally: normal return whether or not the remote host is converging */
return TRUE;
}
/* otherwise, store proposed new load map and try to converge current host data */
else
{
binp->chk_map[rem_host] =
binp->new_map[rem_host] = new_map;
return Bin_converge(lp, binp, my_host_id);
}
} /* end Bin_host_update */
void Bin_state_print(
PBIN_STATE binp, /* ptr. to bin state */
ULONG my_host_id)
/*
Debug aid: print a one-line summary of this host's view of a port group
(target/current/new maps, balance mode, load amounts and the send/receive/
ready bin sets) through UNIV_PRINT. The per-host and per-bin dumps further
down are compiled out.
*/
{
#if 0
ULONG i;
#endif
/* NOTE(review): %x is used here for the binp pointer and for MAP_T values,
   which appear to be wider than 32 bits given BIN_ALL_ONES - confirm that
   the output is not truncated or misaligned. */
UNIV_PRINT(("hst %d binp %x: maps: targ %x cur %x new %x; eq %d mode %d amt %d tot %d; bins: snd %x rcv %x rdy %x",
my_host_id, binp, binp->targ_map, binp->cur_map[my_host_id], binp->new_map[my_host_id],
binp->equal_bal, binp->mode, binp->load_amt[my_host_id],
binp->tot_load, binp->snd_bins, binp->rcv_bins, binp->rdy_bins));
#if 0
/* disabled: dump every host's maps and load, then every bin's state */
for (i=0; i<CVY_MAX_HOSTS; i++)
{
UNIV_PRINT(("host %d: cur map %x new %x load_amt %d", i+1, binp->cur_map[i],
binp->new_map[i], binp->load_amt[i]));
}
for (i=0; i<CVY_MAXBINS; i++)
{
UNIV_PRINT(("bin %d: req_host %d bin_state %d nconn %d", i, binp->req_host[i],
binp->bin_state[i], binp->nconn[i]));
}
#endif
} /* end Bin_state_print */
void Load_conn_kill(
    PLOAD_CTXT      lp,
    PBIN_STATE      bp)
/*
  Kill all connections in a port group (v1.32B)

  lp - load module context (owns the dirty and free queues)
  bp - bin state of the port group whose connection queue is drained

  Every entry on the port group's connection queue is removed.  If a cleanup
  timeout is configured (lp->cln_timeout > 0) the entry is marked dirty and
  parked on the dirty queue; otherwise it is cleared immediately and, when it
  lives in an allocated descriptor, the descriptor goes back to the free queue.
  Afterwards all per-bin connection counts of the port group are zeroed and
  the port group's total is subtracted from the module-wide count.
*/
{
    PCONN_ENTRY ep;         /* ptr. to connection entry */
    PCONN_DESCR dp;         /* ptr. to connection descriptor */
    QUEUE *     qp;         /* ptr. to bin's connection queue */
    QUEUE *     dqp;        /* ptr. to dirty queue */
    QUEUE *     fqp;        /* ptr. to free queue */
    LONG        count[CVY_MAXBINS];
                            /* count of cleaned up connections per bin for checking */
    ULONG       i;
    BOOLEAN     err_bin;    /* bin id error detected */
    BOOLEAN     err_count;  /* connection count error detected */
    BOOLEAN     bin_valid;  /* current entry's bin id is within range */
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    err_bin   =
    err_count = FALSE;

    qp  = &(bp->connq);
    dqp = &(lp->conn_dirtyq);
    fqp = &(lp->conn_freeq);

    for (i=0; i<CVY_MAXBINS; i++)
        count[i] = 0;

#ifdef TRACE_DIRTY
    DbgPrint ("marking connections as dirty");
#endif

    /* remove connections from bin queue and either make dirty or cleanup */

    ep = (PCONN_ENTRY)Queue_deq(qp);

    while (ep != NULL)
    {
        UNIV_ASSERT (ep->code == CVY_ENTRCODE);	/* (bbain 8/19/99) */

        /* remember whether the bin id may be used as an array index; a bad
           id is reported once per call and never used for indexing */

        bin_valid = (BOOLEAN)(ep->bin < CVY_MAXBINS);

        if (!bin_valid)
        {
            if (!err_bin)
            {
                UNIV_PRINT(("Load_conn_kill: bad bin id"));
                LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, ep->bin, CVY_MAXBINS);

                err_bin = TRUE;
            }
        }
        else
        {
            count[ep->bin]++;
        }

        /* make connection and bin dirty if we don't have a zero timeout period so that they
           will not be handled by TCP/IP anymore; this avoids allowing TCP/IP's now stale
           connection state from handling packets for newer connections should traffic be
           directed to this host in the future */

        if (lp->cln_timeout > 0)
        {
            ep->dirty = TRUE;
            Queue_enq(dqp, &(ep->blink));

            /* only flag the bin when its id is in range; indexing dirty_bin
               with a corrupt id would write outside the array */

            if (bin_valid)
                lp->dirty_bin[ep->bin] = TRUE;

            lp->cln_waiting = TRUE;
        }

        /* otherwise, just cleanup the connection */

        else
        {
            CVY_CONN_CLEAR(ep);                 /* v2.06 */

            Link_unlink(&(ep->rlink));          /* V2.1.5 */

            /* if entry is not in the hash table, free the descriptor */

            if (ep->alloc)
            {
                dp = STRUCT_PTR(ep, CONN_DESCR, entry);
                UNIV_ASSERT (dp->code == CVY_DESCCODE);	/* (bbain 8/19/99) */

                Link_unlink(&(dp->link));
                Queue_enq(fqp, &(dp->link));
            }
        }

        ep = (PCONN_ENTRY)Queue_deq(qp);
    }

    /* now make bins idle; a mismatch between the recorded and the observed
       per-bin connection count is reported once (debug print only) */

    for (i=0; i<CVY_MAXBINS; i++)
    {
        if (bp->nconn[i] != count[i])
        {
            if (!err_count)
            {
                UNIV_PRINT(("Load_conn_kill: bad connection count %d %d bin %d", bp->nconn[i], (LONG)count[i], i));

/* KXF 2.1.1 - removed after tripped up at MSFT a few times */
#if 0
                LOG_MSG3(MSG_ERROR_INTERNAL, MSG_NONE, bp->nconn[i], (LONG)count[i], i);
#endif

                err_count = TRUE;
            }
        }

        bp->nconn[i] = 0;
    }

    /* remove this port group's connections from the module-wide total,
       never letting the total go negative */

    lp->nconn -= bp->tconn;                     /* v2.1 */

    if (lp->nconn < 0)
        lp->nconn = 0;

    bp->tconn = 0;                              /* v2.06 */

    bp->idle_bins = BIN_ALL_ONES;

    /* if at least one connection is dirty, restart cleanup timeout period */

    if (lp->cln_waiting)
    {
#ifdef TRACE_DIRTY
        DbgPrint ("setting cleanup timeout");
#endif
        lp->cur_time = 0;
    }
    else
    {
#ifdef TRACE_DIRTY
        DbgPrint ("no dirty connections found");
#endif
    }

}  /* end Load_conn_kill */
void Load_conn_cleanup(
    PLOAD_CTXT      lp)
/*
  Clean up all dirty connections (v1.32B)

  Drains the dirty connection queue: each entry is cleared, its dirty flag is
  reset and it is unlinked via its rlink; entries that live in allocated
  descriptors have the descriptor returned to the free queue.  Finally every
  dirty-bin flag is cleared.
*/
{
    PCONN_ENTRY ep;                 /* ptr. to connection entry */
    PCONN_DESCR dp;                 /* ptr. to connection descriptor */
    QUEUE *     free_queue;         /* ptr. to free queue */
    QUEUE *     dirty_queue;        /* ptr. to dirty queue */
    BOOLEAN     bad_bin_reported;   /* bin id error already reported */
    ULONG       bin;
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    bad_bin_reported = FALSE;

    dirty_queue = &(lp->conn_dirtyq);
    free_queue  = &(lp->conn_freeq);

#ifdef TRACE_DIRTY
    DbgPrint ("cleaning up dirty connections");
#endif

    /* pull entries off the dirty queue until it is empty */

    for (ep = (PCONN_ENTRY)Queue_deq(dirty_queue);
         ep != NULL;
         ep = (PCONN_ENTRY)Queue_deq(dirty_queue))
    {
        UNIV_ASSERT (ep->code == CVY_ENTRCODE);	/* (bbain 8/19/99) */

        /* report an out-of-range bin id once per call */

        if (ep->bin >= CVY_MAXBINS && !bad_bin_reported)
        {
            UNIV_PRINT(("Load_conn_cleanup: bad bin id"));
            LOG_MSG2(MSG_ERROR_INTERNAL, MSG_NONE, ep->bin, CVY_MAXBINS);

            bad_bin_reported = TRUE;
        }

        CVY_CONN_CLEAR(ep);

        ep->dirty = FALSE;

        Link_unlink(&(ep->rlink));              /* V2.1.5 */

        /* entries outside the hash table sit in a descriptor; recycle it */

        if (ep->alloc)
        {
            dp = STRUCT_PTR(ep, CONN_DESCR, entry);
            UNIV_ASSERT (dp->code == CVY_DESCCODE);	/* (bbain 8/19/99) */

            Link_unlink(&(dp->link));
            Queue_enq(free_queue, &(dp->link));
        }
    }

    /* no bin has dirty connections any more */

    for (bin = 0; bin < CVY_MAXBINS; bin++)
    {
        lp->dirty_bin[bin] = FALSE;
    }

}  /* end Load_conn_cleanup */
/*
 * Deactivate the load module.
 *
 * Does nothing when the module is not active.  Otherwise, under the module
 * lock, the connections of every rule are killed, the outgoing ping message
 * is rewritten to advertise that this host handles no load, the message
 * state is set to HST_CVG and the module is marked inactive with a zero
 * connection count.
 */
void Load_stop(
    PLOAD_CTXT      lp)
{
    ULONG       rule;
    IRQLEVEL    irql;
    PBIN_STATE  bp;             /* ptr. to bin state */
    PPING_MSG   sendp;          /* ptr. to my send message */
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    UNIV_ASSERT(lp->code == CVY_LOADCODE);	/* (bbain 8/19/99) */

    if (!(lp->active))
    {
        return;
    }

    LOCK_ENTER(&(lp->lock), &irql);

    sendp = &(lp->send_msg);

    /* make connections for all rules dirty so they will not be handled */

    for (rule = 0; rule < sendp->nrules; rule++)
    {
        bp = &(lp->pg_state[rule]);
        UNIV_ASSERT(bp->code == CVY_BINCODE);	/* (bbain 8/21/99) */

        Load_conn_kill(lp, bp);                 /* (v1.32B) */

        /* a ping may still go out: make it say that we own nothing */

        sendp->cur_map[rule]  = 0;
        sendp->new_map[rule]  = 0;
        sendp->idle_map[rule] = BIN_ALL_ONES;
        sendp->rdy_bins[rule] = 0;
        sendp->load_amt[rule] = 0;
    }

    sendp->state = HST_CVG;                     /* force convergence (v2.1) */

    /* stay inactive until restarted */

    lp->active = FALSE;
    lp->nconn  = 0;                             /* v2.1 */

    LOCK_EXIT(&(lp->lock), irql);

}  /* end Load_stop */
/*
 * Activate the load module.
 *
 * Initializes the module first if that has not happened yet, and returns
 * without doing anything when the module is already active.  Otherwise the
 * host id, timeouts and error flags are reset from the parameters, a bin
 * state is initialized and converged for every port rule plus a trailing
 * default rule, the outgoing ping message is filled in, and the module is
 * marked active in the HST_CVG state.
 */
void Load_start(                        /* (v1.32B) */
    PLOAD_CTXT      lp)
{
    ULONG       i;
    BOOLEAN     ret;
    BOOLEAN     is_default;     /* current rule is the trailing default rule */
    PCVY_PARAMS paramp;         /* ptr. to module parameters */
    PPING_MSG   sendp;          /* ptr. to my send message */
    PBIN_STATE  bp;             /* ptr. to bin state */
    PCVY_RULE   rp;             /* ptr. to rules array */
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
    WCHAR       me[20];

    if (!(lp->initialized))
    {
        Load_init(lp, & ctxtp -> params);
    }

    UNIV_ASSERT(lp->code == CVY_LOADCODE);	/* (bbain 8/19/99) */

    if (lp->active)
    {
        return;
    }

    /* the parameter pointer is only read after any initialization above */

    paramp = lp->params;
    sendp  = &(lp->send_msg);

    /* host identity: both maps start out containing only this host */

    lp->my_host_id = paramp->host_priority - 1;

    lp->host_map = 1 << lp->my_host_id;
    lp->ping_map = lp->host_map;

    lp->last_hmap = 0;                          /* bbain RTM RC1 6/23/99 */

    for (i = 0; i < CVY_MAX_HOSTS; i++)
    {
        lp->nmissed_pings[i] = 0;
    }

    /* timing parameters */

    lp->min_missed_pings = paramp->alive_tolerance;
    lp->cln_timeout      = paramp->cleanup_delay;
    lp->def_timeout      = paramp->alive_period;

    lp->stable_map = 0;
    lp->consistent = TRUE;

    /* Initialize the bad teaming configuration detected flag. */

    lp->bad_team_config = FALSE;

    /* no error condition has been reported yet */

    lp->dup_hosts       = FALSE;
    lp->dup_sspri       = FALSE;
    lp->bad_map         = FALSE;
    lp->overlap_maps    = FALSE;
    lp->err_rcving_bins = FALSE;
    lp->err_orphans     = FALSE;
    lp->bad_num_rules   = FALSE;
    lp->alloc_inhibited = FALSE;
    lp->alloc_failed    = FALSE;
    lp->bad_defrule     = FALSE;

    lp->scale_client    = (BOOLEAN)paramp->scale_client;

    lp->my_stable_ct    = 0;
    lp->all_stable_ct   = 0;
    lp->min_stable_ct   = lp->min_missed_pings;

    lp->dscr_per_alloc  = paramp->dscr_per_alloc;
    lp->max_dscr_allocs = paramp->max_dscr_allocs;

    lp->pkt_count = 0;                          /* 1.32B */

    /* initialize port group bin states; add a default rule at the end */

    if (paramp->num_rules >= (CVY_MAX_RULES - 1))
    {
        UNIV_PRINT(("Load_start: too many rules; using max possible."));
        sendp->nrules = (USHORT)CVY_MAX_RULES;
    }
    else
    {
        sendp->nrules = (USHORT)(paramp->num_rules) + 1;
    }

    for (i = 0; i < sendp->nrules; i++)
    {
        bp = &(lp->pg_state[i]);
        rp = &(paramp->port_rules[i]);

        is_default = (BOOLEAN)(i == (((ULONG)sendp->nrules) - 1));

        if (is_default)
        {
            /* initialize bin state for default rule to single server with
               host priority */

            Bin_state_init(lp, bp, i, lp->my_host_id, CVY_SINGLE, CVY_TCP_UDP,
                           FALSE, (USHORT)0, paramp->host_priority);
        }
        else if (rp->mode == CVY_SINGLE)
        {
            Bin_state_init(lp, bp, i, lp->my_host_id, rp->mode, rp->protocol,
                           FALSE, (USHORT)0, rp->mode_data.single.priority);
        }
        else if (rp->mode == CVY_MULTI)
        {
            Bin_state_init(lp, bp, i, lp->my_host_id, rp->mode, rp->protocol,
                           (BOOLEAN)(rp->mode_data.multi.equal_load),
                           rp->mode_data.multi.affinity,
                           (rp->mode_data.multi.equal_load ?
                            CVY_EQUAL_LOAD : rp->mode_data.multi.load));
        }
        else
        {
            /* handle CVY_NEVER mode as multi-server. the check for
               those modes is done before attempting to hash to the bin in
               Load_packet_check and Load_conn_advise so bin distribution plays
               no role in the behavior, but simply allows the rule to be valid
               across all of the operational servers */

            Bin_state_init(lp, bp, i, lp->my_host_id, rp->mode, rp->protocol,
                           TRUE, (USHORT)0, CVY_EQUAL_LOAD);
        }

        ret = Bin_converge(lp, bp, lp->my_host_id);

        if (!ret)
        {
            UNIV_PRINT(("Load_start: initial convergence inconsistent"));
            LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
        }

        /* export current port group state to send msg; the default rule
           carries a zero rule code */

        if (is_default)
            sendp->rcode[i] = 0;
        else
            sendp->rcode[i] = rp->code;

        sendp->cur_map[i]  = bp->cmap;                      /* v2.1 */
        sendp->new_map[i]  = bp->new_map[lp->my_host_id];
        sendp->idle_map[i] = bp->idle_bins;
        sendp->rdy_bins[i] = bp->rdy_bins;
        sendp->load_amt[i] = bp->load_amt[lp->my_host_id];

        /* ###### for keynote - ramkrish */
        sendp->pg_rsvd1[i] = (ULONG)bp->all_idle_map;
    }

    /* initialize send msg; this host starts out as its own master */

    sendp->host_id   = (USHORT)(lp->my_host_id);
    sendp->master_id = (USHORT)(lp->my_host_id);
    sendp->hcode     = paramp->install_date;
    sendp->pkt_count = lp->pkt_count;                       /* 1.32B */

    Univ_ulong_to_str (lp->my_host_id+1, me, 10);

    /* Tracking convergence - Starting convergence because this host is joining the cluster. */
    LOG_MSGS(MSG_INFO_CONVERGING_NEW_MEMBER, me, me);
    TRACE_CONVERGENCE("Initiating convergence on host %d.  Reason: Host %d is joining the cluster.", lp->my_host_id+1, lp->my_host_id+1);

    /* Tracking convergence - Starting convergence. */
    sendp->state = HST_CVG;

    /* activate module */

    lp->active = TRUE;

}  /* end Load_start */
/*
 * One-time set-up of the load module.
 *
 * lp      - load module context to initialize
 * params  - parameter block; its address is stored in the context and its
 *           alive_period seeds the timeouts
 *
 * The lock is initialized and the reference count is zeroed on every call.
 * Everything else (hash entries, queues, descriptor pool, dirty flags,
 * counters, allocation list, parameter pointer) is set up only when the
 * context is not yet marked initialized.  The module is left inactive.
 */
void Load_init(
    PLOAD_CTXT      lp,
    PCVY_PARAMS     params)
{
    ULONG       i;
    PCONN_ENTRY ep;             /* ptr. to connection entry */
    PCONN_DESCR dp;             /* ptr. to connection descriptor */
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    TRACE_INFO("-> Load_init: lp=0x%p, param=0x%p", lp, params);

    LOCK_INIT(&(lp->lock));

    if (lp->initialized)
    {
        UNIV_ASSERT(lp->code == CVY_LOADCODE);	/* (bbain 8/19/99) */
    }
    else
    {
        lp->code = CVY_LOADCODE;	/* (bbain 8/19/99) */

        /* hash table: one embedded entry and one overflow queue per slot */

        for (i = 0; i < CVY_MAX_CHASH; i++)
        {
            ep = &(lp->hashed_conn[i]);

            ep->code  = CVY_ENTRCODE;	/* (bbain 8/19/99) */
            ep->alloc = FALSE;
            ep->dirty = FALSE;                  /* v1.32B */

            CVY_CONN_CLEAR(ep);

            Link_init(&(ep->blink));
            Link_init(&(ep->rlink));            /* V2.1.5 */

            Queue_init(&(lp->connq[i]));
        }

        /* free, dirty and recovery queues start out empty */

        Queue_init(&(lp->conn_freeq));
        Queue_init(&(lp->conn_dirtyq));         /* v1.32B */
        Queue_init(&(lp->conn_rcvryq));         /* V2.1.5 */

        /* put every built-in descriptor on the free queue */

        for (i = 0; i < CVY_INIT_QCONN; i++)
        {
            dp = &(lp->conn_descr[i]);

            dp->code = CVY_DESCCODE;	/* (bbain 8/19/99) */
            Link_init(&(dp->link));

            dp->entry.code  = CVY_ENTRCODE;	/* (bbain 8/21/99) */
            dp->entry.alloc = TRUE;
            dp->entry.dirty = FALSE;            /* v1.32B */

            CVY_CONN_CLEAR(&(dp->entry));

            Link_init(&(dp->entry.blink));
            Link_init(&(dp->entry.rlink));      /* V2.1.5 */

            Queue_enq(&(lp->conn_freeq), &(dp->link));
        }

        /* (v1.32B) nothing is dirty and no cleanup is pending */

        for (i = 0; i < CVY_MAXBINS; i++)
        {
            lp->dirty_bin[i] = FALSE;
        }

        lp->cln_waiting = FALSE;

        lp->cur_timeout = params -> alive_period;
        lp->def_timeout = lp->cur_timeout;

        lp->nqalloc     = 0;
        lp->nconn       = 0;                    /* v2.1 */
        lp->active      = FALSE;
        lp->initialized = TRUE;

        /* clear list of descriptor queue allocations (bbain 2/25/99) */

        for (i = 0; i < CVY_MAX_MAX_DSCR_ALLOCS; i++)
        {
            lp->qalloc_list[i] = (PCONN_DESCR)NULL;
        }

        lp -> params = params;
    }

    /* Initialize the reference count on this load module. */
    lp->ref_count = 0;

    /* don't start module (v1.32B) */

    TRACE_INFO("<- Load_init");

}  /* end Load_init */
/*
 * Release the descriptor blocks recorded in the allocation list.
 *
 * lp - load module context
 *
 * The allocation count is first clamped to the capacity of the list so the
 * loop cannot read past its end.  Each recorded block is freed and its slot
 * is then reset to NULL, so a repeated call cannot free the same block twice.
 *
 * NOTE(review): descriptors carved from these blocks may still be linked on
 * the connection/free queues; this routine does not touch those queues -
 * confirm that callers do not use them after cleanup.
 */
void Load_cleanup(                      /* (bbain 2/25/99) */
    PLOAD_CTXT      lp)
{
    ULONG       i;
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    UNIV_ASSERT(lp->code == CVY_LOADCODE);	/* (bbain 8/19/99) */

    /* free all descriptor queue allocations */

    if (lp->nqalloc > CVY_MAX_MAX_DSCR_ALLOCS)
        lp->nqalloc = CVY_MAX_MAX_DSCR_ALLOCS;

    for (i=0; i<lp->nqalloc; i++)
    {
        if (lp->qalloc_list[i] != (PCONN_DESCR)NULL)
        {
            free((PVOID)(lp->qalloc_list[i]));

            /* forget the freed block so it can never be released again */

            lp->qalloc_list[i] = (PCONN_DESCR)NULL;
        }
    }

}  /* end Load_cleanup */
/*
 * Put this host into the initial convergence state.
 *
 * The outgoing message state becomes HST_CVG with this host as master, the
 * stable map and both stability counters are cleared, and the host is
 * presumed consistent until a heartbeat shows otherwise.
 */
void Load_convergence_start(
    PLOAD_CTXT      lp)
{
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
    PPING_MSG   sendp = &(lp->send_msg);    /* ptr. to my send message */

    /* what we advertise to the other hosts */

    sendp->state     = HST_CVG;
    sendp->master_id = (USHORT)(lp->my_host_id);

    /* local convergence bookkeeping starts from scratch */

    lp->stable_map    = 0;
    lp->my_stable_ct  = 0;
    lp->all_stable_ct = 0;

    lp->consistent = TRUE;                      /* 1.03 */

}  /* end Load_convergence_start */
/*
Process a received heartbeat (ping) message.

lp   - load module context
pmsg - ptr. to the received ping message

Returns without doing anything when the module is inactive or the sender's
host id is not below CVY_MAX_HOSTS.  Otherwise, under the module lock:
- a message carrying our own host id only refreshes our bits in the ping and
host maps; if its hcode differs from ours, duplicate host ids are reported
(once) and convergence is restarted;
- a message with a different rule count is reported (once), restarts
convergence and only adds the sender to the ping and host maps;
- in all other cases mastership, the convergence state, the teaming
consistency and every port group's bin state are updated from the message,
and convergence is started or restarted when a new host, a converging host
or an inconsistency is seen.
*/
void Load_msg_rcv(
PLOAD_CTXT lp,
PPING_MSG pmsg) /* ptr. to ping message */
{
ULONG i;
BOOLEAN consistent;
ULONG my_host;
ULONG rem_host;
ULONG saved_map; /* saved host map */
PPING_MSG sendp; /* ptr. to my send message */
IRQLEVEL irql;
WCHAR me[20];
WCHAR them[20];
ULONG map;
PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
/* Used for tracking convergence and event logging. */
BOOLEAN bInconsistentMaster = FALSE;
BOOLEAN bInconsistentTeaming = FALSE;
BOOLEAN bInconsistentPortRules = FALSE;
UNIV_ASSERT(lp->code == CVY_LOADCODE);
TRACE_HB("Recv HB from host %d", (ULONG) pmsg->host_id + 1);
/* heartbeats are ignored while the module is stopped (checked before taking the lock) */
if (!(lp->active))
return;
my_host = lp->my_host_id;
rem_host = (ULONG) pmsg->host_id;
/* 1-based host ids as strings, used in the event log messages below */
Univ_ulong_to_str (my_host+1, me, 10);
Univ_ulong_to_str (rem_host+1, them, 10);
sendp = &(lp->send_msg);
/* reject host ids that cannot be used as a map bit or array index */
if (rem_host >= CVY_MAX_HOSTS)
return;
LOCK_ENTER(&(lp->lock), &irql);
/* filter out packets broadcast by this host */
if(rem_host == my_host)
{
/* if this packet was really from another host, we have duplicate host ids */
if (sendp->hcode != pmsg->hcode)
{
/* report the duplicate only once until the flag is reset */
if (!(lp->dup_hosts))
{
UNIV_PRINT(("Duplicate host ids detected."));
LOG_MSG(MSG_ERROR_HOST_ID, me);
lp->dup_hosts = TRUE;
}
/* Tracking convergence - Starting convergence because duplicate host IDs were detected in the cluster. */
if (sendp->state == HST_NORMAL) {
LOG_MSGS(MSG_INFO_CONVERGING_DUPLICATE_HOST_ID, me, them);
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d is configured with the same host ID.", my_host+1, rem_host+1);
}
/* Tracking convergence - Starting convergence. */
Load_convergence_start(lp);
}
/* just update ping and host maps for us */
lp->ping_map |= (1 << my_host);
lp->host_map |= (1 << my_host);
LOCK_EXIT(&(lp->lock), irql);
return;
}
/* the per-rule arrays of the two messages can only be compared when both hosts have the same number of rules */
if (sendp->nrules != pmsg->nrules)
{
/* report the mismatch only once until the flag is reset */
if (!(lp->bad_num_rules))
{
UNIV_PRINT(("Host %d: Hosts have diff # rules.", my_host));
LOG_MSG2(MSG_ERROR_RULES_MISMATCH, them, sendp->nrules, pmsg->nrules);
lp->bad_num_rules = TRUE;
}
/* Tracking convergence - Starting convergence because the number of port rules on this host and the remote host do not match. */
if (sendp->state == HST_NORMAL) {
LOG_MSGS(MSG_INFO_CONVERGING_NUM_RULES, me, them);
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d is configured with a conflicting number of port rules.", my_host+1, rem_host+1);
}
/* Tracking convergence - Starting convergence. */
Load_convergence_start(lp);
/* just update ping and host maps for remote host (bbain 2/17/99) */
lp->ping_map |= (1 << rem_host);
lp->host_map |= (1 << rem_host);
LOCK_EXIT(&(lp->lock), irql);
return;
}
/* update mastership and see if consistent */
/* the master is the lowest host id seen; the heartbeat is consistent only if the sender names the same master */
if (rem_host < sendp->master_id)
sendp->master_id = (USHORT)rem_host;
consistent = sendp->master_id == pmsg->master_id; /* 1.03 */
/* For the purposes of logging the reason for convergence, note this inconsistency. */
if (!consistent) bInconsistentMaster = TRUE;
/* update ping and host maps to include remote host */
/* saved_map keeps the host map from before the sender was added, so a newly seen host can be detected below */
lp->ping_map |= (1 << rem_host);
saved_map = lp->host_map;
lp->host_map |= (1 << rem_host);
/* handle host convergence */
if (sendp->state != HST_NORMAL)
{
/* if master, update stable map for remote host */
if (sendp->master_id == my_host)
{
if (pmsg->state == HST_STABLE)
{
lp->stable_map |= (1 << rem_host);
}
else
{
/* a host that is not stable also resets the count of all-stable periods */
lp->stable_map &= ~(1 << rem_host);
lp->all_stable_ct = 0;
}
}
/* otherwise, update state if have global stable convergence and the current
master has signalled completion by returning to the normal state; note
that we must do this prior to updating port group states */
else if (rem_host == sendp->master_id && pmsg->state == HST_NORMAL)
{
if (sendp->state == HST_STABLE)
{
sendp->state = HST_NORMAL;
/* Notify our BDA team that this cluster is consistently configured.
If we are not part of a BDA team, this call is essentially a no-op. */
Load_teaming_consistency_notify(&ctxtp->bda_teaming, TRUE);
/* Reset the bad teaming configuration detected flag if we are converged. */
lp->bad_team_config = FALSE;
/* clear the report-once error flags so a later occurrence is reported again */
lp->dup_hosts = FALSE;
lp->dup_sspri = FALSE;
lp->bad_map = FALSE;
lp->overlap_maps = FALSE;
lp->err_rcving_bins = FALSE;
lp->err_orphans = FALSE;
lp->bad_num_rules = FALSE;
lp->pkt_count = 0; /* v1.32B */
/* commit the converged result for every port group */
for (i=0; i<sendp->nrules; i++)
{
PBIN_STATE bp;
bp = &(lp->pg_state[i]);
bp->compatible = TRUE; /* 1.03 */
Bin_converge_commit(lp, bp, my_host);
UNIV_PRINT(("Host %d pg %d: new cur map %x idle %x all %x",
my_host, i, bp->cur_map[my_host], bp->idle_bins,
bp->all_idle_map));
#if 0 /* 1.03: only update ping message in Load_timeout to avoid locking send */
/* export current port group state */
sendp->cur_map[i] = bp->cmap; /* v2.1 */
sendp->new_map[i] = bp->new_map[my_host];
sendp->idle_map[i] = bp->idle_bins;
sendp->rdy_bins[i] = bp->rdy_bins;
sendp->load_amt[i] = bp->load_amt[my_host];
#endif
}
#if 0
sendp->pkt_count = lp->pkt_count; /* 1.32B */
#endif
UNIV_PRINT(("Host %d: converged as slave", my_host));
/* log convergence completion if host map changed (bbain RTM RC1 6/23/99) */
Load_hosts_query (lp, TRUE, & map);
lp->last_hmap = lp->host_map;
}
else
{
/* Tracking convergence - Starting convergence because the DEFAULT host prematurely ended convergence. In this case, we
are guaranteed to already be in the HST_CVG state, and because this message can be misleading in some circumstances,
we do not log an event. For instance, due to timing issues, when a host joins a cluster he can receive a HST_NORMAL
heartbeat from the DEFAULT host while it is still in the HST_CVG state simply because that heartbeat left the DEFAULT
host before it received our first heartbeat, which initiated convergence. */
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d, the DEFAULT host, prematurely terminated convergence.", my_host+1, rem_host+1);
/* Tracking convergence - Starting convergence. */
Load_convergence_start(lp);
}
}
}
/* Compare the teaming configuration of this host with the remote host. If the
two are inconsistent and we are part of a team, we will initiate convergence. */
if (!Load_teaming_consistency_check(lp->bad_team_config, &ctxtp->bda_teaming, sendp->teaming, pmsg->teaming)) {
/* Only log an event if the teaming configuration was, but is now not, consistent. */
if (!lp->bad_team_config) {
/* Note that we saw this. */
lp->bad_team_config = TRUE;
/* Log the event. */
LOG_MSG(MSG_ERROR_BDA_BAD_TEAM_CONFIG, them);
}
/* Notify the team that this cluster is NOT consistently configured. */
Load_teaming_consistency_notify(&ctxtp->bda_teaming, FALSE);
/* Mark the heartbeats inconsistent to force and retain convergence. */
consistent = FALSE;
/* For the purposes of logging the reason for convergence, note this inconsistency. */
bInconsistentTeaming = TRUE;
}
/* update port group state */
for (i=0; i<sendp->nrules; i++)
{
BOOLEAN ret;
PBIN_STATE bp;
bp = &lp->pg_state[i];
/* if rule codes don't match, print message and handle incompatibility (1.03: note
that we previously marked rule invalid, which would stop processing) */
if (sendp->rcode[i] != pmsg->rcode[i])
{
/* 1.03: if rule was previously compatible, print message */
if (bp->compatible)
{
PCVY_RULE rp;
UNIV_PRINT(("Host %d pg %d: rule codes do not match.", lp->my_host_id, i));
/* bbain 8/27/99 */
LOG_MSG4(MSG_ERROR_RULES_MISMATCH, them, rem_host, i, sendp->rcode[i], pmsg->rcode[i]);
/* Get the port rule information for this rule. */
rp = &lp->params->port_rules[i];
/* Check to see if this is an issue with a win2k host in a cluster utilizing virtual clusters. */
if ((rp->virtual_ip_addr != CVY_ALL_VIP_NUMERIC_VALUE) && ((sendp->rcode[i] ^ ~rp->virtual_ip_addr) == pmsg->rcode[i])) {
UNIV_PRINT((" ** A Windows 2000 or NT4 host MAY be participating in a cluster utilizing virtual cluster support."));
LOG_MSG(MSG_WARN_VIRTUAL_CLUSTERS, MSG_NONE);
}
bp->compatible = FALSE;
}
/* 1.03: mark rule inconsistent to force and continue convergence */
consistent = FALSE;
/* For the purposes of logging the reason for convergence, note this inconsistency. */
bInconsistentPortRules = TRUE;
/* don't update bin state */
continue;
}
/* merge the sender's view of this port group; a FALSE result marks the heartbeat inconsistent */
ret = Bin_host_update(lp, bp, my_host, (BOOLEAN)(sendp->state != HST_NORMAL),
(BOOLEAN)(pmsg->state != HST_NORMAL),
rem_host, pmsg->cur_map[i], pmsg->new_map[i],
pmsg->idle_map[i], pmsg->rdy_bins[i],
pmsg->pkt_count, pmsg->load_amt[i]);
#if 0 /* 1.03: only update ping message in Load_timeout to avoid locking send */
/* export current port group state */
sendp->cur_map[i] = bp->cmap; /* v2.1 */
sendp->new_map[i] = bp->new_map[my_host];
sendp->idle_map[i] = bp->idle_bins;
sendp->rdy_bins[i] = bp->rdy_bins;
sendp->load_amt[i] = bp->load_amt[my_host];
#endif
if (!ret)
consistent = FALSE;
}
/* update our consistency state */
lp->consistent = consistent;
/* if we are in normal operation and we discover a new host or a host goes into
convergence or we discover an inconsistency, go into convergence */
if (sendp->state == HST_NORMAL)
{
if (lp->host_map != saved_map || pmsg->state == HST_CVG || !consistent)
{
/* If a host has joined the cluster, or if inconsistent teaming configuration or port
rules were detected, then we need to log an event. However, we segregate the
inconsistent master host flag because it is set by the initiating host in MANY
occasions, so we want to log the most specific reason(s) for convergence if
possible and only report the inconsistent master detection only if nothing more
specific can be deduced. */
if (lp->host_map != saved_map || bInconsistentTeaming || bInconsistentPortRules) {
/* If the host maps are different, then we know that the host from which we received
this packet is joining the cluster because the ONLY operation on the host map in
this function is to ADD a remote host to our map. Otherwise, if the map has not
changed, then an inconsistent configuration got us into the branch. */
if (lp->host_map != saved_map) {
/* Tracking convergence - Starting convergence because another host is joining the cluster. */
LOG_MSGS(MSG_INFO_CONVERGING_NEW_MEMBER, me, them);
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d is joining the cluster.", my_host+1, rem_host+1);
} else if (bInconsistentTeaming || bInconsistentPortRules) {
/* Tracking convergence - Starting convergence because inconsistent configuration was detected. */
LOG_MSGS(MSG_INFO_CONVERGING_BAD_CONFIG, me, them);
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d has conflicting configuration.", my_host+1, rem_host+1);
}
/* If we have nothing better to report, report convergence for an unspecific reason. */
} else if (bInconsistentMaster || pmsg->state == HST_CVG) {
/* Tracking convergence - Starting convergence for unknown reasons. */
LOG_MSGS(MSG_INFO_CONVERGING_UNKNOWN, me, them);
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d is converging for an unknown reason.", my_host+1, rem_host+1);
}
/* Tracking convergence - Starting convergence. */
Load_convergence_start(lp);
}
}
/* otherwise, if we are in convergence and we see an inconsistency, just restart
our local convergence */
else
{
/* update our consistency state; if we didn't see consistent information,
restart this host's convergence */
if (!consistent)
{
/* Tracking convergence - Starting convergence because inconsistent configuration was detected.
This keeps hosts in a state of convergence when hosts are inconsistently configured. However,
since the cluster is already in a state of convergence (HST_CVG or HST_STABLE), don't log an
event, which may confuse a user. */
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d has conflicting configuration.", my_host+1, rem_host+1);
/* Tracking convergence - Starting convergence. */
/* unlike Load_convergence_start, this restart leaves master_id and the other hosts' stable bits untouched */
sendp->state = HST_CVG;
lp->my_stable_ct = 0;
lp->stable_map &= ~(1 << my_host);
lp->all_stable_ct = 0;
}
}
LOCK_EXIT(&(lp->lock), irql);
} /* end Load_msg_rcv */
/*
 * Return the address of the outgoing ping message embedded in the load
 * module context.  No copy is made and no lock is taken here.
 */
PPING_MSG Load_snd_msg_get(
    PLOAD_CTXT      lp)
{
    PPING_MSG   sendp;          /* ptr. to my send message */

    sendp = &(lp->send_msg);

    return sendp;

}  /* end Load_snd_msg_get */
BOOLEAN Load_timeout(
PLOAD_CTXT lp,
PULONG new_timeout,
PBOOLEAN pconverging,
PULONG pnconn)
/*
Periodic heartbeat timer processing.

lp - load module context
new_timeout - if not NULL, receives the timeout to use for the next period
(half the default timeout while not in the HST_NORMAL state)
pconverging - if not NULL, receives TRUE while the send message state is not
HST_NORMAL (FALSE when the module is inactive)
pnconn - if not NULL, receives the module's connection count

Returns FALSE when the module is inactive; otherwise TRUE if the host map is
non-zero after this period's processing.

Each call runs the pending dirty-connection cleanup when its timeout has
elapsed, counts missed pings per host and drops hosts that reached the missed
ping limit, advances the convergence state machine, and refreshes the
outgoing ping message from the bin states.

Note: we only update ping message in this function since we know that upper level code
sends out ping messages after calling this routine. We cannot be sure that Load_msg_rcv
is sequentialized with sending a message, (1.03)
Upper level code locks this routine wrt Load_msg_rcv, Load_packet_check, and
Load_conn_advise. (1.03)
*/
{
ULONG missed_pings;
ULONG my_host;
ULONG i;
PPING_MSG sendp; /* ptr. to my send message */
IRQLEVEL irql;
ULONG map; /* returned host map from query */
PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
UNIV_ASSERT(lp->code == CVY_LOADCODE); /* (bbain 8/19/99) */
LOCK_ENTER(&(lp->lock), &irql);
/* check for cleanup timeout (v1.32B) */
/* this runs even when the module is inactive; elapsed time is accumulated from the timeout used for the period that just ended */
if (lp->cln_waiting)
{
lp->cur_time += lp->cur_timeout;
if (lp->cur_time >= lp->cln_timeout)
{
Load_conn_cleanup(lp);
lp->cln_waiting = FALSE;
}
}
/* return if not active */
if (!(lp->active))
{
if (new_timeout != NULL)
* new_timeout = lp->cur_timeout = lp->def_timeout;
if (pnconn != NULL) /* v2.1 */
* pnconn = lp->nconn;
if (pconverging != NULL)
* pconverging = FALSE;
LOCK_EXIT(&(lp->lock), irql);
return FALSE;
}
my_host = lp->my_host_id;
sendp = &(lp->send_msg);
/* compute which hosts missed pings and reset ping map */
/* a host missed a ping if it is in the host map but was not seen during the period that just ended */
missed_pings = lp->host_map & (~lp->ping_map);
#ifdef NO_CLEANUP
lp->ping_map = 1 << my_host;
#else
lp->ping_map = 0;
#endif
/* check whether any host is dead, including ourselves */
/* bit 0 of missed_pings always refers to host i; the mask is shifted right at the end of every iteration */
for (i=0; i<CVY_MAX_HOSTS; i++)
{
/* if we have a missed ping for this host, increment count */
if ((missed_pings & 0x1) == 1)
{
lp->nmissed_pings[i]++;
/* if we missed too many pings, declare host dead and force convergence */
if (lp->nmissed_pings[i] == lp->min_missed_pings)
{
ULONG j;
BOOLEAN ret;
WCHAR me[20];
WCHAR them[20];
if (i == my_host)
{
UNIV_PRINT(("Host %d: missed too many pings; this host declared offline", i));
/* reset our packet count since we are likely not to be receiving
packets from others now; this will make us less favored to
handle duplicate bins later (v1.32B) */
lp->pkt_count = 0;
}
/* remove the dead host from the host map */
lp->host_map &= ~(1<<i);
for (j=0; j<sendp->nrules; j++)
{
PBIN_STATE bp;
bp = &(lp->pg_state[j]);
UNIV_ASSERT(bp->code == CVY_BINCODE); /* (bbain 8/19/99) */
if (i == my_host)
{
ULONG k;
/* cleanup connections and restore maps to clean state */
Load_conn_kill(lp, bp);
bp->targ_map = 0;
bp->all_idle_map = BIN_ALL_ONES;
bp->cmap = 0; /* v2.1 */
bp->compatible = TRUE; /* v1.03 */
/* wipe every host's maps; only this host's own load amount is kept */
for (k=0; k<CVY_MAX_HOSTS; k++)
{
bp->new_map[k] = 0;
bp->cur_map[k] = 0;
bp->chk_map[k] = 0;
bp->idle_map[k] = BIN_ALL_ONES;
if (k != i)
bp->load_amt[k] = 0;
}
bp->snd_bins =
bp->rcv_bins =
bp->rdy_bins = 0;
bp->idle_bins = BIN_ALL_ONES;
/* compute initial new map for convergence as only host in cluster
(v 1.3.2B) */
ret = Bin_converge(lp, bp, lp->my_host_id);
if (!ret)
{
UNIV_PRINT(("Load_timeout: initial convergence inconsistent"));
LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
}
}
else
{
/* feed an empty report for the dead host: zero current/new maps, all bins idle, no ready bins, zero packet count and load; the result is not checked */
ret = Bin_host_update(lp, bp, my_host, TRUE, TRUE,
i, 0, 0, BIN_ALL_ONES, 0, 0, 0);
}
}
lp->nmissed_pings[i] = 0;
/* If a host has dropped out of the cluster, then log an event. However, we don't
log an event when we drop out because the only way for us to drop out of our own
cluster is if we are stopping anyway, or if we have lost network connectivity.
Logging such events may be misleading, so we won't bother. */
if (i != my_host) {
Univ_ulong_to_str (my_host+1, me, 10);
Univ_ulong_to_str (i+1, them, 10);
/* Tracking convergence - Starting convergence because a member has fallen out of the cluster. */
LOG_MSGS(MSG_INFO_CONVERGING_MEMBER_LOST, me, them);
TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d is leaving the cluster.", my_host+1, i+1);
}
/* Tracking convergence - Starting convergence. */
Load_convergence_start(lp);
}
}
/* otherwise reset missed ping count */
else
lp->nmissed_pings[i] = 0;
missed_pings >>= 1;
}
/* handle convergence */
if (sendp->state != HST_NORMAL)
{
/* check whether we have been consistent and have received our own pings
for a sufficient period to move to a stable state and announce it to
other hosts */
if (sendp->state == HST_CVG)
{
if (lp->consistent && ((lp->host_map & (1 << my_host)) != 0))
{
lp->my_stable_ct++;
if (lp->my_stable_ct >= lp->min_stable_ct)
{
sendp->state = HST_STABLE;
lp->stable_map |= (1 << my_host);
}
}
else
lp->my_stable_ct = lp->all_stable_ct = 0; /* wlb B3RC1 */
}
/* otherwise, see if we are the master and everybody's been stable for
a sufficient period for us to terminate convergence */
else if (sendp->state == HST_STABLE &&
my_host == sendp->master_id &&
lp->stable_map == lp->host_map)
{
lp->all_stable_ct++;
if (lp->all_stable_ct >= lp->min_stable_ct)
{
sendp->state = HST_NORMAL;
/* Notify our BDA team that this cluster is consistently configured.
If we are not part of BDA team, this call is essentially a no-op. */
Load_teaming_consistency_notify(&ctxtp->bda_teaming, TRUE);
/* Reset the bad teaming configuration detected flag if we are converged. */
lp->bad_team_config = FALSE;
/* clear the report-once error flags so a later occurrence is reported again */
lp->dup_hosts = FALSE;
lp->dup_sspri = FALSE;
lp->bad_map = FALSE;
lp->overlap_maps = FALSE;
lp->err_rcving_bins = FALSE;
lp->err_orphans = FALSE;
lp->bad_num_rules = FALSE;
lp->pkt_count = 0; /* v1.32B */
/* i is reused here; the host loop above has already finished */
for (i=0; i<sendp->nrules; i++)
{
PBIN_STATE bp;
BOOLEAN ret;
bp = &(lp->pg_state[i]);
bp->compatible = TRUE; /* 1.03 */
/* explicitly converge to new map in case we're the only host (v2.06) */
ret = Bin_converge(lp, bp, lp->my_host_id);
if (!ret)
{
UNIV_PRINT(("Load_timeout: final convergence inconsistent"));
LOG_MSG(MSG_ERROR_INTERNAL, MSG_NONE);
}
Bin_converge_commit(lp, bp, my_host);
UNIV_PRINT(("Host %d pg %d: new cur map %x idle %x all %x",
my_host, i, bp->cur_map[my_host], bp->idle_bins,
bp->all_idle_map));
}
UNIV_PRINT(("+++ Host %d: converged as master +++", my_host));
/* log convergence completion if host map changed (bbain RTM RC1 6/23/99) */
Load_hosts_query (lp, TRUE, & map);
lp->last_hmap = lp->host_map;
}
}
}
/* 1.03: update ping message */
for (i=0; i<sendp->nrules; i++)
{
PBIN_STATE bp;
bp = &(lp->pg_state[i]);
/* export current port group state to ping message */
sendp->cur_map[i] = bp->cmap; /* v2.1 */
sendp->new_map[i] = bp->new_map[my_host];
sendp->idle_map[i] = bp->idle_bins;
sendp->rdy_bins[i] = bp->rdy_bins;
sendp->load_amt[i] = bp->load_amt[my_host];
/* ###### for keynote - ramkrish */
sendp->pg_rsvd1[i] = (ULONG)bp->all_idle_map;
}
sendp->pkt_count = lp->pkt_count; /* 1.32B */
/* Add configuration information for teaming at each timeout. */
Load_teaming_code_create(&lp->send_msg.teaming, &ctxtp->bda_teaming);
/* request fast timeout if converging */
/* cur_timeout is updated only when the caller asked for the new timeout */
if (new_timeout != NULL) /* 1.03 */
{
if (sendp->state != HST_NORMAL)
* new_timeout = lp->cur_timeout = lp->def_timeout / 2;
else
* new_timeout = lp->cur_timeout = lp->def_timeout;
}
if (pnconn != NULL) /* v2.1 */
* pnconn = lp->nconn;
if (pconverging != NULL)
* pconverging = (sendp->state != HST_NORMAL);
LOCK_EXIT(&(lp->lock), irql);
/* NOTE(review): host_map is read after the lock has been released - confirm the upper-level serialization described in the header comment covers this read */
return ((lp->host_map) != 0);
} /* end Load_timeout */
PBIN_STATE Load_pg_lookup(
    PLOAD_CTXT      lp,
    ULONG           svr_ipaddr,
    ULONG           svr_port,
    BOOLEAN         is_tcp)
/*
  Find the port group (bin state) that applies to a server address/port.

  lp         - load module context
  svr_ipaddr - server IP address, compared with each rule's virtual IP
  svr_port   - server port, expected to be <= CVY_MAX_PORT
  is_tcp     - TRUE to match rules that are not UDP-only, FALSE to match
               rules that are not TCP-only

  Returns the bin state of the first matching user rule; if none matches,
  the bin state that follows the last scanned user rule, i.e. the default
  rule that Load_start appends.
*/
{
    PCVY_RULE   rp;         /* ptr. to rules array */
    PBIN_STATE  bp;         /* ptr. to bin state */
    ULONG       i;
    ULONG       nurules;    /* # user defined rules */
    PMAIN_CTXT  ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    UNIV_ASSERT(lp->code == CVY_LOADCODE);	/* (bbain 8/19/99) */

    rp      = (* (lp->params)).port_rules;
    nurules = (* (lp->params)).num_rules;

    /* Load_start sets up at most CVY_MAX_RULES bin states and reserves the
       last one for the default rule, so never scan (or fall through to an
       index) beyond CVY_MAX_RULES - 1 even if num_rules is larger */

    if (nurules > (CVY_MAX_RULES - 1))
        nurules = CVY_MAX_RULES - 1;

    /* check for invalid port value (bbain RC1 6/14/99) */

    UNIV_ASSERT(svr_port <= CVY_MAX_PORT);

    /* find server port rule */

    for (i=0; i<nurules; i++)
    {
        /* For virtual clusters: If the server IP address matches the VIP for the port rule,
           or if the VIP for the port rule is "ALL VIPs", and if the port lies in the range
           for this rule, and if the protocol matches, this is the rule.  Notice that this
           give priority to rules for specific VIPs over those for "ALL VIPs", which means
           that this code RELIES on the port rules being sorted by VIP/port where the "ALL
           VIP" ports rules are at the end of the port rule list. */
        if ((svr_ipaddr == rp->virtual_ip_addr || CVY_ALL_VIP_NUMERIC_VALUE == rp->virtual_ip_addr) &&
            (svr_port >= rp->start_port && svr_port <= rp->end_port) &&
            ((is_tcp && rp->protocol != CVY_UDP) || (!is_tcp && rp->protocol != CVY_TCP)))
            break;
        else
            rp++;
    }

    /* use default rule if port not found or rule is invalid; in that case
       i equals the number of user rules scanned, which is the default
       rule's index */

    bp = &(lp->pg_state[i]);
    UNIV_ASSERT(bp->code == CVY_BINCODE);	/* (bbain 8/19/99) */

    return bp;

}  /* end Load_pg_lookup */
BOOLEAN Load_packet_check(
PLOAD_CTXT lp,
ULONG svr_ipaddr,
ULONG svr_port,
ULONG client_ipaddr,
ULONG client_port,
USHORT protocol,
BOOLEAN limit_map_fn)
{
PBIN_STATE bp; /* ptr. to bin state */
ULONG id; /* hash index for the connection */
ULONG bin; /* bin index */
QUEUE * qp; /* ptr. to connection queue */
IRQLEVEL irql;
PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
BOOLEAN is_tcp_pkt = (protocol == TCPIP_PROTOCOL_TCP);
BOOLEAN is_session_pkt;
is_session_pkt = is_tcp_pkt;
if (NLB_IPSEC_SESSION_SUPPORT_ENABLED() && (protocol == TCPIP_PROTOCOL_IPSEC1))
{
is_session_pkt = TRUE;
}
UNIV_ASSERT(lp->code == CVY_LOADCODE); /* (bbain 8/19/99) */
if (! lp -> active)
return FALSE;
lp->pkt_count++; /* increment count of pkts handled (v1.32B) */
bp = Load_pg_lookup(lp, svr_ipaddr, svr_port, is_tcp_pkt);
/* V2.2 make sure that Load_pg_lookup properly handled protocol specific rules */
UNIV_ASSERT ((is_tcp_pkt && bp->prot != CVY_UDP) || (!is_tcp_pkt && bp->prot != CVY_TCP));
/* handle CVY_NEVER mode immediately */
if (bp->mode == CVY_NEVER)
return FALSE;
/* lookup connection entry in hash table */
if (limit_map_fn) {
if (bp->affinity == CVY_AFFINITY_NONE)
id = Map(client_ipaddr, MAP_FN_PARAMETER);
else if (bp->affinity == CVY_AFFINITY_SINGLE)
id = Map(client_ipaddr, MAP_FN_PARAMETER);
else
id = Map(client_ipaddr & TCPIP_CLASSC_MASK, MAP_FN_PARAMETER);
} else {
if (bp->affinity == CVY_AFFINITY_NONE)
id = Map(client_ipaddr, ((svr_port << 16) + client_port));
else if (bp->affinity == CVY_AFFINITY_SINGLE)
id = Map(client_ipaddr, svr_ipaddr);
else
id = Map(client_ipaddr & TCPIP_CLASSC_MASK, svr_ipaddr);
}
/* now hash client address to bin id */
bin = id % CVY_MAXBINS;
LOCK_ENTER(&(lp->lock), &irql);
/* check bin for residency and all other hosts now idle on their bins; in this
case and if we do not have dirty connections, we must be able to handle the packet */
if (((bp->cmap & (((MAP_T) 1) << bin)) != 0) && /* v2.1 */
(!is_session_pkt || (((bp->all_idle_map & (((MAP_T) 1) << bin)) != 0) && (!(lp->cln_waiting))))) /* v1.32B */
{
/* note that we may have missed a connection, but it could also be a stale
packet so we can't start tracking the connection now */
#ifdef TRACE_LOAD
DbgPrint("Host %d: check 1 accepts pkt; rule %d bin %d nconn %d %s port %d\n",
lp->my_host_id, bp->index, bin, bp->nconn[bin], is_tcp_pkt ? "TCP" : "UDP", svr_port);
#endif
LOCK_EXIT(&(lp->lock), irql);
return TRUE;
}
/* otherwise, if we have an active connection for this bin or if we have dirty
connections for this bin and the bin is resident, check for a match */
else if (bp->nconn[bin] > 0 || (lp->cln_waiting && lp->dirty_bin[bin] && ((bp->cmap & (((MAP_T) 1) << bin)) != 0)))
{
PCONN_ENTRY ep; /* ptr. to connection entry */
PCONN_DESCR dp; /* ptr. to connection descriptor */
/* now hash client address to conn. hash table index */
id = id % CVY_MAX_CHASH;
ep = &(lp->hashed_conn[id]);
qp = &(lp->connq[id]);
/* look for a connection match */
if (CVY_CONN_MATCH(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
{
/* if connection was dirty, just block the packet since TCP/IP may have stale
connection state for a previous connection from another host (v1.32B) */
if (ep->dirty)
{
LOCK_EXIT(&(lp->lock), irql);
#ifdef TRACE_DIRTY
DbgPrint ("blocking dirty connection from %d to %d\n", client_port, svr_port);
#endif
return FALSE;
}
#ifdef TRACE_LOAD
DbgPrint("Host %d: check 2 accepts pkt; rule %d bin %d nconn %d %s port %d\n",
lp->my_host_id, bp->index, bin, bp->nconn[bin], is_tcp_pkt ? "TCP" : "UDP", svr_port);
#endif
LOCK_EXIT(&(lp->lock), irql);
return TRUE;
}
else
{
for (dp = (PCONN_DESCR)Queue_front(qp); dp != NULL;
dp = (PCONN_DESCR)Queue_next(qp, &(dp->link)))
{
if (CVY_CONN_MATCH(&(dp->entry), svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
{
/* if connection was dirty, just block the packet since TCP/IP may have
stale connection state for a previous connection from another host
(v1.32B) */
if (dp->entry.dirty)
{
LOCK_EXIT(&(lp->lock), irql);
#ifdef TRACE_DIRTY
DbgPrint ("blocking dirty connection from %d to %d\n", client_port, svr_port);
#endif
return FALSE;
}
#ifdef TRACE_LOAD
DbgPrint("Host %d: check 3 accepts pkt; rule %d bin %d nconn %d %s port %d\n",
lp->my_host_id, bp->index, bin, bp->nconn[bin], is_tcp_pkt ? "TCP" : "UDP", svr_port);
#endif
LOCK_EXIT(&(lp->lock), irql);
return TRUE;
}
}
}
}
LOCK_EXIT(&(lp->lock), irql);
return FALSE;
} /* end Load_packet_check */
/*
 * Function: Load_conn_advise
 * Description: Updates the connection tracking state for one connection tuple
 *              in response to a connection status notification.  For
 *              CVY_CONN_UP a connection entry is set up (or a matching dirty
 *              entry is converted back to a live one) and the host, port rule
 *              and bin connection counts are incremented.  For CVY_CONN_DOWN
 *              and CVY_CONN_RESET on a known connection the entry is released
 *              and the counts are decremented; the first CVY_CONN_DOWN seen on
 *              a TCP connection only increments the entry's fin_count.
 * Parameters: lp - load module context.
 *             svr_ipaddr, svr_port - server side of the connection tuple.
 *             client_ipaddr, client_port - client side of the connection tuple.
 *             protocol - protocol of the connection; compared against
 *                        TCPIP_PROTOCOL_TCP to select the port rule.
 *             conn_status - CVY_CONN_UP, CVY_CONN_DOWN or CVY_CONN_RESET.
 *             limit_map_fn - if TRUE, MAP_FN_PARAMETER is used as the second
 *                            hash argument instead of server information.
 * Returns: BOOLEAN - FALSE if the load module is inactive, the port rule mode
 *          is CVY_NEVER, the bin is not in our current map (and this is not a
 *          DOWN/RESET for a bin with connections), the matching entry is dirty
 *          on DOWN/RESET, or the notification matches none of the handled
 *          cases.  TRUE otherwise, including the cases where a descriptor
 *          could not be obtained for a new connection.
 * Notes: lp->lock is held from the hash table lookup until return.
 */
BOOLEAN Load_conn_advise(
PLOAD_CTXT lp,
ULONG svr_ipaddr,
ULONG svr_port,
ULONG client_ipaddr,
ULONG client_port,
USHORT protocol,
ULONG conn_status,
BOOLEAN limit_map_fn)
{
BOOLEAN match, /* TRUE => we have a record of this connection */
hit; /* TRUE => we have a hash entry hit */
ULONG id; /* hash index for the connection */
ULONG bin; /* bin index */
PBIN_STATE bp; /* ptr. to bin state */
PCONN_ENTRY ep; /* ptr. to connection entry */
PCONN_DESCR dp; /* ptr. to connection descriptor */
QUEUE * qp; /* ptr. to connection queue */
IRQLEVEL irql;
PMAIN_CTXT ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);
BOOLEAN is_tcp_pkt = (protocol == TCPIP_PROTOCOL_TCP);
UNIV_ASSERT(lp->code == CVY_LOADCODE); /* (bbain 8/19/99) */
/* nothing is tracked while the load module is inactive */
if (!lp -> active)
return FALSE;
lp->pkt_count++; /* increment count of pkts handled (v1.32B) */
/* find the port rule state for this server address/port */
bp = Load_pg_lookup(lp, svr_ipaddr, svr_port, is_tcp_pkt);
/* handle CVY_NEVER immediately */
if (bp->mode == CVY_NEVER)
return FALSE;
/* This function is no longer for TCP only. */
if (!NLB_SESSION_SUPPORT_ENABLED())
{
/* This should never happen with session support disabled anyway - Load_pg_lookup() will
NEVER return a UDP only rule when the is_tcp_pkt is TRUE, so this isn't necessary. */
if (bp->prot == CVY_UDP)
return TRUE;
}
/* lookup connection entry in hash table; the hash inputs depend on the
affinity of the port rule and on whether the map function is limited */
if (limit_map_fn) {
if (bp->affinity == CVY_AFFINITY_NONE)
id = Map(client_ipaddr, MAP_FN_PARAMETER);
else if (bp->affinity == CVY_AFFINITY_SINGLE)
id = Map(client_ipaddr, MAP_FN_PARAMETER);
else
id = Map(client_ipaddr & TCPIP_CLASSC_MASK, MAP_FN_PARAMETER);
} else {
if (bp->affinity == CVY_AFFINITY_NONE)
id = Map(client_ipaddr, ((svr_port << 16) + client_port));
else if (bp->affinity == CVY_AFFINITY_SINGLE)
id = Map(client_ipaddr, svr_ipaddr);
else
id = Map(client_ipaddr & TCPIP_CLASSC_MASK, svr_ipaddr);
}
/* now hash client address to bin id and conn. hash table index */
bin = id % CVY_MAXBINS;
id = id % CVY_MAX_CHASH;
/* if this connection is not in our current map and it is not a connection
down notification for a non-idle bin, just filter it out */
/* NOTE(review): bp->cmap and bp->nconn are read here before lp->lock is
acquired below - confirm this is intentional */
if ((bp->cmap & (((MAP_T) 1) << bin)) == 0 && /* v2.1 */
(!((conn_status == CVY_CONN_DOWN || conn_status == CVY_CONN_RESET) && bp->nconn[bin] > 0)))
return FALSE;
/* ep starts at the hash table slot; qp is the descriptor queue for the same index */
ep = &(lp->hashed_conn[id]);
UNIV_ASSERT (ep->code == CVY_ENTRCODE); /* (bbain 8/21/99) */
qp = &(lp->connq[id]);
match = hit = FALSE;
LOCK_ENTER(&(lp->lock), &irql);
/* first try the hash table slot itself */
if (CVY_CONN_MATCH(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
{
hit =
match = TRUE;
}
else
{
/* otherwise walk the queue of descriptors hanging off this hash index */
for (dp = (PCONN_DESCR)Queue_front(qp); dp != NULL;
dp = (PCONN_DESCR)Queue_next(qp, &(dp->link)))
{
if (CVY_CONN_MATCH(&(dp->entry), svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
{
match = TRUE;
UNIV_ASSERT (dp->code == CVY_DESCCODE); /* (bbain 8/19/99) */
ep = &(dp->entry); /* v 2.06 */
UNIV_ASSERT (ep->code == CVY_ENTRCODE); /* (bbain 8/21/99) */
/* release connection descriptor if taking down connection */
if (conn_status == CVY_CONN_DOWN || conn_status == CVY_CONN_RESET)
{
/* if connection was dirty, just block the packet since TCP/IP may have
stale connection state for a previous connection from another host
(v1.32B) */
if (ep->dirty)
{
LOCK_EXIT(&(lp->lock), irql);
#ifdef TRACE_DIRTY
DbgPrint ("blocking dirty FIN from %d to %d\n", client_port, svr_port);
#endif
return FALSE;
}
/* ###### fin count added for keynote - ramkrish. */
/* if first fin, then only increment the count and return TRUE */
if (conn_status == CVY_CONN_DOWN && ep->fin_count == 0 && is_tcp_pkt)
{
ep->fin_count++;
LOCK_EXIT(&(lp->lock), irql);
return TRUE;
}
/* take the descriptor off the port rule, recovery and hash queues and
return it to the free queue; the counters are adjusted further below */
Link_unlink(&(dp->entry.blink));
Link_unlink(&(dp->entry.rlink)); /* V2.1.5 */
Link_unlink(&(dp->link));
Queue_enq(&(lp->conn_freeq), &(dp->link));
}
break;
}
}
}
/* if we see a new connection, handle it */
if (conn_status == CVY_CONN_UP)
{
/* if we don't have a connection match, setup a new connection entry */
if (!match)
{
/* if the hash table slot is already in use, setup and enqueue a descriptor instead */
if (CVY_CONN_IN_USE(ep))
{
dp = (PCONN_DESCR)Queue_deq(&(lp->conn_freeq));
if (dp == NULL)
{
/* allocate new queue descriptors if allowed */
if (lp->nqalloc < lp->max_dscr_allocs)
{
UNIV_PRINT(("Load_conn_advise: %d/%d allocating %d descriptors", lp->nqalloc, lp->max_dscr_allocs, lp->dscr_per_alloc));
/* the allocation is recorded in qalloc_list; nqalloc is only advanced on success */
lp->qalloc_list[lp->nqalloc] = /* (bbain 2/25/99) */
dp = (PCONN_DESCR)malloc((lp->dscr_per_alloc) * sizeof(CONN_DESCR));
if (dp != NULL)
{
ULONG i;
PCONN_DESCR tp;
QUEUE * fqp;
lp->nqalloc++;
/* initialize and link up descriptors; save first descriptor
for our use */
dp->code = CVY_DESCCODE; /* (bbain 8/19/99) */
Link_init(&(dp->link));
ep = &(dp->entry); /* (bbain 8/21/99) */
ep->code = CVY_ENTRCODE; /* (bbain 8/19/99) */
ep->alloc = TRUE;
ep->dirty = FALSE; /* v1.32B */
CVY_CONN_CLEAR(&(dp->entry));
Link_init(&(dp->entry.blink));
Link_init(&(dp->entry.rlink)); /* V2.1.5 */
/* the remaining dscr_per_alloc - 1 descriptors go onto the free queue */
tp = dp + 1;
fqp = &(lp->conn_freeq);
for (i=1; i<lp->dscr_per_alloc; i++)
{
tp->code = CVY_DESCCODE; /* (bbain 8/19/99) */
Link_init(&(tp->link));
tp->entry.code = CVY_ENTRCODE; /* (bbain 8/19/99) */
tp->entry.alloc = TRUE;
tp->entry.dirty = FALSE; /* v1.32B */
CVY_CONN_CLEAR(&(tp->entry));
Link_init(&(tp->entry.blink));
Link_init(&(tp->entry.rlink)); /* V2.1.5 */
Queue_enq(fqp, &(tp->link));
tp++;
}
}
else
{
/* allocation failed; the message is logged only the first time,
and TRUE is returned without recording the connection */
if (!(lp->alloc_failed))
{
UNIV_PRINT(("Load_conn_advise: error allocating conn descrs"));
LOG_MSG(MSG_ERROR_MEMORY, MSG_NONE);
lp->alloc_failed = TRUE;
}
LOCK_EXIT(&(lp->lock), irql);
return TRUE;
}
}
else
{
/* V2.1.5 - if reached allocation limit - start taking
connection descriptors from the recover queue since
they are likely to be stale and very old */
PBIN_STATE rbp;
LINK * rlp;
#ifdef TRACE_RCVRY
DbgPrint ("Host %d: taking connection from recovery queue\n", lp->my_host_id);
#endif
rlp = (LINK *)Queue_deq(&(lp->conn_rcvryq));
UNIV_ASSERT (rlp != NULL);
/* this should not happen at all but protect anyway */
if (rlp == NULL)
{
/* the warning is logged only the first time; TRUE is returned
without recording the connection */
if (!(lp->alloc_inhibited))
{
UNIV_PRINT(("Host %d: cannot allocate conn descriptors.", lp->my_host_id));
LOG_MSG(MSG_WARN_DESCRIPTORS, CVY_NAME_MAX_DSCR_ALLOCS);
lp->alloc_inhibited = TRUE;
}
LOCK_EXIT(&(lp->lock), irql);
return TRUE;
}
/* recover the connection entry that contains this recovery link */
ep = STRUCT_PTR(rlp, CONN_ENTRY, rlink);
UNIV_ASSERT (ep->code == CVY_ENTRCODE); /* (bbain 8/19/99) */
/* fixed for nt4/sp5 */
if (ep->alloc)
{
/* unlink allocated descriptors from the hash table
queue if necessary and set dp so that code below
will put it back in the right hash queue */
dp = STRUCT_PTR(ep, CONN_DESCR, entry);
UNIV_ASSERT (dp->code == CVY_DESCCODE); /* (bbain 8/19/99) */
Link_unlink(&(dp->link));
}
else
{
/* entry is not an allocated descriptor, so there is nothing to re-queue */
dp = NULL; /* (bbain 8/21/99) */
}
/* dirty connections are not counted */
if (! ep->dirty)
{
/* find out which port group we are on so we can clean
up its counters */
/* NOTE(review): this lookup passes is_tcp_pkt of the connection being
advised, not the protocol of the recovered entry - confirm */
rbp = Load_pg_lookup(lp, ep->svr_ipaddr, ep->svr_port, is_tcp_pkt);
/* decrement the host, bin and rule counts, clamping each at zero */
if (lp->nconn <= 0)
lp->nconn = 0;
else
lp->nconn--;
if (rbp->nconn[ep->bin] <= 0)
rbp->nconn[ep->bin] = 0;
else
{
rbp->nconn[ep->bin]--;
}
if (rbp->tconn <= 0)
rbp->tconn = 0;
else
rbp->tconn--;
/* the bin is marked idle once its connection count reaches zero */
if (rbp->nconn[ep->bin] == 0)
{
rbp->idle_bins |= (((MAP_T) 1) << ep->bin);
}
}
/* detach the recovered entry from its port rule queue and reset it */
Link_unlink(&(ep->blink));
CVY_CONN_CLEAR(ep);
ep->dirty = FALSE;
}
}
/* else dp is not NULL, so setup entry pointer */
else
{
ep = &(dp->entry);
UNIV_ASSERT (ep->code == CVY_ENTRCODE); /* (bbain 8/21/99) */
}
/* enqueue descriptor in hash table unless it's already a hash table entry
(V2.1.5 recovered connection might be in hash table, so make
sure we do not end up queueing it) */
if (dp != NULL)
{
UNIV_ASSERT (dp->code == CVY_DESCCODE); /* (bbain 8/19/99) */
/* enqueue new queue descriptor and setup entry pointer */
Queue_enq(qp, &(dp->link));
}
}
/* setup new entry */
UNIV_ASSERT (ep->code == CVY_ENTRCODE); /* (bbain 8/21/99) */
CVY_CONN_SET(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);
ep->bin = (UCHAR)bin;
/* ###### fin count added for keynote - ramkrish */
/* initialize the fin count to 0 for a new connection */
ep->fin_count = 0;
/* enqueue entry into port group queue */
Queue_enq(&(bp->connq), &(ep->blink));
/* V2.1.5 add entry to the tail of connection recovery queue */
Queue_enq(&(lp->conn_rcvryq), &(ep->rlink));
/* increment # connections and mark bin not idle if necessary */
lp->nconn++; /* v2.1 */
bp->tconn++;
bp->nconn[bin]++;
if (bp->nconn[bin] == 1)
bp->idle_bins &= ~(((MAP_T) 1) << bin);
#ifdef TRACE_LOAD
DbgPrint("Host %d: advise starts conn; rule %d bin %d nconn %d\n",
lp->my_host_id, bp->index, bin, bp->nconn[bin]);
#endif
}
/* otherwise, we have a match; clean up conn entry if dirty since we have a
new connection, although TCP/IP will likely reject it if it has stale state
from another connection (v1.32B) */
else
{
if (ep->dirty)
{
#ifdef TRACE_DIRTY
DbgPrint ("converting dirty SYN from %d to %d\n", client_port, svr_port);
#endif
UNIV_ASSERT (ep->code == CVY_ENTRCODE); /* (bbain 8/21/99) */
ep->dirty = FALSE;
/* ###### initialize fin count for this new connection added for keynote - ramkrish */
/* ###### since we are reusing a dirty connection desc for a new conn., it needs to be reset */
ep->fin_count = 0;
UNIV_ASSERT (ep->bin == (USHORT)bin);
/* unlink and enqueue entry into port group queue */
Link_unlink(&(ep->blink));
Queue_enq(&(bp->connq), &(ep->blink));
/* increment # connections and mark bin not idle if necessary */
lp->nconn++; /* v2.1 */
bp->tconn++;
bp->nconn[bin]++;
if (bp->nconn[bin] == 1)
bp->idle_bins &= ~(((MAP_T) 1) << bin);
}
}
}
/* otherwise, if a known connection is going down, remove our connection entry */
/* ###### check for reset added for keynote - ramkrish */
else if ((conn_status == CVY_CONN_DOWN || conn_status == CVY_CONN_RESET) && match)
{
/* if connection was dirty, just block the packet since TCP/IP may have stale
connection state for a previous connection from another host (v1.32B) */
if (ep->dirty)
{
LOCK_EXIT(&(lp->lock), irql);
#ifdef TRACE_DIRTY
DbgPrint ("blocking dirty FIN from %d to %d\n", client_port, svr_port);
#endif
return FALSE;
}
/* ###### fin count added for keynote - ramkrish */
/* if this is the first fin, then simply increment the fincount and return */
if (conn_status == CVY_CONN_DOWN && ep->fin_count == 0 && is_tcp_pkt)
{
ep->fin_count++;
LOCK_EXIT(&(lp->lock), irql);
return TRUE;
}
/* clear hash table entry if we had a hit; enqueued entry was already freed */
if (hit)
{
CVY_CONN_CLEAR(ep);
/* ###### clear fin count for keynote - ramkrish */
ep->fin_count = 0;
Link_unlink(&(ep->rlink)); /* V2.1.5 */
Link_unlink(&(ep->blink));
}
/* decrement # connections and mark bin idle if necessary */
#if 0
if (bp->nconn[bin] <= 0)
DbgPrint("WLBS: Load_conn_advise: count was zero %d %d\n", bin, bp->nconn[bin]);
#endif
UNIV_ASSERT(bp->nconn[bin] > 0 && bp->tconn > 0 && lp->nconn > 0);
/* each counter is clamped at zero rather than decremented below it */
if (lp->nconn <= 0) /* v2.1 */
lp->nconn = 0;
else
lp->nconn--;
if (bp->nconn[bin] <= 0) /* correct bad (negative) bin count */
bp->nconn[bin] = 0;
else
bp->nconn[bin]--;
if (bp->tconn <= 0)
bp->tconn = 0;
else
bp->tconn--;
if (bp->nconn[bin] == 0)
{
bp->idle_bins |= (((MAP_T) 1) << bin);
}
#ifdef TRACE_LOAD
DbgPrint("Host %d: advise removes conn; rule %d bin %d nconn %d\n",
lp->my_host_id, bp->index, bin, bp->nconn[bin]);
#endif
}
/* any other combination (e.g. DOWN/RESET with no matching entry) is rejected */
else
{
LOCK_EXIT(&(lp->lock), irql);
return FALSE;
}
LOCK_EXIT(&(lp->lock), irql);
return TRUE;
} /* end Load_conn_advise */
/*
 * Function: Load_create_dscr
 * Description: Makes sure a connection entry exists for the given connection
 *              tuple.  If no entry matches, one is set up (in the hash table
 *              slot, from the free queue, from a fresh allocation, or taken
 *              from the recovery queue) and the host, port rule and bin
 *              connection counts are incremented.  If a matching entry exists
 *              and is dirty, it is converted back to a live connection and
 *              counted; a matching clean entry is left untouched.
 * Parameters: lp - load module context.
 *             svr_ipaddr, svr_port - server side of the connection tuple.
 *             client_ipaddr, client_port - client side of the connection tuple.
 *             protocol - protocol of the connection; compared against
 *                        TCPIP_PROTOCOL_TCP to select the port rule.
 *             limit_map_fn - if TRUE, MAP_FN_PARAMETER is used as the second
 *                            hash argument instead of server information.
 * Returns: BOOLEAN - FALSE if the load module is inactive or if a descriptor
 *          was needed but could not be obtained; TRUE otherwise.
 * Author: shouse, 5.18.01
 * Notes: NOTE(review): unlike Load_conn_advise, this function does not
 *        acquire lp->lock itself - presumably the caller serializes access;
 *        confirm against the call sites.
 */
BOOLEAN Load_create_dscr(
    PLOAD_CTXT      lp,
    ULONG           svr_ipaddr,
    ULONG           svr_port,
    ULONG           client_ipaddr,
    ULONG           client_port,
    USHORT          protocol,
    BOOLEAN         limit_map_fn)
{
    BOOLEAN         match = FALSE;  /* TRUE => we have a record of this connection. */
    ULONG           id;             /* Hash index for the connection. */
    ULONG           bin;            /* Bin index. */
    PBIN_STATE      bp;             /* Pointer to bin state. */
    PCONN_ENTRY     ep;             /* Pointer to connection entry. */
    PCONN_DESCR     dp;             /* Pointer to connection descriptor. */
    QUEUE *         qp;             /* Pointer to connection queue. */
    PMAIN_CTXT      ctxtp = CONTAINING_RECORD(lp, MAIN_CTXT, load);
    BOOLEAN         is_tcp_pkt = (protocol == TCPIP_PROTOCOL_TCP);

    UNIV_ASSERT(lp->code == CVY_LOADCODE);

    /* Nothing is tracked while the load module is inactive. */
    if (!lp->active)
        return FALSE;

    /* Increment count of packets handled. */
    lp->pkt_count++;

    /* Find the port rule for this connection. */
    bp = Load_pg_lookup(lp, svr_ipaddr, svr_port, is_tcp_pkt);

    /* Hash.  The inputs depend on the affinity of the port rule and on whether
       the map function is limited. */
    if (limit_map_fn) {
        if (bp->affinity == CVY_AFFINITY_NONE)
            id = Map(client_ipaddr, MAP_FN_PARAMETER);
        else if (bp->affinity == CVY_AFFINITY_SINGLE)
            id = Map(client_ipaddr, MAP_FN_PARAMETER);
        else
            id = Map(client_ipaddr & TCPIP_CLASSC_MASK, MAP_FN_PARAMETER);
    } else {
        if (bp->affinity == CVY_AFFINITY_NONE)
            id = Map(client_ipaddr, ((svr_port << 16) + client_port));
        else if (bp->affinity == CVY_AFFINITY_SINGLE)
            id = Map(client_ipaddr, svr_ipaddr);
        else
            id = Map(client_ipaddr & TCPIP_CLASSC_MASK, svr_ipaddr);
    }

    /* Hash client address to bin id and connection hash table index. */
    bin = id % CVY_MAXBINS;
    id = id % CVY_MAX_CHASH;

    /* Get a pointer to the connection entry for this hash ID. */
    ep = &(lp->hashed_conn[id]);

    UNIV_ASSERT (ep->code == CVY_ENTRCODE);

    /* Get a pointer to the connection queue. */
    qp = &(lp->connq[id]);

    if (CVY_CONN_MATCH(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
    {
        /* Note that we found a match for this tuple. */
        match = TRUE;
    } else {
        /* No match in the hash table slot, so walk the descriptor queue for this index. */
        for (dp = (PCONN_DESCR)Queue_front(qp); dp != NULL; dp = (PCONN_DESCR)Queue_next(qp, &(dp->link))) {
            if (CVY_CONN_MATCH(&(dp->entry), svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
            {
                /* Note that we found a match for this tuple. */
                match = TRUE;

                UNIV_ASSERT (dp->code == CVY_DESCCODE);

                /* Get a pointer to the connection entry. */
                ep = &(dp->entry);

                UNIV_ASSERT (ep->code == CVY_ENTRCODE);

                break;
            }
        }
    }

    /* If we don't have a connection match, setup a new connection entry. */
    if (!match) {
        /* If the hash table slot is already in use, setup and enqueue a descriptor instead. */
        if (CVY_CONN_IN_USE(ep)) {
            /* Get a pointer to a free descriptor. */
            dp = (PCONN_DESCR)Queue_deq(&(lp->conn_freeq));

            if (dp == NULL) {
                /* Allocate new queue descriptors if allowed. */
                if (lp->nqalloc < lp->max_dscr_allocs) {
                    UNIV_PRINT(("Load_create_dscr: %d/%d allocating %d descriptors", lp->nqalloc, lp->max_dscr_allocs, lp->dscr_per_alloc));

                    /* The allocation is recorded in qalloc_list; nqalloc is only advanced on success. */
                    dp = lp->qalloc_list[lp->nqalloc] = (PCONN_DESCR)malloc((lp->dscr_per_alloc) * sizeof(CONN_DESCR));

                    if (dp != NULL) {
                        ULONG       i;
                        PCONN_DESCR tp;
                        QUEUE *     fqp;

                        /* Increment the counter for number of allocations. */
                        lp->nqalloc++;

                        /* Initialize and link up descriptors; save first descriptor for our use. */
                        dp->code = CVY_DESCCODE;
                        Link_init(&(dp->link));

                        /* Initialize the connection entry. */
                        ep = &(dp->entry);
                        ep->code = CVY_ENTRCODE;
                        ep->alloc = TRUE;
                        ep->dirty = FALSE;

                        /* Mark this entry unused. */
                        CVY_CONN_CLEAR(&(dp->entry));

                        Link_init(&(dp->entry.blink));
                        Link_init(&(dp->entry.rlink));

                        tp = dp + 1;
                        fqp = &(lp->conn_freeq);

                        /* Initialize the remaining descriptors and tack them on the free queue. */
                        for (i = 1; i < lp->dscr_per_alloc; i++, tp++) {
                            /* Initialize the descriptor. */
                            tp->code = CVY_DESCCODE;
                            Link_init(&(tp->link));

                            /* Initialize the connection entry. */
                            tp->entry.code = CVY_ENTRCODE;
                            tp->entry.alloc = TRUE;
                            tp->entry.dirty = FALSE;

                            /* Mark this entry unused. */
                            CVY_CONN_CLEAR(&(tp->entry));

                            Link_init(&(tp->entry.blink));
                            Link_init(&(tp->entry.rlink));

                            /* Queue the descriptor onto the free queue. */
                            Queue_enq(fqp, &(tp->link));
                        }
                    } else {
                        /* Allocation failed, log a message (first failure only) and bail out. */
                        if (!(lp->alloc_failed)) {
                            UNIV_PRINT(("Load_create_dscr: error allocating conn descrs"));
                            LOG_MSG(MSG_ERROR_MEMORY, MSG_NONE);
                            lp->alloc_failed = TRUE;
                        }

                        return FALSE;
                    }
                } else {
                    /* If we have reached the allocation limit, start taking connection descriptors
                       from the recover queue since they are likely to be stale and very old. */
                    PBIN_STATE  rbp;
                    LINK *      rlp;

#ifdef TRACE_RCVRY
                    DbgPrint ("Host %d: taking connection from recovery queue\n", lp->my_host_id);
#endif

                    /* Dequeue a descriptor from the recovery queue. */
                    rlp = (LINK *)Queue_deq(&(lp->conn_rcvryq));

                    UNIV_ASSERT (rlp != NULL);

                    /* This should not happen at all but protect anyway. */
                    if (rlp == NULL) {
                        /* Unable to get a descriptor, log a message (first time only) and bail out. */
                        if (!(lp->alloc_inhibited)) {
                            UNIV_PRINT(("Host %d: cannot allocate conn descriptors.", lp->my_host_id));
                            LOG_MSG(MSG_WARN_DESCRIPTORS, CVY_NAME_MAX_DSCR_ALLOCS);
                            lp->alloc_inhibited = TRUE;
                        }

                        return FALSE;
                    }

                    /* Grab a pointer to the connection entry that contains this recovery link. */
                    ep = STRUCT_PTR(rlp, CONN_ENTRY, rlink);

                    UNIV_ASSERT (ep->code == CVY_ENTRCODE);

                    if (ep->alloc) {
                        /* Unlink allocated descriptors from the hash table queue if necessary
                           and set dp so that code below will put it back in the right hash queue. */
                        dp = STRUCT_PTR(ep, CONN_DESCR, entry);

                        UNIV_ASSERT (dp->code == CVY_DESCCODE);

                        Link_unlink(&(dp->link));
                    } else {
                        /* The entry is not an allocated descriptor, so there is nothing to re-queue. */
                        dp = NULL;
                    }

                    /* Dirty connections are not counted, so we don't need to update these counters. */
                    if (! ep->dirty) {
                        /* Find out which port group we are on so we can clean up its counters. */
                        /* NOTE(review): this lookup passes is_tcp_pkt of the connection being
                           created, not the protocol of the recovered entry - confirm. */
                        rbp = Load_pg_lookup(lp, ep->svr_ipaddr, ep->svr_port, is_tcp_pkt);

                        /* Decrement the host, bin and rule counts, clamping each at zero. */
                        if (lp->nconn <= 0)
                            lp->nconn = 0;
                        else
                            lp->nconn--;

                        if (rbp->nconn[ep->bin] <= 0)
                            rbp->nconn[ep->bin] = 0;
                        else
                            rbp->nconn[ep->bin]--;

                        if (rbp->tconn <= 0)
                            rbp->tconn = 0;
                        else
                            rbp->tconn--;

                        /* The bin is marked idle once its connection count reaches zero. */
                        if (rbp->nconn[ep->bin] == 0)
                            rbp->idle_bins |= (((MAP_T) 1) << ep->bin);
                    }

                    Link_unlink(&(ep->blink));

                    /* Mark the descriptor as unused. */
                    CVY_CONN_CLEAR(ep);

                    /* Mark the descriptor as clean. */
                    ep->dirty = FALSE;
                }
            } else {
                /* There was a free descriptor, so setup the connection entry pointer. */
                ep = &(dp->entry);

                UNIV_ASSERT (ep->code == CVY_ENTRCODE);
            }

            /* Enqueue descriptor in hash table unless it's already a hash table entry (a recovered
               connection might be in hash table, so make sure we do not end up queueing it) */
            if (dp != NULL) {
                UNIV_ASSERT (dp->code == CVY_DESCCODE);

                Queue_enq(qp, &(dp->link));
            }
        }

        UNIV_ASSERT (ep->code == CVY_ENTRCODE);

        /* Setup a new entry. */
        CVY_CONN_SET(ep, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol);

        ep->bin = (UCHAR)bin;

        /* Initialize the fin count to 0 for a new connection. */
        ep->fin_count = 0;

        /* Enqueue entry into port group queue. */
        Queue_enq(&(bp->connq), &(ep->blink));

        /* Add entry to the tail of connection recovery queue. */
        Queue_enq(&(lp->conn_rcvryq), &(ep->rlink));

        /* Increment number of connections and mark bin not idle if necessary. */
        lp->nconn++;
        bp->tconn++;
        bp->nconn[bin]++;

        if (bp->nconn[bin] == 1) bp->idle_bins &= ~(((MAP_T) 1) << bin);

#ifdef TRACE_LOAD
        DbgPrint("Host %d: advise starts conn; rule %d bin %d nconn %d\n",
                 lp->my_host_id, bp->index, bin, bp->nconn[bin]);
#endif
    } else {
        /* We have a match.  Clean up connection entry if it's dirty since we have a new connection,
           although TCP/IP will likely reject it if it has stale state from another connection. */
        if (ep->dirty) {
#ifdef TRACE_DIRTY
            DbgPrint ("converting dirty SYN from %d to %d\n", client_port, svr_port);
#endif

            UNIV_ASSERT (ep->code == CVY_ENTRCODE);

            ep->dirty = FALSE;
            ep->fin_count = 0;

            UNIV_ASSERT (ep->bin == (USHORT)bin);

            /* Unlink and enqueue entry into port group queue. */
            Link_unlink(&(ep->blink));
            Queue_enq(&(bp->connq), &(ep->blink));

            /* Increment # connections and mark bin not idle if necessary. */
            lp->nconn++;
            bp->tconn++;
            bp->nconn[bin]++;

            if (bp->nconn[bin] == 1) bp->idle_bins &= ~(((MAP_T) 1) << bin);
        }
    }

    return TRUE;
}
/*
 * Function: Load_port_change
 * Description: Applies a port rule control command (enable, disable, drain,
 *              set load weight, or the cluster-wide drain/plug variants) to
 *              every port rule selected by ipaddr and port, and starts
 *              convergence if any rule's load weight actually changed.
 * Parameters: lp - load module context.
 *             ipaddr - virtual IP address to match, or IOCTL_ALL_VIPS.
 *             port - port to match against each rule's range, or IOCTL_ALL_PORTS.
 *             cmd - one of the IOCTL_CVY_PORT_xxx / IOCTL_CVY_CLUSTER_xxx commands.
 *             value - new load weight; only used for IOCTL_CVY_PORT_SET.
 * Returns: ULONG - IOCTL_CVY_NOT_FOUND if the module is inactive or no rule
 *          matched, IOCTL_CVY_ALREADY if every matching rule was already in
 *          the requested state, IOCTL_CVY_OK if at least one rule was changed.
 */
ULONG Load_port_change(
    PLOAD_CTXT      lp,
    ULONG           ipaddr,
    ULONG           port,
    ULONG           cmd,
    ULONG           value)
{
    PMAIN_CTXT      ctxtp = CONTAINING_RECORD(lp, MAIN_CTXT, load);
    PCVY_RULE       rule_table;     /* First configured port rule. */
    ULONG           rule_count;     /* Number of rules to examine. */
    ULONG           idx;
    ULONG           status = IOCTL_CVY_NOT_FOUND;

    UNIV_ASSERT(lp->code == CVY_LOADCODE);

    if (! lp->active)
        return status;

    rule_table = (* (lp->params)).port_rules;

    /* The DEFAULT rule is stored after the user-defined rules; it takes part
       only when the whole cluster is being drained or plugged. */
    rule_count = (* (lp->params)).num_rules;

    if (cmd == IOCTL_CVY_CLUSTER_DRAIN || cmd == IOCTL_CVY_CLUSTER_PLUG)
        rule_count = rule_count + 1;

    for (idx = 0; idx < rule_count; idx++)
    {
        PCVY_RULE   rule = rule_table + idx;
        PBIN_STATE  state;
        BOOLEAN     unchanged;

        /* Skip rules for other VIPs.  IOCTL_ALL_VIPS (0x00000000) selects
           every rule regardless of VIP; an "ALL VIP" rule is addressed by the
           caller with CVY_ALL_VIP_NUMERIC_VALUE (0xffffffff), which is the
           same value stored in the rule, so plain equality covers it too. */
        if (ipaddr != IOCTL_ALL_VIPS && ipaddr != rule->virtual_ip_addr)
            continue;

        /* Skip rules whose port range does not cover the requested port. */
        if (port != IOCTL_ALL_PORTS && (port < rule->start_port || port > rule->end_port))
            continue;

        state = &(lp->pg_state[idx]);

        UNIV_ASSERT(state->code == CVY_BINCODE);

        /* Step 1: is this rule already in the state the command asks for?
           Enabling restores the original weight, disabling and draining
           target zero, and anything else must be a "set" to 'value'. */
        if (cmd == IOCTL_CVY_PORT_ON || cmd == IOCTL_CVY_CLUSTER_PLUG)
        {
            unchanged = (state->load_amt[lp->my_host_id] == state->orig_load_amt);
        }
        else if (cmd == IOCTL_CVY_PORT_OFF || cmd == IOCTL_CVY_PORT_DRAIN || cmd == IOCTL_CVY_CLUSTER_DRAIN)
        {
            unchanged = (state->load_amt[lp->my_host_id] == 0);
        }
        else
        {
            UNIV_ASSERT(cmd == IOCTL_CVY_PORT_SET);

            unchanged = (state->load_amt[lp->my_host_id] == value);
        }

        if (unchanged)
        {
            /* Report "Already" only if no rule has matched before; with
               ALL_VIPS or ALL_PORTS an earlier rule's "OK" must survive. */
            if (status == IOCTL_CVY_NOT_FOUND)
                status = IOCTL_CVY_ALREADY;

            continue;
        }

        /* Step 2: apply the change. */
        if (cmd == IOCTL_CVY_PORT_ON || cmd == IOCTL_CVY_CLUSTER_PLUG)
        {
            /* Restore the original load amount. */
            state->load_amt[lp->my_host_id] = state->orig_load_amt;
        }
        else if (cmd == IOCTL_CVY_PORT_OFF)
        {
            state->load_amt[lp->my_host_id] = 0;

            /* Immediately stop handling all traffic on the port group. */
            state->cmap = 0;
            state->cur_map[lp->my_host_id] = 0;

            Load_conn_kill(lp, state);
        }
        else if (cmd == IOCTL_CVY_PORT_DRAIN || cmd == IOCTL_CVY_CLUSTER_DRAIN)
        {
            /* Set load weight to zero, but continue to handle existing connections. */
            state->load_amt[lp->my_host_id] = 0;
        }
        else
        {
            /* Set the load weight for this port rule. */
            state->orig_load_amt = value;
            state->load_amt[lp->my_host_id] = value;
        }

        status = IOCTL_CVY_OK;

        /* A fully specified VIP and port selects a single rule, so stop here. */
        if (port != IOCTL_ALL_PORTS && ipaddr != IOCTL_ALL_VIPS)
            break;
    }

    /* If the cluster isn't already converging, then initiate convergence if the
       load weight of a port rule has been modified. */
    if (status == IOCTL_CVY_OK && lp->send_msg.state != HST_CVG)
    {
        WCHAR host_str[20];

        Univ_ulong_to_str (lp->my_host_id+1, host_str, 10);

        /* Tracking convergence - Starting convergence because our port rule configuration has changed. */
        LOG_MSGS(MSG_INFO_CONVERGING_NEW_RULES, host_str, host_str);
        TRACE_CONVERGENCE("Initiating convergence on host %d. Reason: Host %d has changed its port rule configuration.", lp->my_host_id+1, lp->my_host_id+1);

        /* Tracking convergence. */
        Load_convergence_start(lp);
    }

    return status;
} /* end Load_port_change */
/*
 * Function: Load_hosts_query
 * Description: Reports the current host map and whether this host is
 *              converging, converged as the default host, or converged as a
 *              non-default host.  When 'internal' is TRUE the result is also
 *              logged, together with this host's priority and the list of
 *              host IDs present in the map.
 * Parameters: lp - load module context.
 *             internal - TRUE to log the state in addition to returning it.
 *             host_map - receives a copy of lp->host_map.
 * Returns: ULONG - IOCTL_CVY_CONVERGING if the send message state is not
 *          HST_NORMAL; IOCTL_CVY_MASTER if this host currently has bins for
 *          the default rule; IOCTL_CVY_SLAVE otherwise.
 */
ULONG Load_hosts_query(
    PLOAD_CTXT      lp,
    BOOLEAN         internal,
    PULONG          host_map)
{
    WCHAR           buf1 [256];     /* comma separated IDs of hosts 1-16 in the map */
    WCHAR           buf2 [256];     /* comma separated IDs of hosts 17-32 in the map */
    PWCHAR          ptr1 = buf1;
    PWCHAR          ptr2 = buf2;
    WCHAR           num [20];       /* this host's priority as a string (v2.1) */
    ULONG           i, j, k;
    PMAIN_CTXT      ctxtp = CONTAINING_RECORD (lp, MAIN_CTXT, load);

    UNIV_ASSERT(lp->code == CVY_LOADCODE);  /* (bbain 8/19/99) */

    buf1 [0] = 0;
    buf2 [0] = 0;
    num [0] = 0;

    /* List the hosts found in the low 16 bits of the map; j counts them.  The
       bit mask is built from an unsigned one so that testing bit 31 in the
       second loop does not shift a signed int into its sign bit. */
    for (i = 0, j = 0; i < 16; i++)
    {
        if (lp -> host_map & ((ULONG) 1 << i))
        {
            ptr1 = Univ_ulong_to_str (i + 1, ptr1, 10);
            * ptr1 = L',';
            ptr1 ++;
            j ++;
        }
    }

    /* List the hosts found in the high 16 bits of the map; k counts them. */
    for (i = 16, k = 0; i < 32; i++)
    {
        if (lp -> host_map & ((ULONG) 1 << i))
        {
            ptr2 = Univ_ulong_to_str (i + 1, ptr2, 10);
            * ptr2 = L',';
            ptr2 ++;
            k ++;
        }
    }

    /* Drop the trailing comma of the overall list only: if the second list is
       non-empty its last comma is removed and the first list keeps its own;
       otherwise the first list's last comma is removed.  (Presumably the two
       strings are emitted back to back by the log message - verify against
       the message definitions.) */
    if (k)
    {
        ptr2 --;
    }
    else if (j)
    {
        ptr1 --;
    }

    * ptr1 = 0;
    * ptr2 = 0;

    * host_map = lp->host_map;

    Univ_ulong_to_str ((* (lp->params)) . host_priority, num, 10);  /* v2.1 */

    if (lp->send_msg.state != HST_NORMAL)
    {
        UNIV_PRINT (("current host map is %08x and converging", lp->host_map));

        if (internal)   /* 1.03 */
        {
            LOG_MSGS3 (MSG_INFO_CONVERGING, num, buf1, buf2);
        }

        return IOCTL_CVY_CONVERGING;
    }
    /* if this host has the bins for the default rule, it is the default host (v2.1) */
    else if (lp->pg_state[(* (lp->params)).num_rules].cmap != 0)
    {
        UNIV_PRINT (("current host map is %08x and converged as DEFAULT", lp->host_map));

        if (internal)   /* 1.03 */
        {
            LOG_MSGS3(MSG_INFO_MASTER, num, buf1, buf2);
        }

        return IOCTL_CVY_MASTER;
    }
    else
    {
        UNIV_PRINT (("current host map is %08x and converged (NON-DEFAULT)", lp->host_map));

        if (internal)   /* 1.03 */
        {
            LOG_MSGS3(MSG_INFO_SLAVE, num, buf1, buf2);
        }

        return IOCTL_CVY_SLAVE;
    }
} /* end Load_hosts_query */
/*
 * Function: Load_query_packet_filter
 * Description: Answers a "would this packet be accepted?" query for the given
 *              connection tuple without modifying any load module state.  The
 *              decision and the supporting hashing and descriptor details are
 *              written into pQuery->Results.
 * Parameters: pQuery - query buffer whose Results member is filled in.
 *             lp - load module context.
 *             svr_ipaddr, svr_port - server side of the connection tuple.
 *             client_ipaddr, client_port - client side of the connection tuple.
 *             protocol - protocol of the packet.
 *             limit_map_fn - if TRUE, MAP_FN_PARAMETER is used as the second
 *                            hash argument instead of server information.
 * Returns: VOID - the verdict is stored in pQuery->Results.Accept.
 * Author: shouse, 5.18.01
 * Notes: HashInfo is filled in only once the port rule has been found and is
 *        not disabled; DescriptorInfo only when a matching descriptor exists.
 */
VOID Load_query_packet_filter (
    PIOCTL_QUERY_STATE_PACKET_FILTER pQuery,
    PLOAD_CTXT      lp,
    ULONG           svr_ipaddr,
    ULONG           svr_port,
    ULONG           client_ipaddr,
    ULONG           client_port,
    USHORT          protocol,
    BOOLEAN         limit_map_fn)
{
    PBIN_STATE      rule_state;     /* Port rule state covering this packet. */
    PCONN_ENTRY     hit_entry;      /* Matching connection entry, if any. */
    PCONN_DESCR     dscr;           /* Cursor over the overflow descriptor queue. */
    QUEUE *         chain;          /* Overflow descriptor queue for the hash index. */
    ULONG           hash_val;       /* Map() result, later reduced to the hash index. */
    ULONG           bin;            /* Bin ("bucket") the connection maps to. */
    MAP_T           bin_mask;       /* Single-bit map selecting 'bin'. */
    BOOLEAN         owns_bin;       /* TRUE => 'bin' is set in our current map. */

    /* Port rules only distinguish TCP from UDP, so for the rule lookup every
       protocol that is not TCP is treated like UDP. */
    BOOLEAN         is_tcp_pkt = (protocol == TCPIP_PROTOCOL_TCP);

    /* Session semantics (descriptor tracking) apply to TCP by default, and
       additionally to IPSec when IPSec session support is enabled, even
       though IPSec is looked up like UDP. */
    BOOLEAN         is_session_pkt = is_tcp_pkt;

    if (NLB_IPSEC_SESSION_SUPPORT_ENABLED() && (protocol == TCPIP_PROTOCOL_IPSEC1))
        is_session_pkt = TRUE;

    UNIV_ASSERT(lp->code == CVY_LOADCODE);

    /* An inactive load module rejects everything. */
    if (!lp->active)
    {
        pQuery->Results.Accept = NLB_REJECT_LOAD_MODULE_INACTIVE;
        return;
    }

    /* Locate the port rule for this server IP address / port pair. */
    rule_state = Load_pg_lookup(lp, svr_ipaddr, svr_port, is_tcp_pkt);

    UNIV_ASSERT ((is_tcp_pkt && rule_state->prot != CVY_UDP) || (!is_tcp_pkt && rule_state->prot != CVY_TCP));

    /* A rule in CVY_NEVER mode drops every packet that matches it. */
    if (rule_state->mode == CVY_NEVER)
    {
        pQuery->Results.Accept = NLB_REJECT_PORT_RULE_DISABLED;
        return;
    }

    /* Hash the client identification.  The first argument depends on the
       affinity of the rule; the second is MAP_FN_PARAMETER when the map
       function is limited (e.g. for BDA teaming), which keeps the result
       independent of the server identity, and server information otherwise. */
    if (rule_state->affinity == CVY_AFFINITY_NONE)
    {
        hash_val = limit_map_fn ? Map(client_ipaddr, MAP_FN_PARAMETER)
                                : Map(client_ipaddr, ((svr_port << 16) + client_port));
    }
    else if (rule_state->affinity == CVY_AFFINITY_SINGLE)
    {
        hash_val = limit_map_fn ? Map(client_ipaddr, MAP_FN_PARAMETER)
                                : Map(client_ipaddr, svr_ipaddr);
    }
    else
    {
        hash_val = limit_map_fn ? Map(client_ipaddr & TCPIP_CLASSC_MASK, MAP_FN_PARAMETER)
                                : Map(client_ipaddr & TCPIP_CLASSC_MASK, svr_ipaddr);
    }

    /* Reduce the hash to a bin and build the matching map bit. */
    bin = hash_val % CVY_MAXBINS;
    bin_mask = ((MAP_T) 1) << bin;

    /* Tell the requester which bin was selected, what the ownership and idle
       maps look like, and how many connections are active on the bin. */
    pQuery->Results.HashInfo.Valid = TRUE;
    pQuery->Results.HashInfo.Bin = bin;
    pQuery->Results.HashInfo.CurrentMap = rule_state->cmap;
    pQuery->Results.HashInfo.AllIdleMap = rule_state->all_idle_map;
    pQuery->Results.HashInfo.ActiveConnections = rule_state->nconn[bin];

    owns_bin = ((rule_state->cmap & bin_mask) != 0);

    /* We own the bin and either the protocol has no session semantics, or all
       other hosts are idle on the bin and we have no dirty connections
       waiting: accept without consulting the descriptors. */
    if (owns_bin &&
        (!is_session_pkt || (((rule_state->all_idle_map & bin_mask) != 0) && (!(lp->cln_waiting)))))
    {
        pQuery->Results.Accept = NLB_ACCEPT_UNCONDITIONAL_OWNERSHIP;
        return;
    }

    /* The descriptors are only worth searching if the bin has active
       connections, or if we own the bin and it has dirty connections. */
    if (!(rule_state->nconn[bin] > 0 || (lp->cln_waiting && lp->dirty_bin[bin] && owns_bin)))
    {
        pQuery->Results.Accept = NLB_REJECT_OWNED_ELSEWHERE;
        return;
    }

    /* Reduce the hash to an index into the descriptor hash table. */
    hash_val = hash_val % CVY_MAX_CHASH;

    /* First level: the hash table slot itself. */
    hit_entry = &(lp->hashed_conn[hash_val]);

    if (!CVY_CONN_MATCH(hit_entry, svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
    {
        /* Second level: walk the queue of descriptors for this index. */
        hit_entry = NULL;
        chain = &(lp->connq[hash_val]);

        for (dscr = (PCONN_DESCR)Queue_front(chain); dscr != NULL; dscr = (PCONN_DESCR)Queue_next(chain, &(dscr->link)))
        {
            if (CVY_CONN_MATCH(&(dscr->entry), svr_ipaddr, svr_port, client_ipaddr, client_port, protocol))
            {
                hit_entry = &(dscr->entry);
                break;
            }
        }
    }

    /* No descriptor for this tuple: we do not own the bin outright and have no
       existing connection (session) state that lets us service the packet. */
    if (hit_entry == NULL)
    {
        pQuery->Results.Accept = NLB_REJECT_OWNED_ELSEWHERE;
        return;
    }

    /* Report what we know about the matching descriptor. */
    pQuery->Results.DescriptorInfo.Valid = TRUE;
    pQuery->Results.DescriptorInfo.Alloc = hit_entry->alloc;
    pQuery->Results.DescriptorInfo.Dirty = hit_entry->dirty;
    pQuery->Results.DescriptorInfo.FinCount = hit_entry->fin_count;

    /* A dirty connection is refused because TCP may hold stale information
       for it; a clean one belongs to a connection serviced on this host. */
    if (hit_entry->dirty)
    {
        pQuery->Results.Accept = NLB_REJECT_CONNECTION_DIRTY;
    }
    else
    {
        pQuery->Results.Accept = NLB_ACCEPT_FOUND_MATCHING_DESCRIPTOR;
    }

    return;
}
#if defined (SBH)
/*
 * Function: Load_packet_filter
 * Description: Skeleton of a single entry point that dispatches on
 *              conn_status; compiled only when SBH is defined.  The body is
 *              built entirely from the BIN_LOOKUP, HASH, CREATE_DSCR,
 *              REMOVE_DSCR, CHECK_HASH and SEARCH_QUEUE macros.
 * Parameters: lp - load module context.
 *             svr_ipaddr, svr_port - server side of the connection tuple.
 *             client_ipaddr, client_port - client side of the connection tuple.
 *             protocol - protocol of the packet.
 *             conn_status - CVY_CONN_CREATE, CVY_CONN_UP, CVY_CONN_DOWN,
 *                           CVY_CONN_RESET or CVY_CONN_DATA.
 *             limit_map_fn - not referenced directly in this body.
 * Returns: BOOLEAN
 * Author: shouse, 5.18.01
 * Notes: NOTE(review): no return statement appears in this body and the
 *        switch has no default case; presumably the macros supply the return
 *        paths - confirm against their definitions before enabling SBH.
 */
BOOLEAN Load_packet_filter (
PLOAD_CTXT lp,
ULONG svr_ipaddr,
ULONG svr_port,
ULONG client_ipaddr,
ULONG client_port,
USHORT protocol,
ULONG conn_status,
BOOLEAN limit_map_fn)
{
BIN_LOOKUP();
HASH();
switch (conn_status) {
/* CVY_CONN_CREATE and CVY_CONN_UP both create a descriptor */
case CVY_CONN_CREATE:
CREATE_DSCR();
break;
case CVY_CONN_UP:
CREATE_DSCR();
break;
/* CVY_CONN_DOWN and CVY_CONN_RESET both remove the descriptor */
case CVY_CONN_DOWN:
case CVY_CONN_RESET:
REMOVE_DSCR();
break;
case CVY_CONN_DATA:
// protocol dependent.
CHECK_HASH();
SEARCH_QUEUE();
break;
}
}
#endif