Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

2816 lines
99 KiB

// -*- mode: C++; tab-width: 4; indent-tabs-mode: nil -*- (for GNU Emacs)
//
// Copyright (c) 1985-2000 Microsoft Corporation
//
// This file is part of the Microsoft Research IPv6 Network Protocol Stack.
// You should have received a copy of the Microsoft End-User License Agreement
// for this software along with this release; see the file "license.txt".
// If not, please see http://www.research.microsoft.com/msripv6/license.htm,
// or write to Microsoft Research, One Microsoft Way, Redmond, WA 98052-6399.
//
// Abstract:
//
// TCP send code.
//
// This file contains the code for sending Data and Control segments.
//
#include "oscfg.h"
#include "ndis.h"
#include "ip6imp.h"
#include "ip6def.h"
#include "tdi.h"
#include "tdint.h"
#include "tdistat.h"
#include "queue.h"
#include "transprt.h"
#include "addr.h"
#include "tcp.h"
#include "tcb.h"
#include "tcpconn.h"
#include "tcpsend.h"
#include "tcprcv.h"
#include "info.h"
#include "tcpcfg.h"
#include "route.h"
#include "security.h"
void *TCPProtInfo; // TCP protocol info for IP.
SLIST_HEADER TCPSendReqFree; // Send req. free list.
KSPIN_LOCK TCPSendReqFreeLock;
KSPIN_LOCK TCPSendReqCompleteLock;
uint NumTCPSendReq; // Current number of SendReqs in system.
uint MaxSendReq = 0xffffffff; // Maximum allowed number of SendReqs.
extern KSPIN_LOCK TCBTableLock;
//
// All of the init code can be discarded.
//
#ifdef ALLOC_PRAGMA
#pragma alloc_text(INIT, InitTCPSend)
#endif // ALLOC_PRAGMA
extern void ResetSendNext(TCB *SeqTCB, SeqNum NewSeq);
#define MIN_INITIAL_RTT 3 // In msec.
//* FreeSendReq - Free a send request structure.
//
// Called to free a send request structure.
//
void // Returns: Nothing.
FreeSendReq(
TCPSendReq *FreedReq) // Connection request structure to be freed.
{
PSLIST_ENTRY BufferLink;
CHECK_STRUCT(FreedReq, tsr);
BufferLink = CONTAINING_RECORD(&(FreedReq->tsr_req.tr_q.q_next),
SLIST_ENTRY, Next);
ExInterlockedPushEntrySList(&TCPSendReqFree, BufferLink,
&TCPSendReqFreeLock);
}
//* GetSendReq - Get a send request structure.
//
// Called to get a send request structure.
//
TCPSendReq * // Returns: Pointer to SendReq structure, or NULL if none.
GetSendReq(
void) // Nothing.
{
TCPSendReq *Temp;
PSLIST_ENTRY BufferLink;
Queue *QueuePtr;
TCPReq *ReqPtr;
BufferLink = ExInterlockedPopEntrySList(&TCPSendReqFree,
&TCPSendReqFreeLock);
if (BufferLink != NULL) {
QueuePtr = CONTAINING_RECORD(BufferLink, Queue, q_next);
ReqPtr = CONTAINING_RECORD(QueuePtr, TCPReq, tr_q);
Temp = CONTAINING_RECORD(ReqPtr, TCPSendReq, tsr_req);
CHECK_STRUCT(Temp, tsr);
} else {
if (NumTCPSendReq < MaxSendReq)
Temp = ExAllocatePool(NonPagedPool, sizeof(TCPSendReq));
else
Temp = NULL;
if (Temp != NULL) {
ExInterlockedAddUlong((PULONG)&NumTCPSendReq, 1, &TCPSendReqFreeLock);
#if DBG
Temp->tsr_req.tr_sig = tr_signature;
Temp->tsr_sig = tsr_signature;
#endif
}
}
return Temp;
}
//* TCPHopLimit
//
// Given a TCB, returns the Hop Limit to use in a sent packet.
// Assumes the caller holds a lock on the TCB.
//
uchar
TCPHopLimit(TCB *Tcb)
{
if (Tcb->tcb_hops != -1)
return (uchar) Tcb->tcb_hops;
else
return (uchar) Tcb->tcb_rce->NCE->IF->CurHopLimit;
}
//* TCPSendComplete - Complete a TCP send.
//
// Called by IP when a send we've made is complete. We free the buffer,
// and possibly complete some sends. Each send queued on a TCB has a ref.
// count with it, which is the number of times a pointer to a buffer
// associated with the send has been passed to the underlying IP layer. We
// can't complete a send until that count it 0. If this send was actually
// from a send of data, we'll go down the chain of send and decrement the
// refcount on each one. If we have one going to 0 and the send has already
// been acked we'll complete the send. If it hasn't been acked we'll leave
// it until the ack comes in.
//
// NOTE: We aren't protecting any of this with locks. When we port this to
// NT we'll need to fix this, probably with a global lock. See the comments
// in ACKSend() in TCPRCV.C for more details.
//
void // Returns: Nothing.
TCPSendComplete(
PNDIS_PACKET Packet, // Packet that was sent.
IP_STATUS Status)
{
PNDIS_BUFFER BufferChain;
SendCmpltContext *SCContext;
PVOID Memory;
UINT Unused;
UNREFERENCED_PARAMETER(Status);
//
// Pull values we care about out of the packet structure.
//
SCContext = (SendCmpltContext *) PC(Packet)->CompletionData;
BufferChain = NdisFirstBuffer(Packet);
NdisQueryBufferSafe(BufferChain, &Memory, &Unused, LowPagePriority);
ASSERT(Memory != NULL);
//
// See if we have a send complete context. It will be present for data
// packets and means we have extra work to do. For non-data packets, we
// can just skip all this as there is only the header buffer to deal with.
//
if (SCContext != NULL) {
KIRQL OldIrql;
PNDIS_BUFFER CurrentBuffer;
TCPSendReq *CurrentSend;
uint i;
CHECK_STRUCT(SCContext, scc);
//
// First buffer in chain is the TCP header buffer.
// Skip over it for now.
//
CurrentBuffer = NDIS_BUFFER_LINKAGE(BufferChain);
//
// Also skip over any 'user' buffers (those loaned out to us
// instead of copied) as we don't need to free them.
//
for (i = 0; i < (uint)SCContext->scc_ubufcount; i++) {
ASSERT(CurrentBuffer != NULL);
CurrentBuffer = NDIS_BUFFER_LINKAGE(CurrentBuffer);
}
//
// Now loop through and free our (aka 'transport') buffers.
// We need to do this before decrementing the reference count to avoid
// destroying the buffer chain if we have to zap tsr_lastbuf->Next to
// NULL.
//
for (i = 0; i < (uint)SCContext->scc_tbufcount; i++) {
PNDIS_BUFFER TempBuffer;
ASSERT(CurrentBuffer != NULL);
TempBuffer = CurrentBuffer;
CurrentBuffer = NDIS_BUFFER_LINKAGE(CurrentBuffer);
NdisFreeBuffer(TempBuffer);
}
//
// Loop through the send requests attached to this packet,
// reducing the reference count on each and enqueing them for
// completion where appropriate.
//
CurrentSend = SCContext->scc_firstsend;
for (i = 0; i< SCContext->scc_count; i++) {
Queue *TempQ;
long Result;
TempQ = QNEXT(&CurrentSend->tsr_req.tr_q);
CHECK_STRUCT(CurrentSend, tsr);
Result = InterlockedDecrement(&(CurrentSend->tsr_refcnt));
ASSERT(Result >= 0);
if (Result <= 0) {
//
// Reference count has gone to 0 which means the send has
// been ACK'd or cancelled. Complete it now.
//
// If we've sent directly from this send, NULL out the next
// pointer for the last buffer in the chain.
//
if (CurrentSend->tsr_lastbuf != NULL) {
NDIS_BUFFER_LINKAGE(CurrentSend->tsr_lastbuf) = NULL;
CurrentSend->tsr_lastbuf = NULL;
}
KeAcquireSpinLock(&RequestCompleteLock, &OldIrql);
ENQUEUE(&SendCompleteQ, &CurrentSend->tsr_req.tr_q);
RequestCompleteFlags |= SEND_REQUEST_COMPLETE;
KeReleaseSpinLock(&RequestCompleteLock, OldIrql);
}
CurrentSend = CONTAINING_RECORD(QSTRUCT(TCPReq, TempQ, tr_q),
TCPSendReq, tsr_req);
}
}
//
// Free the TCP header buffer and our packet structure proper.
//
NdisFreeBuffer(BufferChain);
ExFreePool(Memory);
NdisFreePacket(Packet);
//
// If there are any TCP send requests to complete, do so now.
//
if (RequestCompleteFlags & SEND_REQUEST_COMPLETE)
TCPRcvComplete();
}
//* RcvWin - Figure out the receive window to offer in an ack.
//
// A routine to figure out what window to offer on a connection. We
// take into account SWS avoidance, what the default connection window is,
// and what the last window we offered is.
//
uint // Returns: Window to be offered.
RcvWin(
TCB *WinTCB) // TCB on which to perform calculations.
{
int CouldOffer; // The window size we could offer.
CHECK_STRUCT(WinTCB, tcb);
CheckPacketList(WinTCB->tcb_pendhead, WinTCB->tcb_pendingcnt);
ASSERT(WinTCB->tcb_rcvwin >= 0);
CouldOffer = WinTCB->tcb_defaultwin - WinTCB->tcb_pendingcnt;
ASSERT(CouldOffer >= 0);
ASSERT(CouldOffer >= WinTCB->tcb_rcvwin);
if ((CouldOffer - WinTCB->tcb_rcvwin) >=
(int) MIN(WinTCB->tcb_defaultwin/2, WinTCB->tcb_mss)) {
WinTCB->tcb_rcvwin = CouldOffer;
}
return WinTCB->tcb_rcvwin;
}
//* ValidateSourceAndRoute - Validate the NTE and RCE.
//
// Checks that the NTE and RCE referenced by this TCB are still ok to use.
//
BOOLEAN
ValidateSourceAndRoute(
TCB *Tcb) // TCB being validated.
{
KIRQL Irql0;
//
// Update our copy of the validation counter.
// We need to do this before making the validation checks below
// (to avoid missing any additional changes while we're in here).
//
Tcb->tcb_routing = RouteCacheValidationCounter;
//
// Check that our NTE hasn't gone away.
//
KeAcquireSpinLock(&Tcb->tcb_nte->IF->Lock, &Irql0);
if (!IsValidNTE(Tcb->tcb_nte)) {
//
// Can't use this one anymore.
//
KeReleaseSpinLock(&Tcb->tcb_nte->IF->Lock, Irql0);
ReleaseNTE(Tcb->tcb_nte);
//
// See if this address lives on as a different NTE.
//
Tcb->tcb_nte = FindNetworkWithAddress(&Tcb->tcb_saddr,
Tcb->tcb_sscope_id);
if (Tcb->tcb_nte == NULL) {
//
// The address is gone.
//
return FALSE;
}
} else {
KeReleaseSpinLock(&Tcb->tcb_nte->IF->Lock, Irql0);
}
//
// Also check that the RCE is still around.
//
Tcb->tcb_rce = ValidateRCE(Tcb->tcb_rce, Tcb->tcb_nte);
return TRUE;
}
//* SendSYN - Send a SYN segment.
//
// This is called during connection establishment time to send a SYN
// segment to the peer. We get a buffer if we can, and then fill
// it in. There's a tricky part here where we have to build the MSS
// option in the header - we find the MSS by finding the MSS offered
// by the net for the local address. After that, we send it.
//
void // Returns: Nothing.
SendSYN(
TCB *SYNTcb, // TCB from which SYN is to be sent.
KIRQL PreLockIrql) // IRQL prior to acquiring TCB lock.
{
PNDIS_PACKET Packet;
void *Memory;
IPv6Header UNALIGNED *IP;
TCPHeader UNALIGNED *TCP;
uchar *OptPtr;
NDIS_STATUS NdisStatus;
uint Offset;
uint Length;
uint PayloadLength;
ushort TempWin;
ushort MSS;
RouteCacheEntry *RCE;
CHECK_STRUCT(SYNTcb, tcb);
//
// Go ahead and set the retransmission timer now, in case we can't get a
// packet or a buffer. In the future we might want to queue the
// connection for when we get resources.
//
START_TCB_TIMER(SYNTcb->tcb_rexmittimer, SYNTcb->tcb_rexmit);
//
// In most cases, we will already have a route at this point.
// However, if we failed to get one earlier in the passive receive
// path, we may need to retry here.
//
if (SYNTcb->tcb_rce == NULL) {
InitRCE(SYNTcb);
if (SYNTcb->tcb_rce == NULL) {
goto ErrorReturn;
}
}
//
// Validate that the address we're sourcing from and the route we're
// sending upon are still okay to use.
//
if (SYNTcb->tcb_routing != RouteCacheValidationCounter) {
if (!ValidateSourceAndRoute(SYNTcb)) {
//
// Even though we're about to close this TCB,
// we should leave it in a consistent state.
//
SYNTcb->tcb_sendnext++;
if (SEQ_GT(SYNTcb->tcb_sendnext, SYNTcb->tcb_sendmax)) {
SYNTcb->tcb_sendmax = SYNTcb->tcb_sendnext;
}
TryToCloseTCB(SYNTcb, TCB_CLOSE_ABORTED, PreLockIrql);
return;
}
}
//
// Allocate a packet header/buffer/data region for this SYN.
//
// Our buffer has space at the beginning which will be filled in
// later by the link level. At this level we add the IPv6Header,
// TCPHeader, and TCP Maximum Segment Size option which follow.
//
// REVIEW: This grabs packets and buffers from the IPv6PacketPool and
// REVIEW: the IPv6BufferPool respectively. Have seperate pools for TCP?
//
Offset = SYNTcb->tcb_rce->NCE->IF->LinkHeaderSize;
Length = Offset + sizeof(*IP) + sizeof(*TCP) + MSS_OPT_SIZE;
NdisStatus = IPv6AllocatePacket(Length, &Packet, &Memory);
if (NdisStatus != NDIS_STATUS_SUCCESS) {
//
// Upon failure, advance tcb_sendnext anyway.
// We need to do this because TCBTimeout will *retreat* tcb_sendnext
// if this SYN is later retransmitted, and if that retreat occurs
// without this advance, we end up with a hole in the sequence-space.
//
KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
"TCP SendSYN: Couldn't allocate IPv6 packet header!?!\n"));
ErrorReturn:
SYNTcb->tcb_sendnext++;
if (SEQ_GT(SYNTcb->tcb_sendnext, SYNTcb->tcb_sendmax)) {
SYNTcb->tcb_sendmax = SYNTcb->tcb_sendnext;
}
KeReleaseSpinLock(&SYNTcb->tcb_lock, PreLockIrql);
return;
}
PC(Packet)->CompletionHandler = TCPSendComplete;
PC(Packet)->CompletionData = NULL;
//
// Since this is a SYN-only packet (maybe someday we'll send data with
// the SYN?) we only have the one buffer and nothing to link on after.
//
//
// We now have all the resources we need to send.
// Prepare the actual packet.
//
//
// Our header buffer has extra space for other headers to be
// prepended to ours without requiring further allocation calls.
// Put the actual TCP/IP header at the end of the buffer.
//
IP = (IPv6Header UNALIGNED *)((uchar *)Memory + Offset);
IP->VersClassFlow = IP_VERSION;
IP->NextHeader = IP_PROTOCOL_TCP;
IP->HopLimit = TCPHopLimit(SYNTcb);
IP->Source = SYNTcb->tcb_saddr;
IP->Dest = SYNTcb->tcb_daddr;
TCP = (TCPHeader UNALIGNED *)(IP + 1);
TCP->tcp_src = SYNTcb->tcb_sport;
TCP->tcp_dest = SYNTcb->tcb_dport;
TCP->tcp_seq = net_long(SYNTcb->tcb_sendnext);
//
// The SYN flag takes up one element in sequence number space.
// Record that we've sent it here (if we need to retransmit the SYN
// segment, TCBTimeout will reset sendnext before calling us again).
//
SYNTcb->tcb_sendnext++;
if (SEQ_GT(SYNTcb->tcb_sendnext, SYNTcb->tcb_sendmax)) {
TStats.ts_outsegs++;
SYNTcb->tcb_sendmax = SYNTcb->tcb_sendnext;
} else
TStats.ts_retranssegs++;
TCP->tcp_ack = net_long(SYNTcb->tcb_rcvnext);
//
// REVIEW: TCP flags are entirely based upon our state, so this could
// REVIEW: be replaced by a (quicker) array lookup.
//
if (SYNTcb->tcb_state == TCB_SYN_RCVD)
TCP->tcp_flags = MAKE_TCP_FLAGS(6, TCP_FLAG_SYN | TCP_FLAG_ACK);
else
TCP->tcp_flags = MAKE_TCP_FLAGS(6, TCP_FLAG_SYN);
TempWin = (ushort)SYNTcb->tcb_rcvwin;
TCP->tcp_window = net_short(TempWin);
TCP->tcp_urgent = 0;
TCP->tcp_xsum = 0;
OptPtr = (uchar *)(TCP + 1);
//
// Compose the Maximum Segment Size option.
//
// TBD: If we add IPv6 Jumbogram support, we should also add LFN
// TBD: support to TCP and change this to handle a larger MSS.
//
MSS = SYNTcb->tcb_rce->NTE->IF->LinkMTU
- sizeof(IPv6Header) - sizeof(TCPHeader);
IF_TCPDBG(TCP_DEBUG_MSS) {
KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_INFO_TCPDBG,
"SendSYN: Sending MSS option value of %d\n", MSS));
}
*OptPtr++ = TCP_OPT_MSS;
*OptPtr++ = MSS_OPT_SIZE;
*(ushort UNALIGNED *)OptPtr = net_short(MSS);
PayloadLength = sizeof(TCPHeader) + MSS_OPT_SIZE;
//
// Compute the TCP checksum. It covers the entire TCP segment
// starting with the TCP header, plus the IPv6 pseudo-header.
//
// REVIEW: The IPv4 implementation kept the IPv4 psuedo-header around
// REVIEW: in the TCB rather than recalculate it every time. Do this?
//
TCP->tcp_xsum = 0;
TCP->tcp_xsum = ChecksumPacket(
Packet, Offset + sizeof *IP, NULL, PayloadLength,
AlignAddr(&IP->Source), AlignAddr(&IP->Dest), IP_PROTOCOL_TCP);
ASSERT(TCP->tcp_xsum != 0);
//
// Capture and reference the RCE while we still hold the TCB lock.
// The TCB's reference on this particular RCE might go away at any point
// after we release the lock (or because we drop it ourselves below).
//
RCE = SYNTcb->tcb_rce;
AddRefRCE(RCE);
//
// If connection-acceptance has been delayed, release the TCB's RCE.
// This prevents TCBs in pre-established states from consuming
// an unbounded number of RCEs.
//
if (SYNTcb->tcb_flags & ACCEPT_PENDING) {
SYNTcb->tcb_rce = NULL;
ReleaseRCE(RCE);
}
//
// Everything's ready. Now send the packet.
//
// Note that IPv6Send does not return a status code.
// Instead it *always* completes the packet
// with an appropriate status code.
//
KeReleaseSpinLock(&SYNTcb->tcb_lock, PreLockIrql);
IPv6Send(Packet, Offset, IP, PayloadLength, RCE, 0,
IP_PROTOCOL_TCP,
net_short(TCP->tcp_src),
net_short(TCP->tcp_dest));
//
// Release the extra reference we took on the RCE above.
//
ReleaseRCE(RCE);
}
//* SendKA - Send a keep alive segment.
//
// This is called when we want to send a keep-alive. The idea is to provoke
// a response from our peer on an otherwise idle connection. We send a
// garbage byte of data in our keep-alives in order to cooperate with broken
// TCP implementations that don't respond to segments outside the window
// unless they contain data.
//
void // Returns: Nothing.
SendKA(
TCB *KATcb, // TCB from which keep alive is to be sent.
KIRQL PreLockIrql) // IRQL prior to acquiring lock on TCB.
{
PNDIS_PACKET Packet;
void *Memory;
IPv6Header UNALIGNED *IP;
TCPHeader UNALIGNED *TCP;
NDIS_STATUS NdisStatus;
int Offset;
uint Length;
uint PayloadLength;
ushort TempWin;
SeqNum TempSeq;
RouteCacheEntry *RCE;
CHECK_STRUCT(KATcb, tcb);
//
// In most cases, we will already have a route at this point.
// However, if we failed to get one earlier in the passive receive
// path, we may need to retry here.
//
if (KATcb->tcb_rce == NULL) {
InitRCE(KATcb);
if (KATcb->tcb_rce == NULL) {
KeReleaseSpinLock(&KATcb->tcb_lock, PreLockIrql);
return;
}
}
//
// Validate that the address we're sourcing from and the route we're
// sending upon are still okay to use.
//
if (KATcb->tcb_routing != RouteCacheValidationCounter) {
if (!ValidateSourceAndRoute(KATcb)) {
TryToCloseTCB(KATcb, TCB_CLOSE_ABORTED, PreLockIrql);
return;
}
}
//
// Allocate a packet header/buffer/data region for this keepalive packet.
//
// Our buffer has space at the beginning which will be filled in
// later by the link level. At this level we add the IPv6Header,
// TCPHeader, and a single byte of data which follow.
//
// REVIEW: This grabs packets and buffers from the IPv6PacketPool and
// REVIEW: the IPv6BufferPool respectively. Have seperate pools for TCP?
//
Offset = KATcb->tcb_rce->NCE->IF->LinkHeaderSize;
Length = Offset + sizeof(*IP) + sizeof(*TCP) + 1;
NdisStatus = IPv6AllocatePacket(Length, &Packet, &Memory);
if (NdisStatus != NDIS_STATUS_SUCCESS) {
//
// REVIEW: What to do if this fails.
//
KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
"TCP SendKA: Couldn't allocate IPv6 packet header!?!\n"));
KeReleaseSpinLock(&KATcb->tcb_lock, PreLockIrql);
return;
}
PC(Packet)->CompletionHandler = TCPSendComplete;
PC(Packet)->CompletionData = NULL;
//
// Since this is a keepalive packet we only have the one buffer and
// nothing to link on after.
//
//
// Our header buffer has extra space for other headers to be
// prepended to ours without requiring further allocation calls.
// Put the actual TCP/IP header at the end of the buffer.
//
IP = (IPv6Header UNALIGNED *)((uchar *)Memory + Offset);
IP->VersClassFlow = IP_VERSION;
IP->NextHeader = IP_PROTOCOL_TCP;
IP->HopLimit = TCPHopLimit(KATcb);
IP->Source = KATcb->tcb_saddr;
IP->Dest = KATcb->tcb_daddr;
TCP = (TCPHeader UNALIGNED *)(IP + 1);
TCP->tcp_src = KATcb->tcb_sport;
TCP->tcp_dest = KATcb->tcb_dport;
TempSeq = KATcb->tcb_senduna - 1;
TCP->tcp_seq = net_long(TempSeq);
TCP->tcp_ack = net_long(KATcb->tcb_rcvnext);
TCP->tcp_flags = MAKE_TCP_FLAGS(5, TCP_FLAG_ACK);
TempWin = (ushort)RcvWin(KATcb);
TCP->tcp_window = net_short(TempWin);
TCP->tcp_urgent = 0;
//
// Initialize the single byte that we're resending.
// N.B. Adequate space for this byte was allocated above.
//
*(uchar *)(TCP + 1) = 0;
TStats.ts_retranssegs++;
PayloadLength = sizeof(TCPHeader) + 1;
//
// Compute the TCP checksum. It covers the entire TCP segment
// starting with the TCP header, plus the IPv6 pseudo-header.
//
TCP->tcp_xsum = 0;
TCP->tcp_xsum = ChecksumPacket(
Packet, Offset + sizeof *IP, NULL, PayloadLength,
AlignAddr(&IP->Source), AlignAddr(&IP->Dest), IP_PROTOCOL_TCP);
ASSERT(TCP->tcp_xsum != 0);
//
// Capture and reference the RCE while we still hold the TCB lock.
// The TCB's reference on this particular RCE might go away at any
// point after we release the lock.
//
RCE = KATcb->tcb_rce;
AddRefRCE(RCE);
//
// Everything's ready. Now send the packet.
//
// Note that IPv6Send does not return a status code.
// Instead it *always* completes the packet
// with an appropriate status code.
//
KATcb->tcb_kacount++;
KeReleaseSpinLock(&KATcb->tcb_lock, PreLockIrql);
IPv6Send(Packet, Offset, IP, PayloadLength, RCE, 0,
IP_PROTOCOL_TCP,
net_short(TCP->tcp_src),
net_short(TCP->tcp_dest));
//
// Release the extra reference we took on the RCE above.
//
ReleaseRCE(RCE);
}
//* SendACK - Send an ACK segment.
//
// This is called whenever we need to send an ACK for some reason. Nothing
// fancy, we just do it.
//
void // Returns: Nothing.
SendACK(
TCB *ACKTcb) // TCB from which ACK is to be sent.
{
PNDIS_PACKET Packet;
void *Memory;
IPv6Header UNALIGNED *IP;
TCPHeader UNALIGNED *TCP;
NDIS_STATUS NdisStatus;
KIRQL OldIrql;
int Offset;
uint Length;
uint PayloadLength;
SeqNum SendNext;
ushort TempWin;
RouteCacheEntry *RCE;
CHECK_STRUCT(ACKTcb, tcb);
KeAcquireSpinLock(&ACKTcb->tcb_lock, &OldIrql);
//
// In most cases, we will already have a route at this point.
// However, if we failed to get one earlier in the passive receive
// path, we may need to retry here.
//
if (ACKTcb->tcb_rce == NULL) {
InitRCE(ACKTcb);
if (ACKTcb->tcb_rce == NULL) {
KeReleaseSpinLock(&ACKTcb->tcb_lock, OldIrql);
return;
}
}
//
// Validate that the address we're sourcing from and the route we're
// sending upon are still okay to use.
//
if (ACKTcb->tcb_routing != RouteCacheValidationCounter) {
if (!ValidateSourceAndRoute(ACKTcb)) {
TryToCloseTCB(ACKTcb, TCB_CLOSE_ABORTED, OldIrql);
return;
}
}
//
// Allocate a packet header/buffer/data region for this ACK packet.
//
// Our buffer has space at the beginning which will be filled in
// later by the link level. At this level we add the IPv6Header
// and the TCPHeader.
//
// REVIEW: This grabs packets and buffers from the IPv6PacketPool and
// REVIEW: the IPv6BufferPool respectively. Have seperate pools for TCP?
//
Offset = ACKTcb->tcb_rce->NCE->IF->LinkHeaderSize;
Length = Offset + sizeof(*IP) + sizeof(*TCP);
NdisStatus = IPv6AllocatePacket(Length, &Packet, &Memory);
if (NdisStatus != NDIS_STATUS_SUCCESS) {
KeReleaseSpinLock(&ACKTcb->tcb_lock, OldIrql);
//
// REVIEW: What to do if this fails.
//
KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
"TCP SendACK: Couldn't allocate IPv6 packet header!?!\n"));
return;
}
PC(Packet)->CompletionHandler = TCPSendComplete;
PC(Packet)->CompletionData = NULL;
//
// Our header buffer has extra space for other headers to be
// prepended to ours without requiring further allocation calls.
// Put the actual TCP/IP header at the end of the buffer.
//
IP = (IPv6Header UNALIGNED *)((uchar *)Memory + Offset);
IP->VersClassFlow = IP_VERSION;
IP->NextHeader = IP_PROTOCOL_TCP;
IP->HopLimit = TCPHopLimit(ACKTcb);
IP->Source = ACKTcb->tcb_saddr;
IP->Dest = ACKTcb->tcb_daddr;
TCP = (TCPHeader UNALIGNED *)(IP + 1);
TCP->tcp_src = ACKTcb->tcb_sport;
TCP->tcp_dest = ACKTcb->tcb_dport;
TCP->tcp_ack = net_long(ACKTcb->tcb_rcvnext);
//
// If the remote peer is advertising a window of zero, we need to send
// this ack with a sequence number of his rcv_next (which in that case
// should be our senduna). We have code here ifdef'd out that makes
// sure that we don't send outside the RWE, but this doesn't work. We
// need to be able to send a pure ACK exactly at the RWE.
//
if (ACKTcb->tcb_sendwin != 0) {
SendNext = ACKTcb->tcb_sendnext;
#if 0
SeqNum MaxValidSeq;
MaxValidSeq = ACKTcb->tcb_senduna + ACKTcb->tcb_sendwin - 1;
SendNext = (SEQ_LT(SendNext, MaxValidSeq) ? SendNext : MaxValidSeq);
#endif
} else
SendNext = ACKTcb->tcb_senduna;
if ((ACKTcb->tcb_flags & FIN_SENT) &&
SEQ_EQ(SendNext, ACKTcb->tcb_sendmax - 1)) {
TCP->tcp_flags = MAKE_TCP_FLAGS(5, TCP_FLAG_FIN | TCP_FLAG_ACK);
} else
TCP->tcp_flags = MAKE_TCP_FLAGS(5, TCP_FLAG_ACK);
TCP->tcp_seq = net_long(SendNext);
TempWin = (ushort)RcvWin(ACKTcb);
TCP->tcp_window = net_short(TempWin);
TCP->tcp_urgent = 0;
PayloadLength = sizeof(*TCP);
//
// Compute the TCP checksum. It covers the entire TCP segment
// starting with the TCP header, plus the IPv6 pseudo-header.
//
TCP->tcp_xsum = 0;
TCP->tcp_xsum = ChecksumPacket(
Packet, Offset + sizeof *IP, NULL, PayloadLength,
AlignAddr(&IP->Source), AlignAddr(&IP->Dest), IP_PROTOCOL_TCP);
ASSERT(TCP->tcp_xsum != 0);
STOP_TCB_TIMER(ACKTcb->tcb_delacktimer);
ACKTcb->tcb_flags &= ~(NEED_ACK | ACK_DELAYED);
TStats.ts_outsegs++;
//
// Capture and reference the RCE while we still hold the TCB lock.
// The TCB's reference on this particular RCE might go away at any point
// after we release the lock (or because we drop it ourselves below).
//
RCE = ACKTcb->tcb_rce;
AddRefRCE(RCE);
//
// If connection-acceptance has been delayed, release the TCB's RCE.
// This prevents TCBs in pre-established states from consuming
// an unbounded number of RCEs.
//
if (ACKTcb->tcb_flags & ACCEPT_PENDING) {
ACKTcb->tcb_rce = NULL;
ReleaseRCE(RCE);
}
//
// Everything's ready. Now send the packet.
//
// Note that IPv6Send does not return a status code.
// Instead it *always* completes the packet
// with an appropriate status code.
//
KeReleaseSpinLock(&ACKTcb->tcb_lock, OldIrql);
IPv6Send(Packet, Offset, IP, PayloadLength, RCE, 0,
IP_PROTOCOL_TCP,
net_short(TCP->tcp_src),
net_short(TCP->tcp_dest));
//
// Release the extra reference we took on the RCE above.
//
ReleaseRCE(RCE);
}
//* SendRSTFromTCB - Send a RST from a TCB.
//
// This is called during close when we need to send a RST.
//
// Called only when TCB is going away, so we have exclusive access.
//
void // Returns: Nothing.
SendRSTFromTCB(
TCB *RSTTcb) // TCB from which RST is to be sent.
{
PNDIS_PACKET Packet;
void *Memory;
IPv6Header UNALIGNED *IP;
TCPHeader UNALIGNED *TCP;
NDIS_STATUS NdisStatus;
int Offset;
uint Length;
uint PayloadLength;
SeqNum RSTSeq;
CHECK_STRUCT(RSTTcb, tcb);
ASSERT(RSTTcb->tcb_state == TCB_CLOSED);
//
// In most cases, we will already have a route at this point.
// However, if we failed to get one earlier in the passive receive
// path, we may need to retry here.
//
if (RSTTcb->tcb_rce == NULL) {
InitRCE(RSTTcb);
if (RSTTcb->tcb_rce == NULL) {
return;
}
}
//
// Validate that the address we're sourcing from and the route we're
// sending upon are still okay to use.
//
if (RSTTcb->tcb_routing != RouteCacheValidationCounter) {
if (!ValidateSourceAndRoute(RSTTcb)) {
return;
}
}
//
// Allocate a packet header/buffer/data region for this RST packet.
//
// Our buffer has space at the beginning which will be filled in
// later by the link level. At this level we add the IPv6Header
// and the TCPHeader.
//
// REVIEW: This grabs packets and buffers from the IPv6PacketPool and
// REVIEW: the IPv6BufferPool respectively. Have seperate pools for TCP?
//
Offset = RSTTcb->tcb_rce->NCE->IF->LinkHeaderSize;
Length = Offset + sizeof(*IP) + sizeof(*TCP);
NdisStatus = IPv6AllocatePacket(Length, &Packet, &Memory);
if (NdisStatus != NDIS_STATUS_SUCCESS) {
//
// REVIEW: What to do if this fails.
//
KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
"TCP SendRSTFromTCB: "
"Couldn't alloc IPv6 packet header!\n"));
return;
}
PC(Packet)->CompletionHandler = TCPSendComplete;
PC(Packet)->CompletionData = NULL;
//
// Since this is an RST-only packet we only have the one buffer and
// nothing to link on after.
//
//
// Our header buffer has extra space for other headers to be
// prepended to ours without requiring further allocation calls.
// Put the actual TCP/IP header at the end of the buffer.
//
IP = (IPv6Header UNALIGNED *)((uchar *)Memory + Offset);
IP->VersClassFlow = IP_VERSION;
IP->NextHeader = IP_PROTOCOL_TCP;
IP->HopLimit = TCPHopLimit(RSTTcb);
IP->Source = RSTTcb->tcb_saddr;
IP->Dest = RSTTcb->tcb_daddr;
TCP = (TCPHeader UNALIGNED *)(IP + 1);
TCP->tcp_src = RSTTcb->tcb_sport;
TCP->tcp_dest = RSTTcb->tcb_dport;
//
// If the remote peer has a window of 0, send with a seq. # equal
// to senduna so he'll accept it. Otherwise send with send max.
//
if (RSTTcb->tcb_sendwin != 0)
RSTSeq = RSTTcb->tcb_sendmax;
else
RSTSeq = RSTTcb->tcb_senduna;
TCP->tcp_seq = net_long(RSTSeq);
TCP->tcp_ack = net_long(RSTTcb->tcb_rcvnext);
TCP->tcp_flags = MAKE_TCP_FLAGS(5, TCP_FLAG_RST | TCP_FLAG_ACK);
TCP->tcp_window = 0;
TCP->tcp_urgent = 0;
PayloadLength = sizeof(*TCP);
//
// Compute the TCP checksum. It covers the entire TCP segment
// starting with the TCP header, plus the IPv6 pseudo-header.
//
TCP->tcp_xsum = 0;
TCP->tcp_xsum = ChecksumPacket(
Packet, Offset + sizeof *IP, NULL, PayloadLength,
AlignAddr(&IP->Source), AlignAddr(&IP->Dest), IP_PROTOCOL_TCP);
ASSERT(TCP->tcp_xsum != 0);
TStats.ts_outsegs++;
TStats.ts_outrsts++;
//
// Everything's ready. Now send the packet.
//
// Note that IPv6Send does not return a status code.
// Instead it *always* completes the packet
// with an appropriate status code.
//
IPv6Send(Packet, Offset, IP, PayloadLength, RSTTcb->tcb_rce, 0,
IP_PROTOCOL_TCP,
net_short(TCP->tcp_src),
net_short(TCP->tcp_dest));
}
//* SendRSTFromHeader - Send a RST back, based on a header.
//
// Called when we need to send a RST, but don't necessarily have a TCB.
//
void // Returns: Nothing.
SendRSTFromHeader(
TCPHeader UNALIGNED *RecvTCP, // TCP header to be RST.
uint Length, // Length of the incoming segment.
IPv6Addr *Dest, // Destination IP address for RST.
uint DestScopeId, // Scope id for destination address.
IPv6Addr *Src, // Source IP address for RST.
uint SrcScopeId) // Scope id for source address.
{
PNDIS_PACKET Packet;
void *Memory;
IPv6Header UNALIGNED *IP;
TCPHeader UNALIGNED *SendTCP;
NetTableEntry *NTE;
RouteCacheEntry *RCE;
IP_STATUS Status;
NDIS_STATUS NdisStatus;
uint Offset;
uint SendLength;
uint PayloadLength;
//
// Never send a RST in response to a RST.
//
if (RecvTCP->tcp_flags & TCP_FLAG_RST)
return;
//
// Determine NTE to send on based on incoming packet's destination.
// REVIEW: Alternatively, we could/should just pass the NTE in.
//
NTE = FindNetworkWithAddress(Src, SrcScopeId);
if (NTE == NULL) {
//
// This should only happen if the NTE became invalid
// between accepting the packet and getting here. It
// cannot completely go away since the packet's Packet
// structure holds a reference to it.
//
return;
}
//
// Get the route to the destination (incoming packet's source).
//
Status = RouteToDestination(Dest, DestScopeId, CastFromNTE(NTE),
RTD_FLAG_NORMAL, &RCE);
if (Status != IP_SUCCESS) {
//
// Failed to get a route to the destination. Error out.
//
KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_INTERNAL_ERROR,
"TCP SendRSTFromHeader: Can't get a route?!?\n"));
ReleaseNTE(NTE);
return;
}
//
// Allocate a packet header/buffer/data region for this RST packet.
//
// Our buffer has space at the beginning which will be filled in
// later by the link level. At this level we add the IPv6Header
// and the TCPHeader.
//
// REVIEW: This grabs packets and buffers from the IPv6PacketPool and
// REVIEW: the IPv6BufferPool respectively. Have seperate pools for TCP?
//
Offset = RCE->NCE->IF->LinkHeaderSize;
SendLength = Offset + sizeof(*IP) + sizeof(*SendTCP);
NdisStatus = IPv6AllocatePacket(SendLength, &Packet, &Memory);
if (NdisStatus != NDIS_STATUS_SUCCESS) {
//
// Failed to allocate a packet header/buffer/data region. Error out.
//
KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
"TCP SendRSTFromHeader: Couldn't alloc IPv6 pkt header!\n"));
ReleaseRCE(RCE);
ReleaseNTE(NTE);
return;
}
PC(Packet)->CompletionHandler = TCPSendComplete;
PC(Packet)->CompletionData = NULL;
//
// We now have all the resources we need to send. Since this is a
// RST-only packet we only have the one header buffer and nothing
// to link on after.
//
//
// Our header buffer has extra space for other headers to be
// prepended to ours without requiring further allocation calls.
// Put the actual TCP/IP header at the end of the buffer.
//
IP = (IPv6Header UNALIGNED *)((uchar *)Memory + Offset);
IP->VersClassFlow = IP_VERSION;
IP->NextHeader = IP_PROTOCOL_TCP;
IP->HopLimit = (uchar)RCE->NCE->IF->CurHopLimit;
IP->Source = *Src;
IP->Dest = *Dest;
//
// Fill in the header so as to make it believable to our peer, and send it.
//
SendTCP = (TCPHeader UNALIGNED *)(IP + 1);
if (RecvTCP->tcp_flags & TCP_FLAG_SYN)
Length++;
if (RecvTCP->tcp_flags & TCP_FLAG_FIN)
Length++;
if (RecvTCP->tcp_flags & TCP_FLAG_ACK) {
SendTCP->tcp_seq = RecvTCP->tcp_ack;
SendTCP->tcp_ack = 0;
SendTCP->tcp_flags = MAKE_TCP_FLAGS(sizeof(TCPHeader)/sizeof(ulong),
TCP_FLAG_RST);
} else {
SeqNum TempSeq;
SendTCP->tcp_seq = 0;
TempSeq = net_long(RecvTCP->tcp_seq);
TempSeq += Length;
SendTCP->tcp_ack = net_long(TempSeq);
SendTCP->tcp_flags = MAKE_TCP_FLAGS(sizeof(TCPHeader)/sizeof(ulong),
TCP_FLAG_RST | TCP_FLAG_ACK);
}
SendTCP->tcp_window = 0;
SendTCP->tcp_urgent = 0;
SendTCP->tcp_dest = RecvTCP->tcp_src;
SendTCP->tcp_src = RecvTCP->tcp_dest;
PayloadLength = sizeof(*SendTCP);
//
// Compute the TCP checksum. It covers the entire TCP segment
// starting with the TCP header, plus the IPv6 pseudo-header.
//
SendTCP->tcp_xsum = 0;
SendTCP->tcp_xsum = ChecksumPacket(
Packet, Offset + sizeof *IP, NULL, PayloadLength,
AlignAddr(&IP->Source), AlignAddr(&IP->Dest), IP_PROTOCOL_TCP);
ASSERT(SendTCP->tcp_xsum != 0);
TStats.ts_outsegs++;
TStats.ts_outrsts++;
//
// Everything's ready. Now send the packet.
//
// Note that IPv6Send does not return a status code.
// Instead it *always* completes the packet
// with an appropriate status code.
//
IPv6Send(Packet, Offset, IP, PayloadLength, RCE, 0,
IP_PROTOCOL_TCP,
net_short(SendTCP->tcp_src),
net_short(SendTCP->tcp_dest));
//
// Release the Route and the NTE.
//
ReleaseRCE(RCE);
ReleaseNTE(NTE);
} // end of SendRSTFromHeader()
//* GoToEstab - Transition to the established state.
//
// Called when we are going to the established state and need to finish up
// initializing things that couldn't be done until now. We assume the TCB
// lock is held by the caller on the TCB we're called with.
//
void // Returns: Nothing.
GoToEstab(
TCB *EstabTCB) // TCB to transition.
{
//
// Initialize our slow start and congestion control variables.
//
EstabTCB->tcb_cwin = 2 * EstabTCB->tcb_mss;
EstabTCB->tcb_ssthresh = 0xffffffff;
EstabTCB->tcb_state = TCB_ESTAB;
//
// We're in established. We'll subtract one from slow count for this fact,
// and if the slowcount goes to 0 we'll move onto the fast path.
//
if (--(EstabTCB->tcb_slowcount) == 0)
EstabTCB->tcb_fastchk &= ~TCP_FLAG_SLOW;
InterlockedIncrement((PLONG)&TStats.ts_currestab);
EstabTCB->tcb_flags &= ~ACTIVE_OPEN; // Turn off the active opening flag.
}
//* InitSendState - Initialize the send state of a connection.
//
// Called during connection establishment to initialize our send state.
// (In this case, this refers to all information we'll put on the wire as
// well as pure send state). We pick an ISS, set up a rexmit timer value,
// etc. We assume the tcb_lock is held on the TCB when we are called.
//
void // Returns: Nothing.
InitSendState(
TCB *NewTCB) // TCB to be set up.
{
uint InitialRTT;
CHECK_STRUCT(NewTCB, tcb);
if (NewTCB->tcb_flags & ACTIVE_OPEN) {
GetRandomISN(&NewTCB->tcb_sendnext, (uchar*)&NewTCB->tcb_md5data);
}
NewTCB->tcb_senduna = NewTCB->tcb_sendnext;
NewTCB->tcb_sendmax = NewTCB->tcb_sendnext;
NewTCB->tcb_error = IP_SUCCESS;
//
// Initialize retransmit and delayed ack stuff.
//
NewTCB->tcb_rexmitcnt = 0;
NewTCB->tcb_rtt = 0;
NewTCB->tcb_smrtt = 0;
//
// Check for interface specific initial RTT.
// This can be as low as 3ms.
//
if ((NewTCB->tcb_rce != NULL) &&
((InitialRTT = GetInitialRTTFromRCE(NewTCB->tcb_rce)) >
MIN_INITIAL_RTT)) {
NewTCB->tcb_delta = MS_TO_TICKS(InitialRTT * 2);
NewTCB->tcb_rexmit = MS_TO_TICKS(InitialRTT);
} else {
NewTCB->tcb_delta = MS_TO_TICKS(6000);
NewTCB->tcb_rexmit = MS_TO_TICKS(3000);
}
STOP_TCB_TIMER(NewTCB->tcb_rexmittimer);
STOP_TCB_TIMER(NewTCB->tcb_delacktimer);
}
//* FillTCPHeader - Fill the TCP header in.
//
// A utility routine to fill in the TCP header.
//
void // Returns: Nothing.
FillTCPHeader(
TCB *SendTCB, // TCB to fill from.
TCPHeader UNALIGNED *Header) // Header to fill into.
{
ushort S;
ulong L;
Header->tcp_src = SendTCB->tcb_sport;
Header->tcp_dest = SendTCB->tcb_dport;
L = SendTCB->tcb_sendnext;
Header->tcp_seq = net_long(L);
L = SendTCB->tcb_rcvnext;
Header->tcp_ack = net_long(L);
Header->tcp_flags = 0x1050;
*(ulong UNALIGNED *)&Header->tcp_xsum = 0;
S = (ushort)RcvWin(SendTCB);
Header->tcp_window = net_short(S);
Header->tcp_urgent = 0;
}
//* TCPSend - Send data from a TCP connection.
//
// This is the main 'send data' routine. We go into a loop, trying
// to send data until we can't for some reason. First we compute
// the useable window, use it to figure the amount we could send. If
// the amount we could send meets certain criteria we'll build a frame
// and send it, after setting any appropriate control bits. We assume
// the caller has put a reference on the TCB.
//
void // Returns: Nothing.
TCPSend(
TCB *SendTCB, // TCB to be sent from.
KIRQL PreLockIrql) // IRQL prior to acquiring TCB lock.
{
int SendWin; // Useable send window.
uint AmountToSend; // Amount to send this time.
uint AmountLeft;
IPv6Header UNALIGNED *IP;
TCPHeader UNALIGNED *TCP;
PNDIS_PACKET Packet;
PNDIS_BUFFER FirstBuffer, CurrentBuffer;
void *Memory;
TCPSendReq *CurSend;
SendCmpltContext *SCC;
SeqNum OldSeq;
NDIS_STATUS NdisStatus;
uint AmtOutstanding, AmtUnsent;
int ForceWin; // Window we're forced to use.
uint HeaderLength;
uint LinkOffset;
uint PMTU;
RouteCacheEntry *RCE;
CHECK_STRUCT(SendTCB, tcb);
ASSERT(SendTCB->tcb_refcnt != 0);
ASSERT(*(int *)&SendTCB->tcb_sendwin >= 0);
ASSERT(*(int *)&SendTCB->tcb_cwin >= SendTCB->tcb_mss);
ASSERT(!(SendTCB->tcb_flags & FIN_OUTSTANDING) ||
(SendTCB->tcb_sendnext == SendTCB->tcb_sendmax));
//
// See if we should even be here. If another instance of ourselves is
// already in this code, or is about to enter it after completing a
// receive, then just skip on out.
//
if ((SendTCB->tcb_flags & IN_TCP_SEND) ||
(SendTCB->tcb_fastchk & TCP_FLAG_IN_RCV)) {
SendTCB->tcb_flags |= SEND_AFTER_RCV;
goto bail;
}
SendTCB->tcb_flags |= IN_TCP_SEND;
//
// In most cases, we will already have a route at this point.
// However, if we failed to get one earlier in the passive receive
// path, we may need to retry here.
//
if (SendTCB->tcb_rce == NULL) {
InitRCE(SendTCB);
if (SendTCB->tcb_rce == NULL) {
SendTCB->tcb_flags &= ~IN_TCP_SEND;
goto bail;
}
}
//
// Validate that the address we're sourcing from and the route we're
// sending upon are still okay to use.
//
// We fail existing send requests for TCBs with a disconnected
// outgoing interface, except when a loopback route is used.
//
if (SendTCB->tcb_routing != RouteCacheValidationCounter) {
if (!ValidateSourceAndRoute(SendTCB) ||
IsDisconnectedAndNotLoopbackRCE(SendTCB->tcb_rce)) {
SendTCB->tcb_flags &= ~IN_TCP_SEND;
ASSERT(SendTCB->tcb_refcnt != 0);
TryToCloseTCB(SendTCB, TCB_CLOSE_ABORTED, PreLockIrql);
KeAcquireSpinLock(&SendTCB->tcb_lock, &PreLockIrql);
goto bail;
}
}
//
// Verify that our cached Path MTU is still valid.
// Watch for changes to IPsec policies since they can also effect our MSS.
// REVIEW: This the best spot to do this?
//
PMTU = GetEffectivePathMTUFromRCE(SendTCB->tcb_rce);
if (PMTU != SendTCB->tcb_pmtu ||
SecurityStateValidationCounter != SendTCB->tcb_security) {
//
// Either our Path MTU or the global security state has changed.
// Cache current values and then calculate a new MSS.
//
SendTCB->tcb_pmtu = PMTU;
SendTCB->tcb_security = SecurityStateValidationCounter;
CalculateMSSForTCB(SendTCB);
}
//
// We'll continue this loop until we send a FIN, or we break out
// internally for some other reason.
//
while (!(SendTCB->tcb_flags & FIN_OUTSTANDING)) {
CheckTCBSends(SendTCB);
AmtOutstanding = (uint)(SendTCB->tcb_sendnext - SendTCB->tcb_senduna);
AmtUnsent = SendTCB->tcb_unacked - AmtOutstanding;
ASSERT(*(int *)&AmtUnsent >= 0);
SendWin = (int)(MIN(SendTCB->tcb_sendwin, SendTCB->tcb_cwin) -
AmtOutstanding);
//
// If this send is after a fast recovery and sendwin is zero because
// of amount outstanding, then at least force 1 segment to prevent
// delayed ack timeouts from peer.
//
if (SendTCB->tcb_force) {
SendTCB->tcb_force = 0;
if (SendWin < SendTCB->tcb_mss) {
SendWin = SendTCB->tcb_mss;
}
}
//
// Since the window could have shrank, need to get it to zero at
// least.
//
ForceWin = (int)((SendTCB->tcb_flags & FORCE_OUTPUT) >>
FORCE_OUT_SHIFT);
SendWin = MAX(SendWin, ForceWin);
AmountToSend = MIN(MIN((uint)SendWin, AmtUnsent), SendTCB->tcb_mss);
ASSERT(SendTCB->tcb_mss > 0);
//
// See if we have enough to send. We'll send if we have at least a
// segment, or if we really have some data to send and we can send
// all that we have, or the send window is > 0 and we need to force
// output or send a FIN (note that if we need to force output
// SendWin will be at least 1 from the check above), or if we can
// send an amount == to at least half the maximum send window
// we've seen.
//
if (AmountToSend == SendTCB->tcb_mss ||
(AmountToSend != 0 && AmountToSend == AmtUnsent) ||
(SendWin != 0 &&
(((SendTCB->tcb_flags & FIN_NEEDED) &&
AmtUnsent <= SendTCB->tcb_mss) ||
(SendTCB->tcb_flags & FORCE_OUTPUT) ||
AmountToSend >= (SendTCB->tcb_maxwin / 2)))) {
//
// It's OK to send something. Allocate a packet header.
//
// REVIEW: It was easier to code all these allocations directly
// REVIEW: rather than use IPv6AllocatePacket.
//
// REVIEW: This grabs packets and buffers from the IPv6PacketPool
// REVIEW: and the IPv6BufferPool respectively. Should we instead
// REVIEW: have separate pools for TCP?
//
NdisAllocatePacket(&NdisStatus, &Packet, IPv6PacketPool);
if (NdisStatus != NDIS_STATUS_SUCCESS) {
KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
"TCPSend: couldn't allocate packet header!?!\n"));
goto error_oor;
}
// We'll fill in the CompletionData below.
InitializeNdisPacket(Packet);
PC(Packet)->CompletionHandler = TCPSendComplete;
//
// Our header buffer has extra space at the beginning for other
// headers to be prepended to ours without requiring further
// allocation calls. It also has extra space at the end to hold
// the send completion data.
//
LinkOffset = SendTCB->tcb_rce->NCE->IF->LinkHeaderSize;
HeaderLength =
(LinkOffset + sizeof(*IP) + sizeof(*TCP) +
sizeof(SendCmpltContext) +
__builtin_alignof(SendCmpltContext) - 1) &~
(UINT_PTR)(__builtin_alignof(SendCmpltContext) - 1);
Memory = ExAllocatePool(NonPagedPool, HeaderLength);
if (Memory == NULL) {
KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
"TCPSend: couldn't allocate header memory!?!\n"));
NdisFreePacket(Packet);
goto error_oor;
}
//
// When allocating the NDIS buffer describing this memory region,
// we don't tell it about the extra space on the end that we
// allocated for the send completion data.
//
NdisAllocateBuffer(&NdisStatus, &FirstBuffer, IPv6BufferPool,
Memory, LinkOffset + sizeof(*IP) + sizeof(*TCP));
if (NdisStatus != NDIS_STATUS_SUCCESS) {
KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
"TCPSend: couldn't allocate buffer!?!\n"));
ExFreePool(Memory);
NdisFreePacket(Packet);
goto error_oor;
}
//
// Skip over the extra space that will be filled in later by the
// link level. At this level we add the IPv6Header, the
// TCPHeader, and the data.
//
IP = (IPv6Header UNALIGNED *)((uchar *)Memory + LinkOffset);
IP->VersClassFlow = IP_VERSION;
IP->NextHeader = IP_PROTOCOL_TCP;
IP->HopLimit = TCPHopLimit(SendTCB);
IP->Source = SendTCB->tcb_saddr;
IP->Dest = SendTCB->tcb_daddr;
//
// Begin preparing the TCP header.
//
TCP = (TCPHeader UNALIGNED *)(IP + 1);
FillTCPHeader(SendTCB, TCP);
//
// Store the send completion data in the same buffer as the TCP
// header, right after the TCP header. This saves allocation
// overhead and works because we don't consider this area to be
// part of the packet data (we set this buffer's length to
// indicate that the data ends with the TCP header above).
//
// Note that this code relies on the fact that we don't include
// any TCP options (and thus don't have a variable length TCP
// header) in our data packets.
//
SCC = (SendCmpltContext *)((uchar *)Memory + HeaderLength -
sizeof(*SCC));
PC(Packet)->CompletionData = SCC;
#if DBG
SCC->scc_sig = scc_signature;
#endif
SCC->scc_ubufcount = 0;
SCC->scc_tbufcount = 0;
SCC->scc_count = 0;
AmountLeft = AmountToSend;
if (AmountToSend != 0) {
long Result;
//
// Loop through the sends on the TCB, building a frame.
//
CurrentBuffer = FirstBuffer;
CurSend = SendTCB->tcb_cursend;
CHECK_STRUCT(CurSend, tsr);
SCC->scc_firstsend = CurSend;
do {
ASSERT(CurSend->tsr_refcnt > 0);
Result = InterlockedIncrement(&(CurSend->tsr_refcnt));
ASSERT(Result > 0);
SCC->scc_count++;
//
// If the current send offset is 0 and the current
// send is less than or equal to what we have left
// to send, we haven't already put a transport
// buffer on this send, and nobody else is using
// the buffer chain directly, just use the input
// buffers. We check for other people using them
// by looking at tsr_lastbuf. If it's NULL,
// nobody else is using the buffers. If it's not
// NULL, somebody is.
//
if (SendTCB->tcb_sendofs == 0 &&
(SendTCB->tcb_sendsize <= AmountLeft) &&
(SCC->scc_tbufcount == 0) &&
CurSend->tsr_lastbuf == NULL) {
PNDIS_BUFFER LastBuf = SendTCB->tcb_sendbuf;
uint UBufLength = NdisBufferLength(LastBuf);
ushort UBufCount = 1;
while (NDIS_BUFFER_LINKAGE(LastBuf) != NULL) {
LastBuf = NDIS_BUFFER_LINKAGE(LastBuf);
UBufLength += NdisBufferLength(LastBuf);
UBufCount++;
}
if (SendTCB->tcb_sendsize == UBufLength) {
SCC->scc_ubufcount += UBufCount;
NDIS_BUFFER_LINKAGE(CurrentBuffer) =
SendTCB->tcb_sendbuf;
CurSend->tsr_lastbuf = CurrentBuffer = LastBuf;
AmountLeft -= SendTCB->tcb_sendsize;
SendTCB->tcb_sendsize = 0;
} else {
//
// Fall through with a non-zero tcb_sendsize.
//
ASSERT(SendTCB->tcb_sendsize != 0);
}
}
if (SendTCB->tcb_sendsize != 0) {
uint AmountToDup;
PNDIS_BUFFER NewBuf, Buf;
uint Offset;
NDIS_STATUS NStatus;
uchar *VirtualAddress;
uint Length;
//
// Either the current send has more data than
// we want to send, or the starting offset is
// not 0. In either case we'll need to loop
// through the current send, allocating
// buffers.
//
Buf = SendTCB->tcb_sendbuf;
Offset = SendTCB->tcb_sendofs;
do {
ASSERT(Buf != NULL);
NdisQueryBufferSafe(Buf, &VirtualAddress, &Length,
LowPagePriority);
if (VirtualAddress == NULL) {
//
// Couldn't map into kernel address space.
// If the packet is already partly built,
// send what we've got, otherwise error out.
//
goto error_oor2;
}
ASSERT((Offset < Length) ||
(Offset == 0 && Length == 0));
//
// Adjust the length for the offset into
// this buffer.
//
Length -= Offset;
AmountToDup = MIN(AmountLeft, Length);
NdisAllocateBuffer(&NStatus, &NewBuf,
IPv6BufferPool,
VirtualAddress + Offset,
AmountToDup);
if (NStatus == NDIS_STATUS_SUCCESS) {
SCC->scc_tbufcount++;
NDIS_BUFFER_LINKAGE(CurrentBuffer) = NewBuf;
CurrentBuffer = NewBuf;
if (AmountToDup >= Length) {
// Exhausted this buffer.
Buf = NDIS_BUFFER_LINKAGE(Buf);
Offset = 0;
} else {
Offset += AmountToDup;
ASSERT(Offset < NdisBufferLength(Buf));
}
SendTCB->tcb_sendsize -= AmountToDup;
AmountLeft -= AmountToDup;
} else {
//
// Couldn't allocate a buffer. If
// the packet is already partly built,
// send what we've got, otherwise
// error out.
//
error_oor2:
if (SCC->scc_tbufcount == 0 &&
SCC->scc_ubufcount == 0) {
NdisChainBufferAtFront(Packet, FirstBuffer);
TCPSendComplete(Packet, IP_GENERAL_FAILURE);
goto error_oor;
}
AmountToSend -= AmountLeft;
AmountLeft = 0;
break;
}
} while (AmountLeft && SendTCB->tcb_sendsize);
SendTCB->tcb_sendbuf = Buf;
SendTCB->tcb_sendofs = Offset;
}
if (CurSend->tsr_flags & TSR_FLAG_URG) {
ushort UP;
//
// This send is urgent data. We need to figure
// out what the urgent data pointer should be.
// We know sendnext is the starting sequence
// number of the frame, and that at the top of
// this do loop sendnext identified a byte in
// the CurSend at that time. We advanced CurSend
// at the same rate we've decremented
// AmountLeft (AmountToSend - AmountLeft ==
// AmountBuilt), so sendnext +
// (AmountToSend - AmountLeft) identifies a byte
// in the current value of CurSend, and that
// quantity plus tcb_sendsize is the sequence
// number one beyond the current send.
//
UP = (ushort)(AmountToSend - AmountLeft) +
(ushort)SendTCB->tcb_sendsize -
((SendTCB->tcb_flags & BSD_URGENT) ? 0 : 1);
TCP->tcp_urgent = net_short(UP);
TCP->tcp_flags |= TCP_FLAG_URG;
}
//
// See if we've exhausted this send. If we have,
// set the PUSH bit in this frame and move on to
// the next send. We also need to check the
// urgent data bit.
//
if (SendTCB->tcb_sendsize == 0) {
Queue *Next;
uchar PrevFlags;
//
// We've exhausted this send. Set the PUSH bit.
//
TCP->tcp_flags |= TCP_FLAG_PUSH;
PrevFlags = CurSend->tsr_flags;
Next = QNEXT(&CurSend->tsr_req.tr_q);
if (Next != QEND(&SendTCB->tcb_sendq)) {
CurSend = CONTAINING_RECORD(
QSTRUCT(TCPReq, Next, tr_q),
TCPSendReq, tsr_req);
CHECK_STRUCT(CurSend, tsr);
SendTCB->tcb_sendsize = CurSend->tsr_unasize;
SendTCB->tcb_sendofs = CurSend->tsr_offset;
SendTCB->tcb_sendbuf = CurSend->tsr_buffer;
SendTCB->tcb_cursend = CurSend;
//
// Check the urgent flags. We can't combine new
// urgent data on to the end of old non-urgent
// data.
//
if ((PrevFlags & TSR_FLAG_URG) &&
!(CurSend->tsr_flags & TSR_FLAG_URG))
break;
} else {
ASSERT(AmountLeft == 0);
SendTCB->tcb_cursend = NULL;
SendTCB->tcb_sendbuf = NULL;
}
}
} while (AmountLeft != 0);
} else {
//
// We're in the loop, but AmountToSend is 0. This
// should happen only when we're sending a FIN. Check
// this, and return if it's not true.
//
ASSERT(AmtUnsent == 0);
if (!(SendTCB->tcb_flags & FIN_NEEDED)) {
// KdBreakPoint();
ExFreePool(NdisBufferVirtualAddress(FirstBuffer));
NdisFreeBuffer(FirstBuffer);
NdisFreePacket(Packet);
break;
}
SCC->scc_firstsend = NULL; // REVIEW: looks unneccessary.
NDIS_BUFFER_LINKAGE(FirstBuffer) = NULL;
}
// Adjust for what we're really going to send.
AmountToSend -= AmountLeft;
//
// Update the sequence numbers, and start a RTT measurement
// if needed.
//
OldSeq = SendTCB->tcb_sendnext;
SendTCB->tcb_sendnext += AmountToSend;
if (!SEQ_EQ(OldSeq, SendTCB->tcb_sendmax)) {
//
// We have at least some retransmission. Bump the stat.
//
TStats.ts_retranssegs++;
}
if (SEQ_GT(SendTCB->tcb_sendnext, SendTCB->tcb_sendmax)) {
//
// We're sending at least some new data.
// We can't advance sendmax once FIN_SENT is set.
//
ASSERT(!(SendTCB->tcb_flags & FIN_SENT));
SendTCB->tcb_sendmax = SendTCB->tcb_sendnext;
TStats.ts_outsegs++;
//
// Check the Round-Trip Timer.
//
if (SendTCB->tcb_rtt == 0) {
// No RTT running, so start one.
SendTCB->tcb_rtt = TCPTime;
SendTCB->tcb_rttseq = OldSeq;
}
}
//
// We've built the frame entirely. If we've sent everything
// we have and there's a FIN pending, OR it in.
//
if (AmtUnsent == AmountToSend) {
if (SendTCB->tcb_flags & FIN_NEEDED) {
ASSERT(!(SendTCB->tcb_flags & FIN_SENT) ||
(SendTCB->tcb_sendnext ==
(SendTCB->tcb_sendmax - 1)));
//
// See if we still have room in the window for a FIN.
//
if (SendWin > (int) AmountToSend) {
TCP->tcp_flags |= TCP_FLAG_FIN;
SendTCB->tcb_sendnext++;
SendTCB->tcb_sendmax = SendTCB->tcb_sendnext;
SendTCB->tcb_flags |= (FIN_SENT | FIN_OUTSTANDING);
SendTCB->tcb_flags &= ~FIN_NEEDED;
}
}
}
AmountToSend += sizeof(TCPHeader);
if (!TCB_TIMER_RUNNING(SendTCB->tcb_rexmittimer))
START_TCB_TIMER(SendTCB->tcb_rexmittimer, SendTCB->tcb_rexmit);
SendTCB->tcb_flags &= ~(NEED_ACK | ACK_DELAYED | FORCE_OUTPUT);
STOP_TCB_TIMER(SendTCB->tcb_delacktimer);
STOP_TCB_TIMER(SendTCB->tcb_swstimer);
SendTCB->tcb_alive = TCPTime;
// Add the buffers to the packet.
NdisChainBufferAtFront(Packet, FirstBuffer);
//
// Compute the TCP checksum. It covers the entire TCP segment
// starting with the TCP header, plus the IPv6 pseudo-header.
//
TCP->tcp_xsum = 0;
TCP->tcp_xsum = ChecksumPacket(
Packet, LinkOffset + sizeof *IP, NULL, AmountToSend,
AlignAddr(&IP->Source), AlignAddr(&IP->Dest), IP_PROTOCOL_TCP);
//
// Capture and reference the RCE while we still hold the TCB lock.
// The TCB's reference on this particular RCE might go away at any
// point after we release the lock.
//
RCE = SendTCB->tcb_rce;
AddRefRCE(RCE);
//
// Everything's ready. Now send the packet.
//
// Note that IPv6Send does not return a status code.
// Instead it *always* completes the packet
// with an appropriate status code.
//
KeReleaseSpinLock(&SendTCB->tcb_lock, PreLockIrql);
if (TCP->tcp_xsum == 0) {
//
// ChecksumPacket failed, so abort the transmission.
//
IPv6SendComplete(NULL, Packet, IP_NO_RESOURCES);
} else {
IPv6Send(Packet, LinkOffset, IP,
AmountToSend, RCE, 0,
IP_PROTOCOL_TCP,
net_short(TCP->tcp_src),
net_short(TCP->tcp_dest));
}
ReleaseRCE(RCE);
KeAcquireSpinLock(&SendTCB->tcb_lock, &PreLockIrql);
continue;
} else {
//
// We've decided we can't send anything now. Figure out why, and
// see if we need to set a timer.
//
if (SendTCB->tcb_sendwin == 0) {
if (!(SendTCB->tcb_flags & FLOW_CNTLD)) {
SendTCB->tcb_flags |= FLOW_CNTLD;
SendTCB->tcb_rexmitcnt = 0;
START_TCB_TIMER(SendTCB->tcb_rexmittimer,
SendTCB->tcb_rexmit);
SendTCB->tcb_slowcount++;
SendTCB->tcb_fastchk |= TCP_FLAG_SLOW;
} else
if (!TCB_TIMER_RUNNING(SendTCB->tcb_rexmittimer))
START_TCB_TIMER(SendTCB->tcb_rexmittimer,
SendTCB->tcb_rexmit);
} else
if (AmountToSend != 0)
// We have something to send, but we're not sending
// it, presumably due to SWS avoidance.
if (!TCB_TIMER_RUNNING(SendTCB->tcb_swstimer))
START_TCB_TIMER(SendTCB->tcb_swstimer, SWS_TO);
break;
}
} // while (!FIN_OUTSTANDING)
//
// We're done sending, so we don't need the output flags set.
//
SendTCB->tcb_flags &= ~(IN_TCP_SEND | NEED_OUTPUT | FORCE_OUTPUT |
SEND_AFTER_RCV);
bail:
DerefTCB(SendTCB, PreLockIrql);
return;
//
// Common case error handling code for out of resource conditions. Start the
// retransmit timer if it's not already running (so that we try this again
// later), clean up and return.
//
error_oor:
if (!TCB_TIMER_RUNNING(SendTCB->tcb_rexmittimer))
START_TCB_TIMER(SendTCB->tcb_rexmittimer, SendTCB->tcb_rexmit);
// We had an out of resource problem, so clear the OUTPUT flags.
SendTCB->tcb_flags &= ~(IN_TCP_SEND | NEED_OUTPUT | FORCE_OUTPUT);
DerefTCB(SendTCB, PreLockIrql);
return;
} // end of TCPSend()
//* ResetSendNextAndFastSend - Set the sendnext value of a TCB.
//
// Called to fast retransmit the dropped segment.
//
// We assume the caller has put a reference on the TCB, and the TCB is locked
// on entry. The reference is dropped and the lock released before returning.
//
void // Returns: Nothing.
ResetAndFastSend(
TCB *SeqTCB, // TCB for this connection.
SeqNum NewSeq, // Sequence number to set.
uint NewCWin) // New value for congestion window.
{
TCPSendReq *SendReq;
Queue *CurQ;
PNDIS_BUFFER Buffer;
uint Offset;
uint SendSize;
CHECK_STRUCT(SeqTCB, tcb);
ASSERT(SEQ_GTE(NewSeq, SeqTCB->tcb_senduna));
//
// The new seq must be less than send max, or NewSeq, senduna, sendnext,
// and sendmax must all be equal. (The latter case happens when we're
// called exiting TIME_WAIT, or possibly when we're retransmitting
// during a flow controlled situation).
//
ASSERT(SEQ_LT(NewSeq, SeqTCB->tcb_sendmax) ||
(SEQ_EQ(SeqTCB->tcb_senduna, SeqTCB->tcb_sendnext) &&
SEQ_EQ(SeqTCB->tcb_senduna, SeqTCB->tcb_sendmax) &&
SEQ_EQ(SeqTCB->tcb_senduna, NewSeq)));
if (SYNC_STATE(SeqTCB->tcb_state) &&
(SeqTCB->tcb_state != TCB_TIME_WAIT)) {
if (!EMPTYQ(&SeqTCB->tcb_sendq)) {
CurQ = QHEAD(&SeqTCB->tcb_sendq);
SendReq = (TCPSendReq *) CONTAINING_RECORD(CurQ, TCPReq, tr_q);
//
// SendReq points to the first send request on the send queue.
// We're pointing at the proper send req now. We need to go down.
//
// SendReq points to the cursend.
// SendSize point to sendsize in the cursend.
//
SendSize = SendReq->tsr_unasize;
Buffer = SendReq->tsr_buffer;
Offset = SendReq->tsr_offset;
// Call the fast retransmit send now.
TCPFastSend(SeqTCB, Buffer, Offset, SendReq, SendSize, NewSeq,
SeqTCB->tcb_mss);
} else {
ASSERT(SeqTCB->tcb_cursend == NULL);
}
}
SeqTCB->tcb_cwin = NewCWin;
DerefTCB(SeqTCB, DISPATCH_LEVEL);
return;
}
//* TCPFastSend - To send a segment without changing TCB state.
//
// Called to handle fast retransmit of the lost segment.
// tcb_lock will be held while entering (called by TCPRcv).
//
void // Returns: Nothing.
TCPFastSend(
TCB *SendTCB, // TCB for this connection.
PNDIS_BUFFER in_SendBuf, // NDIS buffer.
uint SendOfs, // Send offset.
TCPSendReq *CurSend, // Current send request.
uint SendSize, // Size of this send.
SeqNum SendNext, // Sequence number to use for this send.
int in_ToBeSent) // Cap on SendSize (REVIEW: Callee should cap).
{
uint AmountToSend; // Amount to send this time.
uint AmountLeft;
IPv6Header UNALIGNED *IP;
TCPHeader UNALIGNED *TCP;
PNDIS_PACKET Packet;
PNDIS_BUFFER FirstBuffer, CurrentBuffer;
void *Memory;
SendCmpltContext *SCC;
NDIS_STATUS NdisStatus;
uint AmtOutstanding, AmtUnsent;
uint HeaderLength;
uint LinkOffset;
uint PMTU;
KIRQL PreLockIrql;
PNDIS_BUFFER SendBuf = in_SendBuf;
RouteCacheEntry *RCE;
PreLockIrql = DISPATCH_LEVEL;
CHECK_STRUCT(SendTCB, tcb);
ASSERT(SendTCB->tcb_refcnt != 0);
ASSERT(*(int *)&SendTCB->tcb_sendwin >= 0);
ASSERT(*(int *)&SendTCB->tcb_cwin >= SendTCB->tcb_mss);
ASSERT(!(SendTCB->tcb_flags & FIN_OUTSTANDING) ||
(SendTCB->tcb_sendnext == SendTCB->tcb_sendmax));
//
// In most cases, we will already have a route at this point.
// However, if we failed to get one earlier in the passive receive
// path, we may need to retry here.
//
if (SendTCB->tcb_rce == NULL) {
InitRCE(SendTCB);
if (SendTCB->tcb_rce == NULL) {
DerefTCB(SendTCB, PreLockIrql);
return;
}
}
//
// Validate that the address we're sourcing from and the route we're
// sending upon are still okay to use.
//
// We fail existing send requests for TCBs with a disconnected
// outgoing interface, except when a loopback route is used.
//
if (SendTCB->tcb_routing != RouteCacheValidationCounter) {
if (!ValidateSourceAndRoute(SendTCB) ||
IsDisconnectedAndNotLoopbackRCE(SendTCB->tcb_rce)) {
ASSERT(SendTCB->tcb_refcnt != 0);
TryToCloseTCB(SendTCB, TCB_CLOSE_ABORTED, PreLockIrql);
KeAcquireSpinLock(&SendTCB->tcb_lock, &PreLockIrql);
DerefTCB(SendTCB, PreLockIrql);
return;
}
}
//
// Verify that our cached Path MTU is still valid.
// Watch for changes to IPsec policies since they can also effect our MSS.
// REVIEW: This the best spot to do this?
//
PMTU = GetEffectivePathMTUFromRCE(SendTCB->tcb_rce);
if (PMTU != SendTCB->tcb_pmtu ||
SecurityStateValidationCounter != SendTCB->tcb_security) {
//
// Either our Path MTU or the global security state has changed.
// Cache current values and then calculate a new MSS.
//
SendTCB->tcb_pmtu = PMTU;
SendTCB->tcb_security = SecurityStateValidationCounter;
CalculateMSSForTCB(SendTCB);
}
AmtOutstanding = (uint)(SendTCB->tcb_sendnext - SendTCB->tcb_senduna);
AmtUnsent = MIN(MIN(in_ToBeSent, (int)SendSize),
(int)SendTCB->tcb_sendwin);
while (AmtUnsent > 0) {
if (SEQ_GT(SendTCB->tcb_senduna, SendNext)) {
//
// Since tcb_lock is released in this loop
// it is possible that delayed ack acked
// what we are trying to retransmit.
//
goto error_oor;
}
// AmtUnsent below was minimum of sendwin and amtunsent
AmountToSend = MIN(AmtUnsent, SendTCB->tcb_mss);
ASSERT((int)AmtUnsent >= 0);
//
// We're going to send something. Allocate a packet header.
//
// REVIEW: It was easier to code all these allocations directly
// REVIEW: rather than use IPv6AllocatePacket.
//
// REVIEW: This grabs packets and buffers from the IPv6PacketPool
// REVIEW: and the IPv6BufferPool respectively. Should we instead
// REVIEW: have separate pools for TCP?
//
NdisAllocatePacket(&NdisStatus, &Packet, IPv6PacketPool);
if (NdisStatus != NDIS_STATUS_SUCCESS) {
KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
"TCPSend: couldn't allocate packet header!?!\n"));
goto error_oor;
}
// We'll fill in the CompletionData below.
InitializeNdisPacket(Packet);
PC(Packet)->CompletionHandler = TCPSendComplete;
//
// Our header buffer has extra space at the beginning for other
// headers to be prepended to ours without requiring further
// allocation calls. It also has extra space at the end to hold
// the send completion data.
//
LinkOffset = SendTCB->tcb_rce->NCE->IF->LinkHeaderSize;
HeaderLength = (LinkOffset + sizeof(*IP) + sizeof(*TCP) +
sizeof(SendCmpltContext) +
__builtin_alignof(SendCmpltContext) - 1) &~
(UINT_PTR)(__builtin_alignof(SendCmpltContext) - 1);
Memory = ExAllocatePool(NonPagedPool, HeaderLength);
if (Memory == NULL) {
KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
"TCPSend: couldn't allocate header memory!?!\n"));
NdisFreePacket(Packet);
goto error_oor;
}
//
// When allocating the NDIS buffer describing this memory region,
// we don't tell it about the extra space on the end that we
// allocated for the send completion data.
//
NdisAllocateBuffer(&NdisStatus, &FirstBuffer, IPv6BufferPool,
Memory, LinkOffset + sizeof(*IP) + sizeof(*TCP));
if (NdisStatus != NDIS_STATUS_SUCCESS) {
KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
"TCPSend: couldn't allocate buffer!?!\n"));
ExFreePool(Memory);
NdisFreePacket(Packet);
goto error_oor;
}
//
// Skip over the extra space that will be filled in later by the
// link level. At this level we add the IPv6Header, the
// TCPHeader, and the data.
//
IP = (IPv6Header UNALIGNED *)((uchar *)Memory + LinkOffset);
IP->VersClassFlow = IP_VERSION;
IP->NextHeader = IP_PROTOCOL_TCP;
IP->HopLimit = TCPHopLimit(SendTCB);
IP->Source = SendTCB->tcb_saddr;
IP->Dest = SendTCB->tcb_daddr;
//
// Begin preparing the TCP header.
//
TCP = (TCPHeader UNALIGNED *)(IP + 1);
FillTCPHeader(SendTCB, TCP);
TCP->tcp_seq = net_long(SendNext);
//
// Store the send completion data in the same buffer as the TCP
// header, right after the TCP header. This saves allocation
// overhead and works because we don't consider this area to be
// part of the packet data (we set this buffer's length to
// indicate that the data ends with the TCP header above).
//
// Note that this code relies on the fact that we don't include
// any TCP options (and thus don't have a variable length TCP
// header) in our data packets.
//
SCC = (SendCmpltContext *)((uchar *)Memory + HeaderLength -
sizeof(*SCC));
PC(Packet)->CompletionData = SCC;
#if DBG
SCC->scc_sig = scc_signature;
#endif
SCC->scc_ubufcount = 0;
SCC->scc_tbufcount = 0;
SCC->scc_count = 0;
AmountLeft = AmountToSend;
if (AmountToSend != 0) {
long Result;
//
// Loop through the sends on the TCB, building a frame.
//
CurrentBuffer = FirstBuffer;
CHECK_STRUCT(CurSend, tsr);
SCC->scc_firstsend = CurSend;
do {
ASSERT(CurSend->tsr_refcnt > 0);
Result = InterlockedIncrement(&(CurSend->tsr_refcnt));
ASSERT(Result > 0);
SCC->scc_count++;
//
// If the current send offset is 0 and the current
// send is less than or equal to what we have left
// to send, we haven't already put a transport
// buffer on this send, and nobody else is using
// the buffer chain directly, just use the input
// buffers. We check for other people using them
// by looking at tsr_lastbuf. If it's NULL,
// nobody else is using the buffers. If it's not
// NULL, somebody is.
//
if (SendOfs == 0 &&
(SendSize <= AmountLeft) &&
(SCC->scc_tbufcount == 0) &&
CurSend->tsr_lastbuf == NULL) {
PNDIS_BUFFER LastBuf = SendBuf;
uint UBufLength = NdisBufferLength(LastBuf);
ushort UBufCount = 1;
while (NDIS_BUFFER_LINKAGE(LastBuf) != NULL) {
LastBuf = NDIS_BUFFER_LINKAGE(LastBuf);
UBufLength += NdisBufferLength(LastBuf);
UBufCount++;
}
if (SendSize == UBufLength) {
SCC->scc_ubufcount += UBufCount;
NDIS_BUFFER_LINKAGE(CurrentBuffer) = SendBuf;
CurSend->tsr_lastbuf = CurrentBuffer = LastBuf;
AmountLeft -= SendSize;
SendSize = 0;
} else {
//
// Fall through with a non-zero SendSize.
//
ASSERT(SendSize != 0);
}
}
if (SendSize != 0) {
uint AmountToDup;
PNDIS_BUFFER NewBuf, Buf;
uint Offset;
NDIS_STATUS NStatus;
uchar *VirtualAddress;
uint Length;
//
// Either the current send has more data than
// we want to send, or the starting offset is
// not 0. In either case we'll need to loop
// through the current send, allocating buffers.
//
Buf = SendBuf;
Offset = SendOfs;
do {
ASSERT(Buf != NULL);
NdisQueryBufferSafe(Buf, &VirtualAddress, &Length,
LowPagePriority);
if (VirtualAddress == NULL) {
goto error_oor2;
}
ASSERT((Offset < Length) ||
(Offset == 0 && Length == 0));
//
// Adjust the length for the offset into
// this buffer.
//
Length -= Offset;
AmountToDup = MIN(AmountLeft, Length);
NdisAllocateBuffer(&NStatus, &NewBuf,
IPv6BufferPool,
VirtualAddress + Offset,
AmountToDup);
if (NStatus == NDIS_STATUS_SUCCESS) {
SCC->scc_tbufcount++;
NDIS_BUFFER_LINKAGE(CurrentBuffer) = NewBuf;
CurrentBuffer = NewBuf;
if (AmountToDup >= Length) {
// Exhausted this buffer.
Buf = NDIS_BUFFER_LINKAGE(Buf);
Offset = 0;
} else {
Offset += AmountToDup;
ASSERT(Offset < NdisBufferLength(Buf));
}
SendSize -= AmountToDup;
AmountLeft -= AmountToDup;
} else {
//
// Couldn't allocate a buffer. If
// the packet is already partly built,
// send what we've got, otherwise
// error out.
//
error_oor2:
if (SCC->scc_tbufcount == 0 &&
SCC->scc_ubufcount == 0) {
KeReleaseSpinLockFromDpcLevel(
&SendTCB->tcb_lock);
NdisChainBufferAtFront(Packet, FirstBuffer);
TCPSendComplete(Packet, IP_GENERAL_FAILURE);
KeAcquireSpinLockAtDpcLevel(&SendTCB->tcb_lock);
goto error_oor;
}
AmountToSend -= AmountLeft;
AmountLeft = 0;
break;
}
} while (AmountLeft && SendSize);
SendBuf = Buf;
SendOfs = Offset;
}
if (CurSend->tsr_flags & TSR_FLAG_URG) {
ushort UP;
//
// This send is urgent data. We need to figure
// out what the urgent data pointer should be.
// We know sendnext is the starting sequence
// number of the frame, and that at the top of
// this do loop sendnext identified a byte in
// the CurSend at that time. We advanced CurSend
// at the same rate we've decremented
// AmountLeft (AmountToSend - AmountLeft ==
// AmountBuilt), so sendnext +
// (AmountToSend - AmountLeft) identifies a byte
// in the current value of CurSend, and that
// quantity plus tcb_sendsize is the sequence
// number one beyond the current send.
//
UP = (ushort) (AmountToSend - AmountLeft) +
(ushort) SendSize -
((SendTCB->tcb_flags & BSD_URGENT) ? 0 : 1);
TCP->tcp_urgent = net_short(UP);
TCP->tcp_flags |= TCP_FLAG_URG;
}
//
// See if we've exhausted this send. If we have,
// set the PUSH bit in this frame and move on to
// the next send. We also need to check the
// urgent data bit.
//
if (SendSize == 0) {
Queue *Next;
ulong PrevFlags;
//
// We've exhausted this send. Set the PUSH bit.
//
TCP->tcp_flags |= TCP_FLAG_PUSH;
PrevFlags = CurSend->tsr_flags;
Next = QNEXT(&CurSend->tsr_req.tr_q);
if (Next != QEND(&SendTCB->tcb_sendq)) {
CurSend = CONTAINING_RECORD(
QSTRUCT(TCPReq, Next, tr_q),
TCPSendReq, tsr_req);
CHECK_STRUCT(CurSend, tsr);
SendSize = CurSend->tsr_unasize;
SendOfs = CurSend->tsr_offset;
SendBuf = CurSend->tsr_buffer;
//
// Check the urgent flags. We can't combine new
// urgent data on to the end of old non-urgent
// data.
//
if ((PrevFlags & TSR_FLAG_URG) &&
!(CurSend->tsr_flags & TSR_FLAG_URG)) {
break;
}
} else {
ASSERT(AmountLeft == 0);
CurSend = NULL;
SendBuf = NULL;
}
}
} while (AmountLeft != 0);
} else {
//
// Amt to send is 0.
// Just bail out and start timer.
//
if (!TCB_TIMER_RUNNING(SendTCB->tcb_rexmittimer)) {
START_TCB_TIMER(SendTCB->tcb_rexmittimer,
SendTCB->tcb_rexmit);
}
ExFreePool(NdisBufferVirtualAddress(FirstBuffer));
NdisFreeBuffer(FirstBuffer);
NdisFreePacket(Packet);
return;
}
//
// Adjust for what we're really going to send.
//
AmountToSend -= AmountLeft;
SendNext += AmountToSend;
AmtUnsent -= AmountToSend;
TStats.ts_retranssegs++;
AmountToSend += sizeof(TCPHeader);
if (!TCB_TIMER_RUNNING(SendTCB->tcb_rexmittimer)) {
START_TCB_TIMER(SendTCB->tcb_rexmittimer, SendTCB->tcb_rexmit);
}
SendTCB->tcb_flags &= ~(NEED_ACK | ACK_DELAYED | FORCE_OUTPUT);
STOP_TCB_TIMER(SendTCB->tcb_delacktimer);
STOP_TCB_TIMER(SendTCB->tcb_swstimer);
//
// Add the buffers to the packet.
//
NdisChainBufferAtFront(Packet, FirstBuffer);
//
// Compute the TCP checksum. It covers the entire TCP segment
// starting with the TCP header, plus the IPv6 pseudo-header.
//
TCP->tcp_xsum = 0;
TCP->tcp_xsum = ChecksumPacket(
Packet, LinkOffset + sizeof *IP, NULL, AmountToSend,
AlignAddr(&IP->Source), AlignAddr(&IP->Dest), IP_PROTOCOL_TCP);
//
// Capture and reference the RCE while we still hold the TCB lock.
// The TCB's reference on this particular RCE might go away at any
// point after we release the lock.
//
RCE = SendTCB->tcb_rce;
AddRefRCE(RCE);
//
// Everything's ready. Now send the packet.
//
// Note that IPv6Send does not return a status code.
// Instead it *always* completes the packet
// with an appropriate status code.
//
KeReleaseSpinLock(&SendTCB->tcb_lock, PreLockIrql);
if (TCP->tcp_xsum == 0) {
//
// ChecksumPacket failed, so abort the transmission.
//
IPv6SendComplete(NULL, Packet, IP_NO_RESOURCES);
} else {
IPv6Send(Packet, LinkOffset, IP,
AmountToSend, RCE, 0,
IP_PROTOCOL_TCP,
net_short(TCP->tcp_src),
net_short(TCP->tcp_dest));
}
//
// Release reference and reacquire lock we dropped before sending.
//
ReleaseRCE(RCE);
KeAcquireSpinLock(&SendTCB->tcb_lock, &PreLockIrql);
}
return;
//
// Common case error handling code for out of resource conditions.
// Start the retransmit timer if it's not already running
// (so that we try this again later), clean up and return.
//
error_oor:
if (!TCB_TIMER_RUNNING(SendTCB->tcb_rexmittimer)) {
START_TCB_TIMER(SendTCB->tcb_rexmittimer, SendTCB->tcb_rexmit);
}
return;
}
//* TDISend - Send data on a connection.
//
// The main TDI send entry point. We take the input parameters, validate
// them, allocate a send request, etc. We then put the send request on the
// queue. If we have no other sends on the queue or Nagling is disabled we'll
// call TCPSend to send the data.
//
TDI_STATUS // Returns: Status of attempt to send.
TdiSend(
PTDI_REQUEST Request, // TDI request for the call.
ushort Flags, // Flags for this send.
uint SendLength, // Length in bytes of send.
PNDIS_BUFFER SendBuffer) // Buffer chain to be sent.
{
TCPConn *Conn;
TCB *SendTCB;
TCPSendReq *SendReq;
KIRQL OldIrql;
TDI_STATUS Error;
uint EmptyQ;
#if DBG
uint RealSendSize;
PNDIS_BUFFER Temp;
//
// Loop through the buffer chain, and make sure that the length matches
// up with SendLength.
//
Temp = SendBuffer;
RealSendSize = 0;
do {
ASSERT(Temp != NULL);
RealSendSize += NdisBufferLength(Temp);
Temp = NDIS_BUFFER_LINKAGE(Temp);
} while (Temp != NULL);
ASSERT(RealSendSize == SendLength);
#endif
//
// Grab lock on Connection Table. Then get our connection info from
// the TDI request, and our TCP control block from that.
//
Conn = GetConnFromConnID(PtrToUlong(Request->Handle.ConnectionContext),
&OldIrql);
if (Conn == NULL) {
Error = TDI_INVALID_CONNECTION;
goto abort;
}
CHECK_STRUCT(Conn, tc);
SendTCB = Conn->tc_tcb;
if (SendTCB == NULL) {
Error = TDI_INVALID_STATE;
KeReleaseSpinLock(&Conn->tc_ConnBlock->cb_lock, OldIrql);
abort:
return Error;
}
CHECK_STRUCT(SendTCB, tcb);
//
// Switch to a finer-grained lock:
// Drop lock on the Connection Table in favor of one on our TCB.
//
KeAcquireSpinLockAtDpcLevel(&SendTCB->tcb_lock);
KeReleaseSpinLockFromDpcLevel(&Conn->tc_ConnBlock->cb_lock);
//
// Make sure our TCB is in a send-able state.
//
if (!DATA_SEND_STATE(SendTCB->tcb_state) || CLOSING(SendTCB)) {
Error = TDI_INVALID_STATE;
goto abort2;
}
CheckTCBSends(SendTCB); // Just a debug check.
//
// If we've released our RCE for some reason, reacquire one.
//
if (SendTCB->tcb_rce == NULL) {
InitRCE(SendTCB);
if (SendTCB->tcb_rce == NULL) {
Error = TDI_DEST_NET_UNREACH;
goto abort2;
}
}
//
// Verify that the cached RCE is still valid.
//
SendTCB->tcb_rce = ValidateRCE(SendTCB->tcb_rce, SendTCB->tcb_nte);
ASSERT(SendTCB->tcb_rce != NULL);
if (IsDisconnectedAndNotLoopbackRCE(SendTCB->tcb_rce)) {
//
// Fail new send requests for TCBs with a disconnected
// outgoing interface, except when the loopback route is used.
//
Error = TDI_DEST_NET_UNREACH;
goto abort2;
}
if (SendLength == 0) {
//
// Wow, nothing to do!
//
// REVIEW: Can't we do this check earlier (like before we even grab the
// REVIEW: Connection Table lock? The only reason I can think not to
// REVIEW: would be if something cared about the return code if a bad
// REVIEW: Tdi Request was given to us.
//
Error = TDI_SUCCESS;
goto abort2;
}
//
// We have a TCB, and it's valid. Allocate a send request now.
//
SendReq = GetSendReq();
if (SendReq == NULL) {
Error = TDI_NO_RESOURCES;
abort2:
KeReleaseSpinLock(&SendTCB->tcb_lock, OldIrql);
return Error;
}
//
// Prepare a TCP send request based on the TDI request and the
// passed in buffer chain.
//
SendReq->tsr_req.tr_rtn = Request->RequestNotifyObject;
SendReq->tsr_req.tr_context = Request->RequestContext;
SendReq->tsr_buffer = SendBuffer;
SendReq->tsr_size = SendLength;
SendReq->tsr_unasize = SendLength;
SendReq->tsr_refcnt = 1; // ACK will decrement this ref
SendReq->tsr_offset = 0;
SendReq->tsr_lastbuf = NULL;
SendReq->tsr_time = TCPTime;
SendReq->tsr_flags = (Flags & TDI_SEND_EXPEDITED) ? TSR_FLAG_URG : 0;
//
// Check current status of our send queue.
//
EmptyQ = EMPTYQ(&SendTCB->tcb_sendq);
//
// Add this send request to our send queue.
//
SendTCB->tcb_unacked += SendLength;
ENQUEUE(&SendTCB->tcb_sendq, &SendReq->tsr_req.tr_q);
if (SendTCB->tcb_cursend == NULL) {
//
// No existing current send request, so make this new one
// the current send.
//
// REVIEW: Is this always equivalent to EMPTYQ test above?
// REVIEW: If so, why not just set EmptyQ flag here and save a test?
//
SendTCB->tcb_cursend = SendReq;
SendTCB->tcb_sendbuf = SendBuffer;
SendTCB->tcb_sendofs = 0;
SendTCB->tcb_sendsize = SendLength;
}
//
// See if we should try to send now. We attempt to do so if we weren't
// already blocked, or if we were and either the Nagle Algorithm is turned
// off or we now have at least one max segment worth of data to send.
//
if (EmptyQ || (!(SendTCB->tcb_flags & NAGLING) ||
(SendTCB->tcb_unacked -
(SendTCB->tcb_sendmax - SendTCB->tcb_senduna))
>= SendTCB->tcb_mss)) {
SendTCB->tcb_refcnt++;
TCPSend(SendTCB, OldIrql);
} else
KeReleaseSpinLock(&SendTCB->tcb_lock, OldIrql);
//
// When TCPSend returns, we may or may not have already sent the data
// associated with this particular request.
//
return TDI_PENDING;
}
#pragma BEGIN_INIT
//* InitTCPSend - Initialize our send side.
//
// Called during init time to initialize our TCP send state.
//
int // Returns: TRUE if we inited, false if we didn't.
InitTCPSend(
void) // Nothing.
{
ExInitializeSListHead(&TCPSendReqFree);
KeInitializeSpinLock(&TCPSendReqFreeLock);
IPv6RegisterULProtocol(IP_PROTOCOL_TCP, TCPReceive, TCPControlReceive);
return TRUE;
}
#pragma END_INIT
//* UnloadTCPSend
//
// Cleanup and prepare for stack unload.
//
void
UnloadTCPSend(void)
{
PSLIST_ENTRY BufferLink;
while ((BufferLink = ExInterlockedPopEntrySList(&TCPSendReqFree,
&TCPSendReqFreeLock))
!= NULL) {
Queue *QueuePtr = CONTAINING_RECORD(BufferLink, Queue, q_next);
TCPReq *Req = CONTAINING_RECORD(QueuePtr, TCPReq, tr_q);
TCPSendReq *SendReq = CONTAINING_RECORD(Req, TCPSendReq, tsr_req);
CHECK_STRUCT(SendReq, tsr);
ExFreePool(SendReq);
}
IPv6RegisterULProtocol(IP_PROTOCOL_TCP, NULL, NULL);
}