// -*- mode: C++; tab-width: 4; indent-tabs-mode: nil -*- (for GNU Emacs) // // Copyright (c) 1985-2000 Microsoft Corporation // // This file is part of the Microsoft Research IPv6 Network Protocol Stack. // You should have received a copy of the Microsoft End-User License Agreement // for this software along with this release; see the file "license.txt". // If not, please see http://www.research.microsoft.com/msripv6/license.htm, // or write to Microsoft Research, One Microsoft Way, Redmond, WA 98052-6399. // // Abstract: // // TCP send code. // // This file contains the code for sending Data and Control segments. // #include "oscfg.h" #include "ndis.h" #include "ip6imp.h" #include "ip6def.h" #include "tdi.h" #include "tdint.h" #include "tdistat.h" #include "queue.h" #include "transprt.h" #include "addr.h" #include "tcp.h" #include "tcb.h" #include "tcpconn.h" #include "tcpsend.h" #include "tcprcv.h" #include "info.h" #include "tcpcfg.h" #include "route.h" #include "security.h" void *TCPProtInfo; // TCP protocol info for IP. SLIST_HEADER TCPSendReqFree; // Send req. free list. KSPIN_LOCK TCPSendReqFreeLock; KSPIN_LOCK TCPSendReqCompleteLock; uint NumTCPSendReq; // Current number of SendReqs in system. uint MaxSendReq = 0xffffffff; // Maximum allowed number of SendReqs. extern KSPIN_LOCK TCBTableLock; // // All of the init code can be discarded. // #ifdef ALLOC_PRAGMA #pragma alloc_text(INIT, InitTCPSend) #endif // ALLOC_PRAGMA extern void ResetSendNext(TCB *SeqTCB, SeqNum NewSeq); #define MIN_INITIAL_RTT 3 // In msec. //* FreeSendReq - Free a send request structure. // // Called to free a send request structure. // void // Returns: Nothing. FreeSendReq( TCPSendReq *FreedReq) // Connection request structure to be freed. 
{ PSLIST_ENTRY BufferLink; CHECK_STRUCT(FreedReq, tsr); BufferLink = CONTAINING_RECORD(&(FreedReq->tsr_req.tr_q.q_next), SLIST_ENTRY, Next); ExInterlockedPushEntrySList(&TCPSendReqFree, BufferLink, &TCPSendReqFreeLock); } //* GetSendReq - Get a send request structure. // // Called to get a send request structure. // TCPSendReq * // Returns: Pointer to SendReq structure, or NULL if none. GetSendReq( void) // Nothing. { TCPSendReq *Temp; PSLIST_ENTRY BufferLink; Queue *QueuePtr; TCPReq *ReqPtr; BufferLink = ExInterlockedPopEntrySList(&TCPSendReqFree, &TCPSendReqFreeLock); if (BufferLink != NULL) { QueuePtr = CONTAINING_RECORD(BufferLink, Queue, q_next); ReqPtr = CONTAINING_RECORD(QueuePtr, TCPReq, tr_q); Temp = CONTAINING_RECORD(ReqPtr, TCPSendReq, tsr_req); CHECK_STRUCT(Temp, tsr); } else { if (NumTCPSendReq < MaxSendReq) Temp = ExAllocatePool(NonPagedPool, sizeof(TCPSendReq)); else Temp = NULL; if (Temp != NULL) { ExInterlockedAddUlong((PULONG)&NumTCPSendReq, 1, &TCPSendReqFreeLock); #if DBG Temp->tsr_req.tr_sig = tr_signature; Temp->tsr_sig = tsr_signature; #endif } } return Temp; } //* TCPHopLimit // // Given a TCB, returns the Hop Limit to use in a sent packet. // Assumes the caller holds a lock on the TCB. // uchar TCPHopLimit(TCB *Tcb) { if (Tcb->tcb_hops != -1) return (uchar) Tcb->tcb_hops; else return (uchar) Tcb->tcb_rce->NCE->IF->CurHopLimit; } //* TCPSendComplete - Complete a TCP send. // // Called by IP when a send we've made is complete. We free the buffer, // and possibly complete some sends. Each send queued on a TCB has a ref. // count with it, which is the number of times a pointer to a buffer // associated with the send has been passed to the underlying IP layer. We // can't complete a send until that count it 0. If this send was actually // from a send of data, we'll go down the chain of send and decrement the // refcount on each one. If we have one going to 0 and the send has already // been acked we'll complete the send. 
If it hasn't been acked we'll leave // it until the ack comes in. // // NOTE: We aren't protecting any of this with locks. When we port this to // NT we'll need to fix this, probably with a global lock. See the comments // in ACKSend() in TCPRCV.C for more details. // void // Returns: Nothing. TCPSendComplete( PNDIS_PACKET Packet, // Packet that was sent. IP_STATUS Status) { PNDIS_BUFFER BufferChain; SendCmpltContext *SCContext; PVOID Memory; UINT Unused; UNREFERENCED_PARAMETER(Status); // // Pull values we care about out of the packet structure. // SCContext = (SendCmpltContext *) PC(Packet)->CompletionData; BufferChain = NdisFirstBuffer(Packet); NdisQueryBufferSafe(BufferChain, &Memory, &Unused, LowPagePriority); ASSERT(Memory != NULL); // // See if we have a send complete context. It will be present for data // packets and means we have extra work to do. For non-data packets, we // can just skip all this as there is only the header buffer to deal with. // if (SCContext != NULL) { KIRQL OldIrql; PNDIS_BUFFER CurrentBuffer; TCPSendReq *CurrentSend; uint i; CHECK_STRUCT(SCContext, scc); // // First buffer in chain is the TCP header buffer. // Skip over it for now. // CurrentBuffer = NDIS_BUFFER_LINKAGE(BufferChain); // // Also skip over any 'user' buffers (those loaned out to us // instead of copied) as we don't need to free them. // for (i = 0; i < (uint)SCContext->scc_ubufcount; i++) { ASSERT(CurrentBuffer != NULL); CurrentBuffer = NDIS_BUFFER_LINKAGE(CurrentBuffer); } // // Now loop through and free our (aka 'transport') buffers. // We need to do this before decrementing the reference count to avoid // destroying the buffer chain if we have to zap tsr_lastbuf->Next to // NULL. 
// for (i = 0; i < (uint)SCContext->scc_tbufcount; i++) { PNDIS_BUFFER TempBuffer; ASSERT(CurrentBuffer != NULL); TempBuffer = CurrentBuffer; CurrentBuffer = NDIS_BUFFER_LINKAGE(CurrentBuffer); NdisFreeBuffer(TempBuffer); } // // Loop through the send requests attached to this packet, // reducing the reference count on each and enqueing them for // completion where appropriate. // CurrentSend = SCContext->scc_firstsend; for (i = 0; i< SCContext->scc_count; i++) { Queue *TempQ; long Result; TempQ = QNEXT(&CurrentSend->tsr_req.tr_q); CHECK_STRUCT(CurrentSend, tsr); Result = InterlockedDecrement(&(CurrentSend->tsr_refcnt)); ASSERT(Result >= 0); if (Result <= 0) { // // Reference count has gone to 0 which means the send has // been ACK'd or cancelled. Complete it now. // // If we've sent directly from this send, NULL out the next // pointer for the last buffer in the chain. // if (CurrentSend->tsr_lastbuf != NULL) { NDIS_BUFFER_LINKAGE(CurrentSend->tsr_lastbuf) = NULL; CurrentSend->tsr_lastbuf = NULL; } KeAcquireSpinLock(&RequestCompleteLock, &OldIrql); ENQUEUE(&SendCompleteQ, &CurrentSend->tsr_req.tr_q); RequestCompleteFlags |= SEND_REQUEST_COMPLETE; KeReleaseSpinLock(&RequestCompleteLock, OldIrql); } CurrentSend = CONTAINING_RECORD(QSTRUCT(TCPReq, TempQ, tr_q), TCPSendReq, tsr_req); } } // // Free the TCP header buffer and our packet structure proper. // NdisFreeBuffer(BufferChain); ExFreePool(Memory); NdisFreePacket(Packet); // // If there are any TCP send requests to complete, do so now. // if (RequestCompleteFlags & SEND_REQUEST_COMPLETE) TCPRcvComplete(); } //* RcvWin - Figure out the receive window to offer in an ack. // // A routine to figure out what window to offer on a connection. We // take into account SWS avoidance, what the default connection window is, // and what the last window we offered is. // uint // Returns: Window to be offered. RcvWin( TCB *WinTCB) // TCB on which to perform calculations. { int CouldOffer; // The window size we could offer. 
CHECK_STRUCT(WinTCB, tcb); CheckPacketList(WinTCB->tcb_pendhead, WinTCB->tcb_pendingcnt); ASSERT(WinTCB->tcb_rcvwin >= 0); CouldOffer = WinTCB->tcb_defaultwin - WinTCB->tcb_pendingcnt; ASSERT(CouldOffer >= 0); ASSERT(CouldOffer >= WinTCB->tcb_rcvwin); if ((CouldOffer - WinTCB->tcb_rcvwin) >= (int) MIN(WinTCB->tcb_defaultwin/2, WinTCB->tcb_mss)) { WinTCB->tcb_rcvwin = CouldOffer; } return WinTCB->tcb_rcvwin; } //* ValidateSourceAndRoute - Validate the NTE and RCE. // // Checks that the NTE and RCE referenced by this TCB are still ok to use. // BOOLEAN ValidateSourceAndRoute( TCB *Tcb) // TCB being validated. { KIRQL Irql0; // // Update our copy of the validation counter. // We need to do this before making the validation checks below // (to avoid missing any additional changes while we're in here). // Tcb->tcb_routing = RouteCacheValidationCounter; // // Check that our NTE hasn't gone away. // KeAcquireSpinLock(&Tcb->tcb_nte->IF->Lock, &Irql0); if (!IsValidNTE(Tcb->tcb_nte)) { // // Can't use this one anymore. // KeReleaseSpinLock(&Tcb->tcb_nte->IF->Lock, Irql0); ReleaseNTE(Tcb->tcb_nte); // // See if this address lives on as a different NTE. // Tcb->tcb_nte = FindNetworkWithAddress(&Tcb->tcb_saddr, Tcb->tcb_sscope_id); if (Tcb->tcb_nte == NULL) { // // The address is gone. // return FALSE; } } else { KeReleaseSpinLock(&Tcb->tcb_nte->IF->Lock, Irql0); } // // Also check that the RCE is still around. // Tcb->tcb_rce = ValidateRCE(Tcb->tcb_rce, Tcb->tcb_nte); return TRUE; } //* SendSYN - Send a SYN segment. // // This is called during connection establishment time to send a SYN // segment to the peer. We get a buffer if we can, and then fill // it in. There's a tricky part here where we have to build the MSS // option in the header - we find the MSS by finding the MSS offered // by the net for the local address. After that, we send it. // void // Returns: Nothing. SendSYN( TCB *SYNTcb, // TCB from which SYN is to be sent. 
KIRQL PreLockIrql) // IRQL prior to acquiring TCB lock. { PNDIS_PACKET Packet; void *Memory; IPv6Header UNALIGNED *IP; TCPHeader UNALIGNED *TCP; uchar *OptPtr; NDIS_STATUS NdisStatus; uint Offset; uint Length; uint PayloadLength; ushort TempWin; ushort MSS; RouteCacheEntry *RCE; CHECK_STRUCT(SYNTcb, tcb); // // Go ahead and set the retransmission timer now, in case we can't get a // packet or a buffer. In the future we might want to queue the // connection for when we get resources. // START_TCB_TIMER(SYNTcb->tcb_rexmittimer, SYNTcb->tcb_rexmit); // // In most cases, we will already have a route at this point. // However, if we failed to get one earlier in the passive receive // path, we may need to retry here. // if (SYNTcb->tcb_rce == NULL) { InitRCE(SYNTcb); if (SYNTcb->tcb_rce == NULL) { goto ErrorReturn; } } // // Validate that the address we're sourcing from and the route we're // sending upon are still okay to use. // if (SYNTcb->tcb_routing != RouteCacheValidationCounter) { if (!ValidateSourceAndRoute(SYNTcb)) { // // Even though we're about to close this TCB, // we should leave it in a consistent state. // SYNTcb->tcb_sendnext++; if (SEQ_GT(SYNTcb->tcb_sendnext, SYNTcb->tcb_sendmax)) { SYNTcb->tcb_sendmax = SYNTcb->tcb_sendnext; } TryToCloseTCB(SYNTcb, TCB_CLOSE_ABORTED, PreLockIrql); return; } } // // Allocate a packet header/buffer/data region for this SYN. // // Our buffer has space at the beginning which will be filled in // later by the link level. At this level we add the IPv6Header, // TCPHeader, and TCP Maximum Segment Size option which follow. // // REVIEW: This grabs packets and buffers from the IPv6PacketPool and // REVIEW: the IPv6BufferPool respectively. Have seperate pools for TCP? // Offset = SYNTcb->tcb_rce->NCE->IF->LinkHeaderSize; Length = Offset + sizeof(*IP) + sizeof(*TCP) + MSS_OPT_SIZE; NdisStatus = IPv6AllocatePacket(Length, &Packet, &Memory); if (NdisStatus != NDIS_STATUS_SUCCESS) { // // Upon failure, advance tcb_sendnext anyway. 
// We need to do this because TCBTimeout will *retreat* tcb_sendnext // if this SYN is later retransmitted, and if that retreat occurs // without this advance, we end up with a hole in the sequence-space. // KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR, "TCP SendSYN: Couldn't allocate IPv6 packet header!?!\n")); ErrorReturn: SYNTcb->tcb_sendnext++; if (SEQ_GT(SYNTcb->tcb_sendnext, SYNTcb->tcb_sendmax)) { SYNTcb->tcb_sendmax = SYNTcb->tcb_sendnext; } KeReleaseSpinLock(&SYNTcb->tcb_lock, PreLockIrql); return; } PC(Packet)->CompletionHandler = TCPSendComplete; PC(Packet)->CompletionData = NULL; // // Since this is a SYN-only packet (maybe someday we'll send data with // the SYN?) we only have the one buffer and nothing to link on after. // // // We now have all the resources we need to send. // Prepare the actual packet. // // // Our header buffer has extra space for other headers to be // prepended to ours without requiring further allocation calls. // Put the actual TCP/IP header at the end of the buffer. // IP = (IPv6Header UNALIGNED *)((uchar *)Memory + Offset); IP->VersClassFlow = IP_VERSION; IP->NextHeader = IP_PROTOCOL_TCP; IP->HopLimit = TCPHopLimit(SYNTcb); IP->Source = SYNTcb->tcb_saddr; IP->Dest = SYNTcb->tcb_daddr; TCP = (TCPHeader UNALIGNED *)(IP + 1); TCP->tcp_src = SYNTcb->tcb_sport; TCP->tcp_dest = SYNTcb->tcb_dport; TCP->tcp_seq = net_long(SYNTcb->tcb_sendnext); // // The SYN flag takes up one element in sequence number space. // Record that we've sent it here (if we need to retransmit the SYN // segment, TCBTimeout will reset sendnext before calling us again). // SYNTcb->tcb_sendnext++; if (SEQ_GT(SYNTcb->tcb_sendnext, SYNTcb->tcb_sendmax)) { TStats.ts_outsegs++; SYNTcb->tcb_sendmax = SYNTcb->tcb_sendnext; } else TStats.ts_retranssegs++; TCP->tcp_ack = net_long(SYNTcb->tcb_rcvnext); // // REVIEW: TCP flags are entirely based upon our state, so this could // REVIEW: be replaced by a (quicker) array lookup. 
// if (SYNTcb->tcb_state == TCB_SYN_RCVD) TCP->tcp_flags = MAKE_TCP_FLAGS(6, TCP_FLAG_SYN | TCP_FLAG_ACK); else TCP->tcp_flags = MAKE_TCP_FLAGS(6, TCP_FLAG_SYN); TempWin = (ushort)SYNTcb->tcb_rcvwin; TCP->tcp_window = net_short(TempWin); TCP->tcp_urgent = 0; TCP->tcp_xsum = 0; OptPtr = (uchar *)(TCP + 1); // // Compose the Maximum Segment Size option. // // TBD: If we add IPv6 Jumbogram support, we should also add LFN // TBD: support to TCP and change this to handle a larger MSS. // MSS = SYNTcb->tcb_rce->NTE->IF->LinkMTU - sizeof(IPv6Header) - sizeof(TCPHeader); IF_TCPDBG(TCP_DEBUG_MSS) { KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_INFO_TCPDBG, "SendSYN: Sending MSS option value of %d\n", MSS)); } *OptPtr++ = TCP_OPT_MSS; *OptPtr++ = MSS_OPT_SIZE; *(ushort UNALIGNED *)OptPtr = net_short(MSS); PayloadLength = sizeof(TCPHeader) + MSS_OPT_SIZE; // // Compute the TCP checksum. It covers the entire TCP segment // starting with the TCP header, plus the IPv6 pseudo-header. // // REVIEW: The IPv4 implementation kept the IPv4 psuedo-header around // REVIEW: in the TCB rather than recalculate it every time. Do this? // TCP->tcp_xsum = 0; TCP->tcp_xsum = ChecksumPacket( Packet, Offset + sizeof *IP, NULL, PayloadLength, AlignAddr(&IP->Source), AlignAddr(&IP->Dest), IP_PROTOCOL_TCP); ASSERT(TCP->tcp_xsum != 0); // // Capture and reference the RCE while we still hold the TCB lock. // The TCB's reference on this particular RCE might go away at any point // after we release the lock (or because we drop it ourselves below). // RCE = SYNTcb->tcb_rce; AddRefRCE(RCE); // // If connection-acceptance has been delayed, release the TCB's RCE. // This prevents TCBs in pre-established states from consuming // an unbounded number of RCEs. // if (SYNTcb->tcb_flags & ACCEPT_PENDING) { SYNTcb->tcb_rce = NULL; ReleaseRCE(RCE); } // // Everything's ready. Now send the packet. // // Note that IPv6Send does not return a status code. 
// Instead it *always* completes the packet // with an appropriate status code. // KeReleaseSpinLock(&SYNTcb->tcb_lock, PreLockIrql); IPv6Send(Packet, Offset, IP, PayloadLength, RCE, 0, IP_PROTOCOL_TCP, net_short(TCP->tcp_src), net_short(TCP->tcp_dest)); // // Release the extra reference we took on the RCE above. // ReleaseRCE(RCE); } //* SendKA - Send a keep alive segment. // // This is called when we want to send a keep-alive. The idea is to provoke // a response from our peer on an otherwise idle connection. We send a // garbage byte of data in our keep-alives in order to cooperate with broken // TCP implementations that don't respond to segments outside the window // unless they contain data. // void // Returns: Nothing. SendKA( TCB *KATcb, // TCB from which keep alive is to be sent. KIRQL PreLockIrql) // IRQL prior to acquiring lock on TCB. { PNDIS_PACKET Packet; void *Memory; IPv6Header UNALIGNED *IP; TCPHeader UNALIGNED *TCP; NDIS_STATUS NdisStatus; int Offset; uint Length; uint PayloadLength; ushort TempWin; SeqNum TempSeq; RouteCacheEntry *RCE; CHECK_STRUCT(KATcb, tcb); // // In most cases, we will already have a route at this point. // However, if we failed to get one earlier in the passive receive // path, we may need to retry here. // if (KATcb->tcb_rce == NULL) { InitRCE(KATcb); if (KATcb->tcb_rce == NULL) { KeReleaseSpinLock(&KATcb->tcb_lock, PreLockIrql); return; } } // // Validate that the address we're sourcing from and the route we're // sending upon are still okay to use. // if (KATcb->tcb_routing != RouteCacheValidationCounter) { if (!ValidateSourceAndRoute(KATcb)) { TryToCloseTCB(KATcb, TCB_CLOSE_ABORTED, PreLockIrql); return; } } // // Allocate a packet header/buffer/data region for this keepalive packet. // // Our buffer has space at the beginning which will be filled in // later by the link level. At this level we add the IPv6Header, // TCPHeader, and a single byte of data which follow. 
// // REVIEW: This grabs packets and buffers from the IPv6PacketPool and // REVIEW: the IPv6BufferPool respectively. Have seperate pools for TCP? // Offset = KATcb->tcb_rce->NCE->IF->LinkHeaderSize; Length = Offset + sizeof(*IP) + sizeof(*TCP) + 1; NdisStatus = IPv6AllocatePacket(Length, &Packet, &Memory); if (NdisStatus != NDIS_STATUS_SUCCESS) { // // REVIEW: What to do if this fails. // KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR, "TCP SendKA: Couldn't allocate IPv6 packet header!?!\n")); KeReleaseSpinLock(&KATcb->tcb_lock, PreLockIrql); return; } PC(Packet)->CompletionHandler = TCPSendComplete; PC(Packet)->CompletionData = NULL; // // Since this is a keepalive packet we only have the one buffer and // nothing to link on after. // // // Our header buffer has extra space for other headers to be // prepended to ours without requiring further allocation calls. // Put the actual TCP/IP header at the end of the buffer. // IP = (IPv6Header UNALIGNED *)((uchar *)Memory + Offset); IP->VersClassFlow = IP_VERSION; IP->NextHeader = IP_PROTOCOL_TCP; IP->HopLimit = TCPHopLimit(KATcb); IP->Source = KATcb->tcb_saddr; IP->Dest = KATcb->tcb_daddr; TCP = (TCPHeader UNALIGNED *)(IP + 1); TCP->tcp_src = KATcb->tcb_sport; TCP->tcp_dest = KATcb->tcb_dport; TempSeq = KATcb->tcb_senduna - 1; TCP->tcp_seq = net_long(TempSeq); TCP->tcp_ack = net_long(KATcb->tcb_rcvnext); TCP->tcp_flags = MAKE_TCP_FLAGS(5, TCP_FLAG_ACK); TempWin = (ushort)RcvWin(KATcb); TCP->tcp_window = net_short(TempWin); TCP->tcp_urgent = 0; // // Initialize the single byte that we're resending. // N.B. Adequate space for this byte was allocated above. // *(uchar *)(TCP + 1) = 0; TStats.ts_retranssegs++; PayloadLength = sizeof(TCPHeader) + 1; // // Compute the TCP checksum. It covers the entire TCP segment // starting with the TCP header, plus the IPv6 pseudo-header. 
// TCP->tcp_xsum = 0; TCP->tcp_xsum = ChecksumPacket( Packet, Offset + sizeof *IP, NULL, PayloadLength, AlignAddr(&IP->Source), AlignAddr(&IP->Dest), IP_PROTOCOL_TCP); ASSERT(TCP->tcp_xsum != 0); // // Capture and reference the RCE while we still hold the TCB lock. // The TCB's reference on this particular RCE might go away at any // point after we release the lock. // RCE = KATcb->tcb_rce; AddRefRCE(RCE); // // Everything's ready. Now send the packet. // // Note that IPv6Send does not return a status code. // Instead it *always* completes the packet // with an appropriate status code. // KATcb->tcb_kacount++; KeReleaseSpinLock(&KATcb->tcb_lock, PreLockIrql); IPv6Send(Packet, Offset, IP, PayloadLength, RCE, 0, IP_PROTOCOL_TCP, net_short(TCP->tcp_src), net_short(TCP->tcp_dest)); // // Release the extra reference we took on the RCE above. // ReleaseRCE(RCE); } //* SendACK - Send an ACK segment. // // This is called whenever we need to send an ACK for some reason. Nothing // fancy, we just do it. // void // Returns: Nothing. SendACK( TCB *ACKTcb) // TCB from which ACK is to be sent. { PNDIS_PACKET Packet; void *Memory; IPv6Header UNALIGNED *IP; TCPHeader UNALIGNED *TCP; NDIS_STATUS NdisStatus; KIRQL OldIrql; int Offset; uint Length; uint PayloadLength; SeqNum SendNext; ushort TempWin; RouteCacheEntry *RCE; CHECK_STRUCT(ACKTcb, tcb); KeAcquireSpinLock(&ACKTcb->tcb_lock, &OldIrql); // // In most cases, we will already have a route at this point. // However, if we failed to get one earlier in the passive receive // path, we may need to retry here. // if (ACKTcb->tcb_rce == NULL) { InitRCE(ACKTcb); if (ACKTcb->tcb_rce == NULL) { KeReleaseSpinLock(&ACKTcb->tcb_lock, OldIrql); return; } } // // Validate that the address we're sourcing from and the route we're // sending upon are still okay to use. 
// if (ACKTcb->tcb_routing != RouteCacheValidationCounter) { if (!ValidateSourceAndRoute(ACKTcb)) { TryToCloseTCB(ACKTcb, TCB_CLOSE_ABORTED, OldIrql); return; } } // // Allocate a packet header/buffer/data region for this ACK packet. // // Our buffer has space at the beginning which will be filled in // later by the link level. At this level we add the IPv6Header // and the TCPHeader. // // REVIEW: This grabs packets and buffers from the IPv6PacketPool and // REVIEW: the IPv6BufferPool respectively. Have seperate pools for TCP? // Offset = ACKTcb->tcb_rce->NCE->IF->LinkHeaderSize; Length = Offset + sizeof(*IP) + sizeof(*TCP); NdisStatus = IPv6AllocatePacket(Length, &Packet, &Memory); if (NdisStatus != NDIS_STATUS_SUCCESS) { KeReleaseSpinLock(&ACKTcb->tcb_lock, OldIrql); // // REVIEW: What to do if this fails. // KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR, "TCP SendACK: Couldn't allocate IPv6 packet header!?!\n")); return; } PC(Packet)->CompletionHandler = TCPSendComplete; PC(Packet)->CompletionData = NULL; // // Our header buffer has extra space for other headers to be // prepended to ours without requiring further allocation calls. // Put the actual TCP/IP header at the end of the buffer. // IP = (IPv6Header UNALIGNED *)((uchar *)Memory + Offset); IP->VersClassFlow = IP_VERSION; IP->NextHeader = IP_PROTOCOL_TCP; IP->HopLimit = TCPHopLimit(ACKTcb); IP->Source = ACKTcb->tcb_saddr; IP->Dest = ACKTcb->tcb_daddr; TCP = (TCPHeader UNALIGNED *)(IP + 1); TCP->tcp_src = ACKTcb->tcb_sport; TCP->tcp_dest = ACKTcb->tcb_dport; TCP->tcp_ack = net_long(ACKTcb->tcb_rcvnext); // // If the remote peer is advertising a window of zero, we need to send // this ack with a sequence number of his rcv_next (which in that case // should be our senduna). We have code here ifdef'd out that makes // sure that we don't send outside the RWE, but this doesn't work. We // need to be able to send a pure ACK exactly at the RWE. 
// if (ACKTcb->tcb_sendwin != 0) { SendNext = ACKTcb->tcb_sendnext; #if 0 SeqNum MaxValidSeq; MaxValidSeq = ACKTcb->tcb_senduna + ACKTcb->tcb_sendwin - 1; SendNext = (SEQ_LT(SendNext, MaxValidSeq) ? SendNext : MaxValidSeq); #endif } else SendNext = ACKTcb->tcb_senduna; if ((ACKTcb->tcb_flags & FIN_SENT) && SEQ_EQ(SendNext, ACKTcb->tcb_sendmax - 1)) { TCP->tcp_flags = MAKE_TCP_FLAGS(5, TCP_FLAG_FIN | TCP_FLAG_ACK); } else TCP->tcp_flags = MAKE_TCP_FLAGS(5, TCP_FLAG_ACK); TCP->tcp_seq = net_long(SendNext); TempWin = (ushort)RcvWin(ACKTcb); TCP->tcp_window = net_short(TempWin); TCP->tcp_urgent = 0; PayloadLength = sizeof(*TCP); // // Compute the TCP checksum. It covers the entire TCP segment // starting with the TCP header, plus the IPv6 pseudo-header. // TCP->tcp_xsum = 0; TCP->tcp_xsum = ChecksumPacket( Packet, Offset + sizeof *IP, NULL, PayloadLength, AlignAddr(&IP->Source), AlignAddr(&IP->Dest), IP_PROTOCOL_TCP); ASSERT(TCP->tcp_xsum != 0); STOP_TCB_TIMER(ACKTcb->tcb_delacktimer); ACKTcb->tcb_flags &= ~(NEED_ACK | ACK_DELAYED); TStats.ts_outsegs++; // // Capture and reference the RCE while we still hold the TCB lock. // The TCB's reference on this particular RCE might go away at any point // after we release the lock (or because we drop it ourselves below). // RCE = ACKTcb->tcb_rce; AddRefRCE(RCE); // // If connection-acceptance has been delayed, release the TCB's RCE. // This prevents TCBs in pre-established states from consuming // an unbounded number of RCEs. // if (ACKTcb->tcb_flags & ACCEPT_PENDING) { ACKTcb->tcb_rce = NULL; ReleaseRCE(RCE); } // // Everything's ready. Now send the packet. // // Note that IPv6Send does not return a status code. // Instead it *always* completes the packet // with an appropriate status code. // KeReleaseSpinLock(&ACKTcb->tcb_lock, OldIrql); IPv6Send(Packet, Offset, IP, PayloadLength, RCE, 0, IP_PROTOCOL_TCP, net_short(TCP->tcp_src), net_short(TCP->tcp_dest)); // // Release the extra reference we took on the RCE above. 
// ReleaseRCE(RCE); } //* SendRSTFromTCB - Send a RST from a TCB. // // This is called during close when we need to send a RST. // // Called only when TCB is going away, so we have exclusive access. // void // Returns: Nothing. SendRSTFromTCB( TCB *RSTTcb) // TCB from which RST is to be sent. { PNDIS_PACKET Packet; void *Memory; IPv6Header UNALIGNED *IP; TCPHeader UNALIGNED *TCP; NDIS_STATUS NdisStatus; int Offset; uint Length; uint PayloadLength; SeqNum RSTSeq; CHECK_STRUCT(RSTTcb, tcb); ASSERT(RSTTcb->tcb_state == TCB_CLOSED); // // In most cases, we will already have a route at this point. // However, if we failed to get one earlier in the passive receive // path, we may need to retry here. // if (RSTTcb->tcb_rce == NULL) { InitRCE(RSTTcb); if (RSTTcb->tcb_rce == NULL) { return; } } // // Validate that the address we're sourcing from and the route we're // sending upon are still okay to use. // if (RSTTcb->tcb_routing != RouteCacheValidationCounter) { if (!ValidateSourceAndRoute(RSTTcb)) { return; } } // // Allocate a packet header/buffer/data region for this RST packet. // // Our buffer has space at the beginning which will be filled in // later by the link level. At this level we add the IPv6Header // and the TCPHeader. // // REVIEW: This grabs packets and buffers from the IPv6PacketPool and // REVIEW: the IPv6BufferPool respectively. Have seperate pools for TCP? // Offset = RSTTcb->tcb_rce->NCE->IF->LinkHeaderSize; Length = Offset + sizeof(*IP) + sizeof(*TCP); NdisStatus = IPv6AllocatePacket(Length, &Packet, &Memory); if (NdisStatus != NDIS_STATUS_SUCCESS) { // // REVIEW: What to do if this fails. // KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR, "TCP SendRSTFromTCB: " "Couldn't alloc IPv6 packet header!\n")); return; } PC(Packet)->CompletionHandler = TCPSendComplete; PC(Packet)->CompletionData = NULL; // // Since this is an RST-only packet we only have the one buffer and // nothing to link on after. 
// // // Our header buffer has extra space for other headers to be // prepended to ours without requiring further allocation calls. // Put the actual TCP/IP header at the end of the buffer. // IP = (IPv6Header UNALIGNED *)((uchar *)Memory + Offset); IP->VersClassFlow = IP_VERSION; IP->NextHeader = IP_PROTOCOL_TCP; IP->HopLimit = TCPHopLimit(RSTTcb); IP->Source = RSTTcb->tcb_saddr; IP->Dest = RSTTcb->tcb_daddr; TCP = (TCPHeader UNALIGNED *)(IP + 1); TCP->tcp_src = RSTTcb->tcb_sport; TCP->tcp_dest = RSTTcb->tcb_dport; // // If the remote peer has a window of 0, send with a seq. # equal // to senduna so he'll accept it. Otherwise send with send max. // if (RSTTcb->tcb_sendwin != 0) RSTSeq = RSTTcb->tcb_sendmax; else RSTSeq = RSTTcb->tcb_senduna; TCP->tcp_seq = net_long(RSTSeq); TCP->tcp_ack = net_long(RSTTcb->tcb_rcvnext); TCP->tcp_flags = MAKE_TCP_FLAGS(5, TCP_FLAG_RST | TCP_FLAG_ACK); TCP->tcp_window = 0; TCP->tcp_urgent = 0; PayloadLength = sizeof(*TCP); // // Compute the TCP checksum. It covers the entire TCP segment // starting with the TCP header, plus the IPv6 pseudo-header. // TCP->tcp_xsum = 0; TCP->tcp_xsum = ChecksumPacket( Packet, Offset + sizeof *IP, NULL, PayloadLength, AlignAddr(&IP->Source), AlignAddr(&IP->Dest), IP_PROTOCOL_TCP); ASSERT(TCP->tcp_xsum != 0); TStats.ts_outsegs++; TStats.ts_outrsts++; // // Everything's ready. Now send the packet. // // Note that IPv6Send does not return a status code. // Instead it *always* completes the packet // with an appropriate status code. // IPv6Send(Packet, Offset, IP, PayloadLength, RSTTcb->tcb_rce, 0, IP_PROTOCOL_TCP, net_short(TCP->tcp_src), net_short(TCP->tcp_dest)); } //* SendRSTFromHeader - Send a RST back, based on a header. // // Called when we need to send a RST, but don't necessarily have a TCB. // void // Returns: Nothing. SendRSTFromHeader( TCPHeader UNALIGNED *RecvTCP, // TCP header to be RST. uint Length, // Length of the incoming segment. IPv6Addr *Dest, // Destination IP address for RST. 
uint DestScopeId, // Scope id for destination address. IPv6Addr *Src, // Source IP address for RST. uint SrcScopeId) // Scope id for source address. { PNDIS_PACKET Packet; void *Memory; IPv6Header UNALIGNED *IP; TCPHeader UNALIGNED *SendTCP; NetTableEntry *NTE; RouteCacheEntry *RCE; IP_STATUS Status; NDIS_STATUS NdisStatus; uint Offset; uint SendLength; uint PayloadLength; // // Never send a RST in response to a RST. // if (RecvTCP->tcp_flags & TCP_FLAG_RST) return; // // Determine NTE to send on based on incoming packet's destination. // REVIEW: Alternatively, we could/should just pass the NTE in. // NTE = FindNetworkWithAddress(Src, SrcScopeId); if (NTE == NULL) { // // This should only happen if the NTE became invalid // between accepting the packet and getting here. It // cannot completely go away since the packet's Packet // structure holds a reference to it. // return; } // // Get the route to the destination (incoming packet's source). // Status = RouteToDestination(Dest, DestScopeId, CastFromNTE(NTE), RTD_FLAG_NORMAL, &RCE); if (Status != IP_SUCCESS) { // // Failed to get a route to the destination. Error out. // KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_INTERNAL_ERROR, "TCP SendRSTFromHeader: Can't get a route?!?\n")); ReleaseNTE(NTE); return; } // // Allocate a packet header/buffer/data region for this RST packet. // // Our buffer has space at the beginning which will be filled in // later by the link level. At this level we add the IPv6Header // and the TCPHeader. // // REVIEW: This grabs packets and buffers from the IPv6PacketPool and // REVIEW: the IPv6BufferPool respectively. Have seperate pools for TCP? // Offset = RCE->NCE->IF->LinkHeaderSize; SendLength = Offset + sizeof(*IP) + sizeof(*SendTCP); NdisStatus = IPv6AllocatePacket(SendLength, &Packet, &Memory); if (NdisStatus != NDIS_STATUS_SUCCESS) { // // Failed to allocate a packet header/buffer/data region. Error out. 
// KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR, "TCP SendRSTFromHeader: Couldn't alloc IPv6 pkt header!\n")); ReleaseRCE(RCE); ReleaseNTE(NTE); return; } PC(Packet)->CompletionHandler = TCPSendComplete; PC(Packet)->CompletionData = NULL; // // We now have all the resources we need to send. Since this is a // RST-only packet we only have the one header buffer and nothing // to link on after. // // // Our header buffer has extra space for other headers to be // prepended to ours without requiring further allocation calls. // Put the actual TCP/IP header at the end of the buffer. // IP = (IPv6Header UNALIGNED *)((uchar *)Memory + Offset); IP->VersClassFlow = IP_VERSION; IP->NextHeader = IP_PROTOCOL_TCP; IP->HopLimit = (uchar)RCE->NCE->IF->CurHopLimit; IP->Source = *Src; IP->Dest = *Dest; // // Fill in the header so as to make it believable to our peer, and send it. // SendTCP = (TCPHeader UNALIGNED *)(IP + 1); if (RecvTCP->tcp_flags & TCP_FLAG_SYN) Length++; if (RecvTCP->tcp_flags & TCP_FLAG_FIN) Length++; if (RecvTCP->tcp_flags & TCP_FLAG_ACK) { SendTCP->tcp_seq = RecvTCP->tcp_ack; SendTCP->tcp_ack = 0; SendTCP->tcp_flags = MAKE_TCP_FLAGS(sizeof(TCPHeader)/sizeof(ulong), TCP_FLAG_RST); } else { SeqNum TempSeq; SendTCP->tcp_seq = 0; TempSeq = net_long(RecvTCP->tcp_seq); TempSeq += Length; SendTCP->tcp_ack = net_long(TempSeq); SendTCP->tcp_flags = MAKE_TCP_FLAGS(sizeof(TCPHeader)/sizeof(ulong), TCP_FLAG_RST | TCP_FLAG_ACK); } SendTCP->tcp_window = 0; SendTCP->tcp_urgent = 0; SendTCP->tcp_dest = RecvTCP->tcp_src; SendTCP->tcp_src = RecvTCP->tcp_dest; PayloadLength = sizeof(*SendTCP); // // Compute the TCP checksum. It covers the entire TCP segment // starting with the TCP header, plus the IPv6 pseudo-header. 
// SendTCP->tcp_xsum = 0; SendTCP->tcp_xsum = ChecksumPacket( Packet, Offset + sizeof *IP, NULL, PayloadLength, AlignAddr(&IP->Source), AlignAddr(&IP->Dest), IP_PROTOCOL_TCP); ASSERT(SendTCP->tcp_xsum != 0); TStats.ts_outsegs++; TStats.ts_outrsts++; // // Everything's ready. Now send the packet. // // Note that IPv6Send does not return a status code. // Instead it *always* completes the packet // with an appropriate status code. // IPv6Send(Packet, Offset, IP, PayloadLength, RCE, 0, IP_PROTOCOL_TCP, net_short(SendTCP->tcp_src), net_short(SendTCP->tcp_dest)); // // Release the Route and the NTE. // ReleaseRCE(RCE); ReleaseNTE(NTE); } // end of SendRSTFromHeader() //* GoToEstab - Transition to the established state. // // Called when we are going to the established state and need to finish up // initializing things that couldn't be done until now. We assume the TCB // lock is held by the caller on the TCB we're called with. // void // Returns: Nothing. GoToEstab( TCB *EstabTCB) // TCB to transition. { // // Initialize our slow start and congestion control variables. // EstabTCB->tcb_cwin = 2 * EstabTCB->tcb_mss; EstabTCB->tcb_ssthresh = 0xffffffff; EstabTCB->tcb_state = TCB_ESTAB; // // We're in established. We'll subtract one from slow count for this fact, // and if the slowcount goes to 0 we'll move onto the fast path. // if (--(EstabTCB->tcb_slowcount) == 0) EstabTCB->tcb_fastchk &= ~TCP_FLAG_SLOW; InterlockedIncrement((PLONG)&TStats.ts_currestab); EstabTCB->tcb_flags &= ~ACTIVE_OPEN; // Turn off the active opening flag. } //* InitSendState - Initialize the send state of a connection. // // Called during connection establishment to initialize our send state. // (In this case, this refers to all information we'll put on the wire as // well as pure send state). We pick an ISS, set up a rexmit timer value, // etc. We assume the tcb_lock is held on the TCB when we are called. // void // Returns: Nothing. InitSendState( TCB *NewTCB) // TCB to be set up. 
{ uint InitialRTT; CHECK_STRUCT(NewTCB, tcb); if (NewTCB->tcb_flags & ACTIVE_OPEN) { GetRandomISN(&NewTCB->tcb_sendnext, (uchar*)&NewTCB->tcb_md5data); } NewTCB->tcb_senduna = NewTCB->tcb_sendnext; NewTCB->tcb_sendmax = NewTCB->tcb_sendnext; NewTCB->tcb_error = IP_SUCCESS; // // Initialize retransmit and delayed ack stuff. // NewTCB->tcb_rexmitcnt = 0; NewTCB->tcb_rtt = 0; NewTCB->tcb_smrtt = 0; // // Check for interface specific initial RTT. // This can be as low as 3ms. // if ((NewTCB->tcb_rce != NULL) && ((InitialRTT = GetInitialRTTFromRCE(NewTCB->tcb_rce)) > MIN_INITIAL_RTT)) { NewTCB->tcb_delta = MS_TO_TICKS(InitialRTT * 2); NewTCB->tcb_rexmit = MS_TO_TICKS(InitialRTT); } else { NewTCB->tcb_delta = MS_TO_TICKS(6000); NewTCB->tcb_rexmit = MS_TO_TICKS(3000); } STOP_TCB_TIMER(NewTCB->tcb_rexmittimer); STOP_TCB_TIMER(NewTCB->tcb_delacktimer); } //* FillTCPHeader - Fill the TCP header in. // // A utility routine to fill in the TCP header. // void // Returns: Nothing. FillTCPHeader( TCB *SendTCB, // TCB to fill from. TCPHeader UNALIGNED *Header) // Header to fill into. { ushort S; ulong L; Header->tcp_src = SendTCB->tcb_sport; Header->tcp_dest = SendTCB->tcb_dport; L = SendTCB->tcb_sendnext; Header->tcp_seq = net_long(L); L = SendTCB->tcb_rcvnext; Header->tcp_ack = net_long(L); Header->tcp_flags = 0x1050; *(ulong UNALIGNED *)&Header->tcp_xsum = 0; S = (ushort)RcvWin(SendTCB); Header->tcp_window = net_short(S); Header->tcp_urgent = 0; } //* TCPSend - Send data from a TCP connection. // // This is the main 'send data' routine. We go into a loop, trying // to send data until we can't for some reason. First we compute // the useable window, use it to figure the amount we could send. If // the amount we could send meets certain criteria we'll build a frame // and send it, after setting any appropriate control bits. We assume // the caller has put a reference on the TCB. // void // Returns: Nothing. TCPSend( TCB *SendTCB, // TCB to be sent from. 
KIRQL PreLockIrql)                  // IRQL prior to acquiring TCB lock.
{
    int SendWin;                    // Useable send window.
    uint AmountToSend;              // Amount to send this time.
    uint AmountLeft;                // Part of AmountToSend not yet built.
    IPv6Header UNALIGNED *IP;
    TCPHeader UNALIGNED *TCP;
    PNDIS_PACKET Packet;
    PNDIS_BUFFER FirstBuffer, CurrentBuffer;
    void *Memory;
    TCPSendReq *CurSend;
    SendCmpltContext *SCC;
    SeqNum OldSeq;
    NDIS_STATUS NdisStatus;
    uint AmtOutstanding, AmtUnsent;
    int ForceWin;                   // Window we're forced to use.
    uint HeaderLength;
    uint LinkOffset;
    uint PMTU;
    RouteCacheEntry *RCE;

    CHECK_STRUCT(SendTCB, tcb);
    ASSERT(SendTCB->tcb_refcnt != 0);

    ASSERT(*(int *)&SendTCB->tcb_sendwin >= 0);
    ASSERT(*(int *)&SendTCB->tcb_cwin >= SendTCB->tcb_mss);

    ASSERT(!(SendTCB->tcb_flags & FIN_OUTSTANDING) ||
           (SendTCB->tcb_sendnext == SendTCB->tcb_sendmax));

    //
    // See if we should even be here.  If another instance of ourselves is
    // already in this code, or is about to enter it after completing a
    // receive, then just skip on out.
    //
    if ((SendTCB->tcb_flags & IN_TCP_SEND) ||
        (SendTCB->tcb_fastchk & TCP_FLAG_IN_RCV)) {
        SendTCB->tcb_flags |= SEND_AFTER_RCV;
        goto bail;
    }

    SendTCB->tcb_flags |= IN_TCP_SEND;

    //
    // In most cases, we will already have a route at this point.
    // However, if we failed to get one earlier in the passive receive
    // path, we may need to retry here.
    //
    if (SendTCB->tcb_rce == NULL) {
        InitRCE(SendTCB);
        if (SendTCB->tcb_rce == NULL) {
            SendTCB->tcb_flags &= ~IN_TCP_SEND;
            goto bail;
        }
    }

    //
    // Validate that the address we're sourcing from and the route we're
    // sending upon are still okay to use.
    //
    // We fail existing send requests for TCBs with a disconnected
    // outgoing interface, except when a loopback route is used.
    //
    if (SendTCB->tcb_routing != RouteCacheValidationCounter) {
        if (!ValidateSourceAndRoute(SendTCB) ||
            IsDisconnectedAndNotLoopbackRCE(SendTCB->tcb_rce)) {
            SendTCB->tcb_flags &= ~IN_TCP_SEND;
            ASSERT(SendTCB->tcb_refcnt != 0);
            //
            // The TCB lock is re-acquired right after TryToCloseTCB, so
            // that call evidently returns with the lock released.
            //
            TryToCloseTCB(SendTCB, TCB_CLOSE_ABORTED, PreLockIrql);
            KeAcquireSpinLock(&SendTCB->tcb_lock, &PreLockIrql);
            goto bail;
        }
    }

    //
    // Verify that our cached Path MTU is still valid.
    // Watch for changes to IPsec policies since they can also affect our MSS.
    // REVIEW: This the best spot to do this?
    //
    PMTU = GetEffectivePathMTUFromRCE(SendTCB->tcb_rce);
    if (PMTU != SendTCB->tcb_pmtu ||
        SecurityStateValidationCounter != SendTCB->tcb_security) {
        //
        // Either our Path MTU or the global security state has changed.
        // Cache current values and then calculate a new MSS.
        //
        SendTCB->tcb_pmtu = PMTU;
        SendTCB->tcb_security = SecurityStateValidationCounter;
        CalculateMSSForTCB(SendTCB);
    }

    //
    // We'll continue this loop until we send a FIN, or we break out
    // internally for some other reason.
    //
    while (!(SendTCB->tcb_flags & FIN_OUTSTANDING)) {

        CheckTCBSends(SendTCB);

        //
        // AmtOutstanding: sent but not yet acknowledged.
        // AmtUnsent: queued (tcb_unacked) but not yet sent.
        //
        AmtOutstanding = (uint)(SendTCB->tcb_sendnext -
                                SendTCB->tcb_senduna);
        AmtUnsent = SendTCB->tcb_unacked - AmtOutstanding;

        ASSERT(*(int *)&AmtUnsent >= 0);

        //
        // The useable window is the smaller of the peer's advertised
        // window and our congestion window, less what is already in
        // flight.  It may come out negative.
        //
        SendWin = (int)(MIN(SendTCB->tcb_sendwin, SendTCB->tcb_cwin) -
                        AmtOutstanding);

        //
        // If this send is after a fast recovery and sendwin is zero because
        // of amount outstanding, then at least force 1 segment to prevent
        // delayed ack timeouts from peer.
        //
        if (SendTCB->tcb_force) {
            SendTCB->tcb_force = 0;
            if (SendWin < SendTCB->tcb_mss) {
                SendWin = SendTCB->tcb_mss;
            }
        }

        //
        // Since the window could have shrunk, need to get it to zero at
        // least.
        //
        ForceWin = (int)((SendTCB->tcb_flags & FORCE_OUTPUT) >>
                         FORCE_OUT_SHIFT);
        SendWin = MAX(SendWin, ForceWin);

        AmountToSend = MIN(MIN((uint)SendWin, AmtUnsent), SendTCB->tcb_mss);

        ASSERT(SendTCB->tcb_mss > 0);

        //
        // See if we have enough to send.  We'll send if we have at least a
        // segment, or if we really have some data to send and we can send
        // all that we have, or the send window is > 0 and we need to force
        // output or send a FIN (note that if we need to force output
        // SendWin will be at least 1 from the check above), or if we can
        // send an amount == to at least half the maximum send window
        // we've seen.
        //
        if (AmountToSend == SendTCB->tcb_mss ||
            (AmountToSend != 0 && AmountToSend == AmtUnsent) ||
            (SendWin != 0 &&
             (((SendTCB->tcb_flags & FIN_NEEDED) &&
               AmtUnsent <= SendTCB->tcb_mss) ||
              (SendTCB->tcb_flags & FORCE_OUTPUT) ||
              AmountToSend >= (SendTCB->tcb_maxwin / 2)))) {

            //
            // It's OK to send something.  Allocate a packet header.
            //
            // REVIEW: It was easier to code all these allocations directly
            // REVIEW: rather than use IPv6AllocatePacket.
            //
            // REVIEW: This grabs packets and buffers from the IPv6PacketPool
            // REVIEW: and the IPv6BufferPool respectively.  Should we instead
            // REVIEW: have separate pools for TCP?
            //
            NdisAllocatePacket(&NdisStatus, &Packet, IPv6PacketPool);
            if (NdisStatus != NDIS_STATUS_SUCCESS) {
                KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
                           "TCPSend: couldn't allocate packet header!?!\n"));
                goto error_oor;
            }

            // We'll fill in the CompletionData below.
            InitializeNdisPacket(Packet);
            PC(Packet)->CompletionHandler = TCPSendComplete;

            //
            // Our header buffer has extra space at the beginning for other
            // headers to be prepended to ours without requiring further
            // allocation calls.  It also has extra space at the end to hold
            // the send completion data.
            //
            // The total is rounded up to a multiple of the alignment of
            // SendCmpltContext.
            //
            LinkOffset = SendTCB->tcb_rce->NCE->IF->LinkHeaderSize;
            HeaderLength =
                (LinkOffset + sizeof(*IP) + sizeof(*TCP) +
                 sizeof(SendCmpltContext) +
                 __builtin_alignof(SendCmpltContext) - 1) &~
                (UINT_PTR)(__builtin_alignof(SendCmpltContext) - 1);
            Memory = ExAllocatePool(NonPagedPool, HeaderLength);
            if (Memory == NULL) {
                KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
                           "TCPSend: couldn't allocate header memory!?!\n"));
                NdisFreePacket(Packet);
                goto error_oor;
            }

            //
            // When allocating the NDIS buffer describing this memory region,
            // we don't tell it about the extra space on the end that we
            // allocated for the send completion data.
            //
            NdisAllocateBuffer(&NdisStatus, &FirstBuffer, IPv6BufferPool,
                               Memory,
                               LinkOffset + sizeof(*IP) + sizeof(*TCP));
            if (NdisStatus != NDIS_STATUS_SUCCESS) {
                KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
                           "TCPSend: couldn't allocate buffer!?!\n"));
                ExFreePool(Memory);
                NdisFreePacket(Packet);
                goto error_oor;
            }

            //
            // Skip over the extra space that will be filled in later by the
            // link level.  At this level we add the IPv6Header, the
            // TCPHeader, and the data.
            //
            IP = (IPv6Header UNALIGNED *)((uchar *)Memory + LinkOffset);
            IP->VersClassFlow = IP_VERSION;
            IP->NextHeader = IP_PROTOCOL_TCP;
            IP->HopLimit = TCPHopLimit(SendTCB);
            IP->Source = SendTCB->tcb_saddr;
            IP->Dest = SendTCB->tcb_daddr;

            //
            // Begin preparing the TCP header.
            //
            TCP = (TCPHeader UNALIGNED *)(IP + 1);
            FillTCPHeader(SendTCB, TCP);

            //
            // Store the send completion data in the same buffer as the TCP
            // header, right after the TCP header.  This saves allocation
            // overhead and works because we don't consider this area to be
            // part of the packet data (we set this buffer's length to
            // indicate that the data ends with the TCP header above).
            //
            // Note that this code relies on the fact that we don't include
            // any TCP options (and thus don't have a variable length TCP
            // header) in our data packets.
            //
            SCC = (SendCmpltContext *)((uchar *)Memory + HeaderLength -
                                       sizeof(*SCC));
            PC(Packet)->CompletionData = SCC;
#if DBG
            SCC->scc_sig = scc_signature;
#endif
            SCC->scc_ubufcount = 0;
            SCC->scc_tbufcount = 0;
            SCC->scc_count = 0;

            AmountLeft = AmountToSend;

            if (AmountToSend != 0) {
                long Result;

                //
                // Loop through the sends on the TCB, building a frame.
                //
                CurrentBuffer = FirstBuffer;
                CurSend = SendTCB->tcb_cursend;
                CHECK_STRUCT(CurSend, tsr);
                SCC->scc_firstsend = CurSend;

                do {
                    //
                    // Each send request touched by this frame gets one more
                    // reference, counted in scc_count.
                    //
                    ASSERT(CurSend->tsr_refcnt > 0);
                    Result = InterlockedIncrement(&(CurSend->tsr_refcnt));

                    ASSERT(Result > 0);
                    SCC->scc_count++;

                    //
                    // If the current send offset is 0 and the current
                    // send is less than or equal to what we have left
                    // to send, we haven't already put a transport
                    // buffer on this send, and nobody else is using
                    // the buffer chain directly, just use the input
                    // buffers.  We check for other people using them
                    // by looking at tsr_lastbuf.  If it's NULL,
                    // nobody else is using the buffers.  If it's not
                    // NULL, somebody is.
                    //
                    if (SendTCB->tcb_sendofs == 0 &&
                        (SendTCB->tcb_sendsize <= AmountLeft) &&
                        (SCC->scc_tbufcount == 0) &&
                        CurSend->tsr_lastbuf == NULL) {

                        PNDIS_BUFFER LastBuf = SendTCB->tcb_sendbuf;
                        uint UBufLength = NdisBufferLength(LastBuf);
                        ushort UBufCount = 1;

                        //
                        // Walk to the end of the user's buffer chain,
                        // totalling its length and buffer count.
                        //
                        while (NDIS_BUFFER_LINKAGE(LastBuf) != NULL) {
                            LastBuf = NDIS_BUFFER_LINKAGE(LastBuf);
                            UBufLength += NdisBufferLength(LastBuf);
                            UBufCount++;
                        }

                        //
                        // Only chain the user's buffers in directly when
                        // the chain describes exactly the remaining send.
                        //
                        if (SendTCB->tcb_sendsize == UBufLength) {
                            SCC->scc_ubufcount += UBufCount;
                            NDIS_BUFFER_LINKAGE(CurrentBuffer) =
                                SendTCB->tcb_sendbuf;
                            CurSend->tsr_lastbuf = CurrentBuffer = LastBuf;
                            AmountLeft -= SendTCB->tcb_sendsize;
                            SendTCB->tcb_sendsize = 0;
                        } else {
                            //
                            // Fall through with a non-zero tcb_sendsize.
                            //
                            ASSERT(SendTCB->tcb_sendsize != 0);
                        }
                    }

                    if (SendTCB->tcb_sendsize != 0) {
                        uint AmountToDup;
                        PNDIS_BUFFER NewBuf, Buf;
                        uint Offset;
                        NDIS_STATUS NStatus;
                        uchar *VirtualAddress;
                        uint Length;

                        //
                        // Either the current send has more data than
                        // we want to send, or the starting offset is
                        // not 0.  In either case we'll need to loop
                        // through the current send, allocating
                        // buffers.
                        //
                        Buf = SendTCB->tcb_sendbuf;
                        Offset = SendTCB->tcb_sendofs;

                        do {
                            ASSERT(Buf != NULL);

                            NdisQueryBufferSafe(Buf, &VirtualAddress,
                                                &Length, LowPagePriority);
                            if (VirtualAddress == NULL) {
                                //
                                // Couldn't map into kernel address space.
                                // If the packet is already partly built,
                                // send what we've got, otherwise error out.
                                //
                                goto error_oor2;
                            }

                            ASSERT((Offset < Length) ||
                                   (Offset == 0 && Length == 0));

                            //
                            // Adjust the length for the offset into
                            // this buffer.
                            //
                            Length -= Offset;

                            AmountToDup = MIN(AmountLeft, Length);

                            NdisAllocateBuffer(&NStatus, &NewBuf,
                                               IPv6BufferPool,
                                               VirtualAddress + Offset,
                                               AmountToDup);

                            if (NStatus == NDIS_STATUS_SUCCESS) {
                                SCC->scc_tbufcount++;

                                NDIS_BUFFER_LINKAGE(CurrentBuffer) = NewBuf;

                                CurrentBuffer = NewBuf;
                                if (AmountToDup >= Length) {
                                    // Exhausted this buffer.
                                    Buf = NDIS_BUFFER_LINKAGE(Buf);
                                    Offset = 0;
                                } else {
                                    Offset += AmountToDup;
                                    ASSERT(Offset < NdisBufferLength(Buf));
                                }

                                SendTCB->tcb_sendsize -= AmountToDup;
                                AmountLeft -= AmountToDup;
                            } else {
                                //
                                // Couldn't allocate a buffer.  If
                                // the packet is already partly built,
                                // send what we've got, otherwise
                                // error out.
                                //
                                // NOTE(review): TCPSendComplete is called
                                // here with the TCB lock still held, while
                                // the equivalent path in TCPFastSend drops
                                // the lock around the call - confirm which
                                // is intended.
                                //
                            error_oor2:
                                if (SCC->scc_tbufcount == 0 &&
                                    SCC->scc_ubufcount == 0) {
                                    NdisChainBufferAtFront(Packet,
                                                           FirstBuffer);
                                    TCPSendComplete(Packet,
                                                    IP_GENERAL_FAILURE);
                                    goto error_oor;
                                }
                                AmountToSend -= AmountLeft;
                                AmountLeft = 0;
                                break;
                            }
                        } while (AmountLeft && SendTCB->tcb_sendsize);

                        //
                        // Remember where in the current send we stopped.
                        //
                        SendTCB->tcb_sendbuf = Buf;
                        SendTCB->tcb_sendofs = Offset;
                    }

                    if (CurSend->tsr_flags & TSR_FLAG_URG) {
                        ushort UP;

                        //
                        // This send is urgent data.  We need to figure
                        // out what the urgent data pointer should be.
                        // We know sendnext is the starting sequence
                        // number of the frame, and that at the top of
                        // this do loop sendnext identified a byte in
                        // the CurSend at that time.  We advanced CurSend
                        // at the same rate we've decremented
                        // AmountLeft (AmountToSend - AmountLeft ==
                        // AmountBuilt), so sendnext +
                        // (AmountToSend - AmountLeft) identifies a byte
                        // in the current value of CurSend, and that
                        // quantity plus tcb_sendsize is the sequence
                        // number one beyond the current send.
                        //
                        UP = (ushort)(AmountToSend - AmountLeft) +
                            (ushort)SendTCB->tcb_sendsize -
                            ((SendTCB->tcb_flags & BSD_URGENT) ? 0 : 1);

                        TCP->tcp_urgent = net_short(UP);

                        TCP->tcp_flags |= TCP_FLAG_URG;
                    }

                    //
                    // See if we've exhausted this send.  If we have,
                    // set the PUSH bit in this frame and move on to
                    // the next send.  We also need to check the
                    // urgent data bit.
                    //
                    if (SendTCB->tcb_sendsize == 0) {
                        Queue *Next;
                        uchar PrevFlags;

                        //
                        // We've exhausted this send.  Set the PUSH bit.
                        //
                        TCP->tcp_flags |= TCP_FLAG_PUSH;
                        PrevFlags = CurSend->tsr_flags;
                        Next = QNEXT(&CurSend->tsr_req.tr_q);
                        if (Next != QEND(&SendTCB->tcb_sendq)) {
                            CurSend = CONTAINING_RECORD(
                                QSTRUCT(TCPReq, Next, tr_q),
                                TCPSendReq, tsr_req);
                            CHECK_STRUCT(CurSend, tsr);
                            SendTCB->tcb_sendsize = CurSend->tsr_unasize;
                            SendTCB->tcb_sendofs = CurSend->tsr_offset;
                            SendTCB->tcb_sendbuf = CurSend->tsr_buffer;
                            SendTCB->tcb_cursend = CurSend;

                            //
                            // Check the urgent flags.  The frame is ended
                            // here when the send just finished was urgent
                            // and the next one is not.
                            //
                            if ((PrevFlags & TSR_FLAG_URG) &&
                                !(CurSend->tsr_flags & TSR_FLAG_URG))
                                break;
                        } else {
                            //
                            // That was the last send on the queue.
                            //
                            ASSERT(AmountLeft == 0);
                            SendTCB->tcb_cursend = NULL;
                            SendTCB->tcb_sendbuf = NULL;
                        }
                    }
                } while (AmountLeft != 0);

            } else {
                //
                // We're in the loop, but AmountToSend is 0.  This
                // should happen only when we're sending a FIN.  Check
                // this, and return if it's not true.
                //
                ASSERT(AmtUnsent == 0);
                if (!(SendTCB->tcb_flags & FIN_NEEDED)) {
                    // KdBreakPoint();
                    //
                    // Nothing to send after all: undo the three
                    // allocations made above and leave the send loop.
                    //
                    ExFreePool(NdisBufferVirtualAddress(FirstBuffer));
                    NdisFreeBuffer(FirstBuffer);
                    NdisFreePacket(Packet);
                    break;
                }

                SCC->scc_firstsend = NULL;  // REVIEW: looks unnecessary.

                NDIS_BUFFER_LINKAGE(FirstBuffer) = NULL;
            }

            // Adjust for what we're really going to send.
            AmountToSend -= AmountLeft;

            //
            // Update the sequence numbers, and start a RTT measurement
            // if needed.
            //
            OldSeq = SendTCB->tcb_sendnext;
            SendTCB->tcb_sendnext += AmountToSend;

            if (!SEQ_EQ(OldSeq, SendTCB->tcb_sendmax)) {
                //
                // We have at least some retransmission.  Bump the stat.
                //
                TStats.ts_retranssegs++;
            }

            if (SEQ_GT(SendTCB->tcb_sendnext, SendTCB->tcb_sendmax)) {
                //
                // We're sending at least some new data.
                // We can't advance sendmax once FIN_SENT is set.
                //
                ASSERT(!(SendTCB->tcb_flags & FIN_SENT));
                SendTCB->tcb_sendmax = SendTCB->tcb_sendnext;
                TStats.ts_outsegs++;

                //
                // Check the Round-Trip Timer.
                //
                if (SendTCB->tcb_rtt == 0) {
                    // No RTT running, so start one.
                    SendTCB->tcb_rtt = TCPTime;
                    SendTCB->tcb_rttseq = OldSeq;
                }
            }

            //
            // We've built the frame entirely.  If we've sent everything
            // we have and there's a FIN pending, OR it in.
            //
            if (AmtUnsent == AmountToSend) {
                if (SendTCB->tcb_flags & FIN_NEEDED) {
                    ASSERT(!(SendTCB->tcb_flags & FIN_SENT) ||
                           (SendTCB->tcb_sendnext ==
                            (SendTCB->tcb_sendmax - 1)));
                    //
                    // See if we still have room in the window for a FIN.
                    // The FIN occupies one sequence number.
                    //
                    if (SendWin > (int) AmountToSend) {
                        TCP->tcp_flags |= TCP_FLAG_FIN;
                        SendTCB->tcb_sendnext++;
                        SendTCB->tcb_sendmax = SendTCB->tcb_sendnext;
                        SendTCB->tcb_flags |= (FIN_SENT | FIN_OUTSTANDING);
                        SendTCB->tcb_flags &= ~FIN_NEEDED;
                    }
                }
            }

            //
            // From here on AmountToSend is the TCP segment length
            // (header plus data), as needed for the checksum and IPv6Send.
            //
            AmountToSend += sizeof(TCPHeader);

            if (!TCB_TIMER_RUNNING(SendTCB->tcb_rexmittimer))
                START_TCB_TIMER(SendTCB->tcb_rexmittimer,
                                SendTCB->tcb_rexmit);

            //
            // This segment carries our ACK (see FillTCPHeader), so any
            // pending or delayed ACK is satisfied.
            //
            SendTCB->tcb_flags &= ~(NEED_ACK | ACK_DELAYED | FORCE_OUTPUT);
            STOP_TCB_TIMER(SendTCB->tcb_delacktimer);
            STOP_TCB_TIMER(SendTCB->tcb_swstimer);
            SendTCB->tcb_alive = TCPTime;

            // Add the buffers to the packet.
            NdisChainBufferAtFront(Packet, FirstBuffer);

            //
            // Compute the TCP checksum.  It covers the entire TCP segment
            // starting with the TCP header, plus the IPv6 pseudo-header.
            //
            TCP->tcp_xsum = 0;
            TCP->tcp_xsum = ChecksumPacket(
                Packet, LinkOffset + sizeof *IP, NULL, AmountToSend,
                AlignAddr(&IP->Source), AlignAddr(&IP->Dest),
                IP_PROTOCOL_TCP);

            //
            // Capture and reference the RCE while we still hold the TCB lock.
            // The TCB's reference on this particular RCE might go away at any
            // point after we release the lock.
            //
            RCE = SendTCB->tcb_rce;
            AddRefRCE(RCE);

            //
            // Everything's ready.  Now send the packet.
            //
            // Note that IPv6Send does not return a status code.
            // Instead it *always* completes the packet
            // with an appropriate status code.
            //
            KeReleaseSpinLock(&SendTCB->tcb_lock, PreLockIrql);

            if (TCP->tcp_xsum == 0) {
                //
                // ChecksumPacket failed, so abort the transmission.
                //
                IPv6SendComplete(NULL, Packet, IP_NO_RESOURCES);

            } else {
                IPv6Send(Packet, LinkOffset, IP, AmountToSend, RCE, 0,
                         IP_PROTOCOL_TCP,
                         net_short(TCP->tcp_src),
                         net_short(TCP->tcp_dest));
            }

            //
            // Drop our RCE reference and retake the TCB lock before going
            // around the loop again.
            //
            ReleaseRCE(RCE);
            KeAcquireSpinLock(&SendTCB->tcb_lock, &PreLockIrql);

            continue;

        } else {
            //
            // We've decided we can't send anything now.  Figure out why, and
            // see if we need to set a timer.
            //
            if (SendTCB->tcb_sendwin == 0) {
                //
                // The peer's window is closed.  On first noticing this,
                // mark the TCB flow controlled, restart the retransmit
                // timer and add a slow-path reason.
                //
                if (!(SendTCB->tcb_flags & FLOW_CNTLD)) {
                    SendTCB->tcb_flags |= FLOW_CNTLD;
                    SendTCB->tcb_rexmitcnt = 0;
                    START_TCB_TIMER(SendTCB->tcb_rexmittimer,
                                    SendTCB->tcb_rexmit);
                    SendTCB->tcb_slowcount++;
                    SendTCB->tcb_fastchk |= TCP_FLAG_SLOW;
                } else if (!TCB_TIMER_RUNNING(SendTCB->tcb_rexmittimer))
                    START_TCB_TIMER(SendTCB->tcb_rexmittimer,
                                    SendTCB->tcb_rexmit);
            } else if (AmountToSend != 0)
                // We have something to send, but we're not sending
                // it, presumably due to SWS avoidance.
                if (!TCB_TIMER_RUNNING(SendTCB->tcb_swstimer))
                    START_TCB_TIMER(SendTCB->tcb_swstimer, SWS_TO);

            break;
        }
    } // while (!FIN_OUTSTANDING)

    //
    // We're done sending, so we don't need the output flags set.
    //
    SendTCB->tcb_flags &= ~(IN_TCP_SEND | NEED_OUTPUT | FORCE_OUTPUT |
                            SEND_AFTER_RCV);

bail:
    //
    // Give back the caller's reference.  The TCB lock is held on every
    // path that reaches this point.
    //
    DerefTCB(SendTCB, PreLockIrql);
    return;

    //
    // Common case error handling code for out of resource conditions.  Start the
    // retransmit timer if it's not already running (so that we try this again
    // later), clean up and return.
    //
error_oor:
    if (!TCB_TIMER_RUNNING(SendTCB->tcb_rexmittimer))
        START_TCB_TIMER(SendTCB->tcb_rexmittimer, SendTCB->tcb_rexmit);

    //
    // We had an out of resource problem, so clear the OUTPUT flags.
    // Note that, unlike the normal exit above, SEND_AFTER_RCV is left as is.
    //
    SendTCB->tcb_flags &= ~(IN_TCP_SEND | NEED_OUTPUT | FORCE_OUTPUT);
    DerefTCB(SendTCB, PreLockIrql);
    return;

} // end of TCPSend()


//* ResetAndFastSend - Fast retransmit from a given sequence number.
//
//  Called to fast retransmit the dropped segment.  If the connection is in
//  a synchronized state other than TIME_WAIT and has sends queued, the
//  first send request on the queue is handed to TCPFastSend.  The
//  congestion window is then set to NewCWin.
//
//  We assume the caller has put a reference on the TCB, and the TCB is locked
//  on entry.  The reference is dropped and the lock released before returning.
//
void                        // Returns: Nothing.
ResetAndFastSend(
    TCB *SeqTCB,            // TCB for this connection.
    SeqNum NewSeq,          // Sequence number to set.
    uint NewCWin)           // New value for congestion window.
{ TCPSendReq *SendReq; Queue *CurQ; PNDIS_BUFFER Buffer; uint Offset; uint SendSize; CHECK_STRUCT(SeqTCB, tcb); ASSERT(SEQ_GTE(NewSeq, SeqTCB->tcb_senduna)); // // The new seq must be less than send max, or NewSeq, senduna, sendnext, // and sendmax must all be equal. (The latter case happens when we're // called exiting TIME_WAIT, or possibly when we're retransmitting // during a flow controlled situation). // ASSERT(SEQ_LT(NewSeq, SeqTCB->tcb_sendmax) || (SEQ_EQ(SeqTCB->tcb_senduna, SeqTCB->tcb_sendnext) && SEQ_EQ(SeqTCB->tcb_senduna, SeqTCB->tcb_sendmax) && SEQ_EQ(SeqTCB->tcb_senduna, NewSeq))); if (SYNC_STATE(SeqTCB->tcb_state) && (SeqTCB->tcb_state != TCB_TIME_WAIT)) { if (!EMPTYQ(&SeqTCB->tcb_sendq)) { CurQ = QHEAD(&SeqTCB->tcb_sendq); SendReq = (TCPSendReq *) CONTAINING_RECORD(CurQ, TCPReq, tr_q); // // SendReq points to the first send request on the send queue. // We're pointing at the proper send req now. We need to go down. // // SendReq points to the cursend. // SendSize point to sendsize in the cursend. // SendSize = SendReq->tsr_unasize; Buffer = SendReq->tsr_buffer; Offset = SendReq->tsr_offset; // Call the fast retransmit send now. TCPFastSend(SeqTCB, Buffer, Offset, SendReq, SendSize, NewSeq, SeqTCB->tcb_mss); } else { ASSERT(SeqTCB->tcb_cursend == NULL); } } SeqTCB->tcb_cwin = NewCWin; DerefTCB(SeqTCB, DISPATCH_LEVEL); return; } //* TCPFastSend - To send a segment without changing TCB state. // // Called to handle fast retransmit of the lost segment. // tcb_lock will be held while entering (called by TCPRcv). // void // Returns: Nothing. TCPFastSend( TCB *SendTCB, // TCB for this connection. PNDIS_BUFFER in_SendBuf, // NDIS buffer. uint SendOfs, // Send offset. TCPSendReq *CurSend, // Current send request. uint SendSize, // Size of this send. SeqNum SendNext, // Sequence number to use for this send. int in_ToBeSent) // Cap on SendSize (REVIEW: Callee should cap). { uint AmountToSend; // Amount to send this time. 
uint AmountLeft;            // Part of AmountToSend not yet built.
    IPv6Header UNALIGNED *IP;
    TCPHeader UNALIGNED *TCP;
    PNDIS_PACKET Packet;
    PNDIS_BUFFER FirstBuffer, CurrentBuffer;
    void *Memory;
    SendCmpltContext *SCC;
    NDIS_STATUS NdisStatus;
    uint AmtOutstanding, AmtUnsent;
    uint HeaderLength;
    uint LinkOffset;
    uint PMTU;
    KIRQL PreLockIrql;
    PNDIS_BUFFER SendBuf = in_SendBuf;  // Working copy; advanced as we build.
    RouteCacheEntry *RCE;

    PreLockIrql = DISPATCH_LEVEL;

    CHECK_STRUCT(SendTCB, tcb);
    ASSERT(SendTCB->tcb_refcnt != 0);

    ASSERT(*(int *)&SendTCB->tcb_sendwin >= 0);
    ASSERT(*(int *)&SendTCB->tcb_cwin >= SendTCB->tcb_mss);

    ASSERT(!(SendTCB->tcb_flags & FIN_OUTSTANDING) ||
           (SendTCB->tcb_sendnext == SendTCB->tcb_sendmax));

    //
    // In most cases, we will already have a route at this point.
    // However, if we failed to get one earlier in the passive receive
    // path, we may need to retry here.
    //
    // NOTE(review): this exit (and the one below) calls DerefTCB, yet
    // ResetAndFastSend also writes tcb_cwin and calls DerefTCB after we
    // return.  Looks like a double dereference carried over from TCPSend -
    // confirm against DerefTCB's contract.
    //
    if (SendTCB->tcb_rce == NULL) {
        InitRCE(SendTCB);
        if (SendTCB->tcb_rce == NULL) {
            DerefTCB(SendTCB, PreLockIrql);
            return;
        }
    }

    //
    // Validate that the address we're sourcing from and the route we're
    // sending upon are still okay to use.
    //
    // We fail existing send requests for TCBs with a disconnected
    // outgoing interface, except when a loopback route is used.
    //
    if (SendTCB->tcb_routing != RouteCacheValidationCounter) {
        if (!ValidateSourceAndRoute(SendTCB) ||
            IsDisconnectedAndNotLoopbackRCE(SendTCB->tcb_rce)) {
            ASSERT(SendTCB->tcb_refcnt != 0);
            TryToCloseTCB(SendTCB, TCB_CLOSE_ABORTED, PreLockIrql);
            KeAcquireSpinLock(&SendTCB->tcb_lock, &PreLockIrql);
            DerefTCB(SendTCB, PreLockIrql);
            return;
        }
    }

    //
    // Verify that our cached Path MTU is still valid.
    // Watch for changes to IPsec policies since they can also affect our MSS.
    // REVIEW: This the best spot to do this?
    //
    PMTU = GetEffectivePathMTUFromRCE(SendTCB->tcb_rce);
    if (PMTU != SendTCB->tcb_pmtu ||
        SecurityStateValidationCounter != SendTCB->tcb_security) {
        //
        // Either our Path MTU or the global security state has changed.
        // Cache current values and then calculate a new MSS.
        //
        SendTCB->tcb_pmtu = PMTU;
        SendTCB->tcb_security = SecurityStateValidationCounter;
        CalculateMSSForTCB(SendTCB);
    }

    //
    // AmtOutstanding is computed here but never read in this routine.
    //
    AmtOutstanding = (uint)(SendTCB->tcb_sendnext - SendTCB->tcb_senduna);

    //
    // The amount to retransmit is the smallest of the caller's cap, the
    // size of the send, and the peer's advertised window.
    //
    AmtUnsent = MIN(MIN(in_ToBeSent, (int)SendSize),
                    (int)SendTCB->tcb_sendwin);

    while (AmtUnsent > 0) {

        if (SEQ_GT(SendTCB->tcb_senduna, SendNext)) {
            //
            // Since tcb_lock is released in this loop
            // it is possible that delayed ack acked
            // what we are trying to retransmit.
            //
            goto error_oor;
        }

        // AmtUnsent below was minimum of sendwin and amtunsent

        AmountToSend = MIN(AmtUnsent, SendTCB->tcb_mss);

        ASSERT((int)AmtUnsent >= 0);

        //
        // We're going to send something.  Allocate a packet header.
        //
        // REVIEW: It was easier to code all these allocations directly
        // REVIEW: rather than use IPv6AllocatePacket.
        //
        // REVIEW: This grabs packets and buffers from the IPv6PacketPool
        // REVIEW: and the IPv6BufferPool respectively.  Should we instead
        // REVIEW: have separate pools for TCP?
        //
        // NOTE(review): the debug messages below say "TCPSend:" although
        // this is TCPFastSend.
        //
        NdisAllocatePacket(&NdisStatus, &Packet, IPv6PacketPool);
        if (NdisStatus != NDIS_STATUS_SUCCESS) {
            KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
                       "TCPSend: couldn't allocate packet header!?!\n"));
            goto error_oor;
        }

        // We'll fill in the CompletionData below.
        InitializeNdisPacket(Packet);
        PC(Packet)->CompletionHandler = TCPSendComplete;

        //
        // Our header buffer has extra space at the beginning for other
        // headers to be prepended to ours without requiring further
        // allocation calls.  It also has extra space at the end to hold
        // the send completion data.
        //
        // The total is rounded up to a multiple of the alignment of
        // SendCmpltContext.
        //
        LinkOffset = SendTCB->tcb_rce->NCE->IF->LinkHeaderSize;
        HeaderLength =
            (LinkOffset + sizeof(*IP) + sizeof(*TCP) +
             sizeof(SendCmpltContext) +
             __builtin_alignof(SendCmpltContext) - 1) &~
            (UINT_PTR)(__builtin_alignof(SendCmpltContext) - 1);
        Memory = ExAllocatePool(NonPagedPool, HeaderLength);
        if (Memory == NULL) {
            KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
                       "TCPSend: couldn't allocate header memory!?!\n"));
            NdisFreePacket(Packet);
            goto error_oor;
        }

        //
        // When allocating the NDIS buffer describing this memory region,
        // we don't tell it about the extra space on the end that we
        // allocated for the send completion data.
        //
        NdisAllocateBuffer(&NdisStatus, &FirstBuffer, IPv6BufferPool,
                           Memory, LinkOffset + sizeof(*IP) + sizeof(*TCP));
        if (NdisStatus != NDIS_STATUS_SUCCESS) {
            KdPrintEx((DPFLTR_TCPIP6_ID, DPFLTR_NTOS_ERROR,
                       "TCPSend: couldn't allocate buffer!?!\n"));
            ExFreePool(Memory);
            NdisFreePacket(Packet);
            goto error_oor;
        }

        //
        // Skip over the extra space that will be filled in later by the
        // link level.  At this level we add the IPv6Header, the
        // TCPHeader, and the data.
        //
        IP = (IPv6Header UNALIGNED *)((uchar *)Memory + LinkOffset);
        IP->VersClassFlow = IP_VERSION;
        IP->NextHeader = IP_PROTOCOL_TCP;
        IP->HopLimit = TCPHopLimit(SendTCB);
        IP->Source = SendTCB->tcb_saddr;
        IP->Dest = SendTCB->tcb_daddr;

        //
        // Begin preparing the TCP header.  FillTCPHeader uses tcb_sendnext
        // for the sequence number, so overwrite it with the sequence number
        // we are retransmitting from.
        //
        TCP = (TCPHeader UNALIGNED *)(IP + 1);
        FillTCPHeader(SendTCB, TCP);
        TCP->tcp_seq = net_long(SendNext);

        //
        // Store the send completion data in the same buffer as the TCP
        // header, right after the TCP header.  This saves allocation
        // overhead and works because we don't consider this area to be
        // part of the packet data (we set this buffer's length to
        // indicate that the data ends with the TCP header above).
        //
        // Note that this code relies on the fact that we don't include
        // any TCP options (and thus don't have a variable length TCP
        // header) in our data packets.
        //
        SCC = (SendCmpltContext *)((uchar *)Memory + HeaderLength -
                                   sizeof(*SCC));
        PC(Packet)->CompletionData = SCC;
#if DBG
        SCC->scc_sig = scc_signature;
#endif
        SCC->scc_ubufcount = 0;
        SCC->scc_tbufcount = 0;
        SCC->scc_count = 0;

        AmountLeft = AmountToSend;

        if (AmountToSend != 0) {
            long Result;

            //
            // Loop through the sends on the TCB, building a frame.
            // Unlike TCPSend, the position within the send queue is kept
            // in the local SendBuf/SendOfs/SendSize/CurSend variables and
            // the TCB's own send cursor is not modified.
            //
            CurrentBuffer = FirstBuffer;
            CHECK_STRUCT(CurSend, tsr);
            SCC->scc_firstsend = CurSend;

            do {
                //
                // Each send request touched by this frame gets one more
                // reference, counted in scc_count.
                //
                ASSERT(CurSend->tsr_refcnt > 0);
                Result = InterlockedIncrement(&(CurSend->tsr_refcnt));

                ASSERT(Result > 0);
                SCC->scc_count++;

                //
                // If the current send offset is 0 and the current
                // send is less than or equal to what we have left
                // to send, we haven't already put a transport
                // buffer on this send, and nobody else is using
                // the buffer chain directly, just use the input
                // buffers.  We check for other people using them
                // by looking at tsr_lastbuf.  If it's NULL,
                // nobody else is using the buffers.  If it's not
                // NULL, somebody is.
                //
                if (SendOfs == 0 &&
                    (SendSize <= AmountLeft) &&
                    (SCC->scc_tbufcount == 0) &&
                    CurSend->tsr_lastbuf == NULL) {

                    PNDIS_BUFFER LastBuf = SendBuf;
                    uint UBufLength = NdisBufferLength(LastBuf);
                    ushort UBufCount = 1;

                    //
                    // Walk to the end of the user's buffer chain,
                    // totalling its length and buffer count.
                    //
                    while (NDIS_BUFFER_LINKAGE(LastBuf) != NULL) {
                        LastBuf = NDIS_BUFFER_LINKAGE(LastBuf);
                        UBufLength += NdisBufferLength(LastBuf);
                        UBufCount++;
                    }

                    //
                    // Only chain the user's buffers in directly when
                    // the chain describes exactly the remaining send.
                    //
                    if (SendSize == UBufLength) {
                        SCC->scc_ubufcount += UBufCount;
                        NDIS_BUFFER_LINKAGE(CurrentBuffer) = SendBuf;
                        CurSend->tsr_lastbuf = CurrentBuffer = LastBuf;
                        AmountLeft -= SendSize;
                        SendSize = 0;
                    } else {
                        //
                        // Fall through with a non-zero SendSize.
                        //
                        ASSERT(SendSize != 0);
                    }
                }

                if (SendSize != 0) {
                    uint AmountToDup;
                    PNDIS_BUFFER NewBuf, Buf;
                    uint Offset;
                    NDIS_STATUS NStatus;
                    uchar *VirtualAddress;
                    uint Length;

                    //
                    // Either the current send has more data than
                    // we want to send, or the starting offset is
                    // not 0.  In either case we'll need to loop
                    // through the current send, allocating buffers.
                    //
                    Buf = SendBuf;
                    Offset = SendOfs;

                    do {
                        ASSERT(Buf != NULL);

                        NdisQueryBufferSafe(Buf, &VirtualAddress, &Length,
                                            LowPagePriority);
                        if (VirtualAddress == NULL) {
                            //
                            // Couldn't map the buffer; handle it like a
                            // buffer allocation failure.
                            //
                            goto error_oor2;
                        }

                        ASSERT((Offset < Length) ||
                               (Offset == 0 && Length == 0));

                        //
                        // Adjust the length for the offset into
                        // this buffer.
                        //
                        Length -= Offset;

                        AmountToDup = MIN(AmountLeft, Length);

                        NdisAllocateBuffer(&NStatus, &NewBuf,
                                           IPv6BufferPool,
                                           VirtualAddress + Offset,
                                           AmountToDup);

                        if (NStatus == NDIS_STATUS_SUCCESS) {
                            SCC->scc_tbufcount++;

                            NDIS_BUFFER_LINKAGE(CurrentBuffer) = NewBuf;

                            CurrentBuffer = NewBuf;
                            if (AmountToDup >= Length) {
                                // Exhausted this buffer.
                                Buf = NDIS_BUFFER_LINKAGE(Buf);
                                Offset = 0;
                            } else {
                                Offset += AmountToDup;
                                ASSERT(Offset < NdisBufferLength(Buf));
                            }

                            SendSize -= AmountToDup;
                            AmountLeft -= AmountToDup;
                        } else {
                            //
                            // Couldn't allocate a buffer.  If
                            // the packet is already partly built,
                            // send what we've got, otherwise
                            // error out.
                            //
                            // The TCB lock is dropped around the
                            // TCPSendComplete call and retaken before
                            // jumping to the common error exit.
                            //
                        error_oor2:
                            if (SCC->scc_tbufcount == 0 &&
                                SCC->scc_ubufcount == 0) {
                                KeReleaseSpinLockFromDpcLevel(
                                    &SendTCB->tcb_lock);
                                NdisChainBufferAtFront(Packet, FirstBuffer);
                                TCPSendComplete(Packet, IP_GENERAL_FAILURE);
                                KeAcquireSpinLockAtDpcLevel(&SendTCB->tcb_lock);
                                goto error_oor;
                            }
                            AmountToSend -= AmountLeft;
                            AmountLeft = 0;
                            break;
                        }
                    } while (AmountLeft && SendSize);

                    //
                    // Remember where in the current send we stopped.
                    //
                    SendBuf = Buf;
                    SendOfs = Offset;
                }

                if (CurSend->tsr_flags & TSR_FLAG_URG) {
                    ushort UP;

                    //
                    // This send is urgent data.  We need to figure
                    // out what the urgent data pointer should be.
                    // We know sendnext is the starting sequence
                    // number of the frame, and that at the top of
                    // this do loop sendnext identified a byte in
                    // the CurSend at that time.  We advanced CurSend
                    // at the same rate we've decremented
                    // AmountLeft (AmountToSend - AmountLeft ==
                    // AmountBuilt), so sendnext +
                    // (AmountToSend - AmountLeft) identifies a byte
                    // in the current value of CurSend, and that
                    // quantity plus SendSize is the sequence
                    // number one beyond the current send.
                    //
                    UP = (ushort) (AmountToSend - AmountLeft) +
                        (ushort) SendSize -
                        ((SendTCB->tcb_flags & BSD_URGENT) ? 0 : 1);

                    TCP->tcp_urgent = net_short(UP);

                    TCP->tcp_flags |= TCP_FLAG_URG;
                }

                //
                // See if we've exhausted this send.  If we have,
                // set the PUSH bit in this frame and move on to
                // the next send.  We also need to check the
                // urgent data bit.
                //
                if (SendSize == 0) {
                    Queue *Next;
                    ulong PrevFlags;

                    //
                    // We've exhausted this send.  Set the PUSH bit.
                    //
                    TCP->tcp_flags |= TCP_FLAG_PUSH;
                    PrevFlags = CurSend->tsr_flags;
                    Next = QNEXT(&CurSend->tsr_req.tr_q);
                    if (Next != QEND(&SendTCB->tcb_sendq)) {
                        CurSend = CONTAINING_RECORD(
                            QSTRUCT(TCPReq, Next, tr_q),
                            TCPSendReq, tsr_req);
                        CHECK_STRUCT(CurSend, tsr);
                        SendSize = CurSend->tsr_unasize;
                        SendOfs = CurSend->tsr_offset;
                        SendBuf = CurSend->tsr_buffer;

                        //
                        // Check the urgent flags.  The frame is ended
                        // here when the send just finished was urgent
                        // and the next one is not.
                        //
                        if ((PrevFlags & TSR_FLAG_URG) &&
                            !(CurSend->tsr_flags & TSR_FLAG_URG)) {
                            break;
                        }
                    } else {
                        //
                        // That was the last send on the queue.
                        //
                        ASSERT(AmountLeft == 0);
                        CurSend = NULL;
                        SendBuf = NULL;
                    }
                }

            } while (AmountLeft != 0);

        } else {
            //
            // Amt to send is 0.
            // Just bail out and start timer.
            //
            // NOTE(review): AmtUnsent is > 0 inside this loop, so this
            // branch is only reachable if tcb_mss is 0.
            //
            if (!TCB_TIMER_RUNNING(SendTCB->tcb_rexmittimer)) {
                START_TCB_TIMER(SendTCB->tcb_rexmittimer,
                                SendTCB->tcb_rexmit);
            }

            ExFreePool(NdisBufferVirtualAddress(FirstBuffer));
            NdisFreeBuffer(FirstBuffer);
            NdisFreePacket(Packet);
            return;
        }

        //
        // Adjust for what we're really going to send.
        //
        AmountToSend -= AmountLeft;

        //
        // Advance our private retransmit position; the TCB's sendnext and
        // sendmax are left untouched.
        //
        SendNext += AmountToSend;
        AmtUnsent -= AmountToSend;

        TStats.ts_retranssegs++;

        //
        // From here on AmountToSend is the TCP segment length
        // (header plus data), as needed for the checksum and IPv6Send.
        //
        AmountToSend += sizeof(TCPHeader);

        if (!TCB_TIMER_RUNNING(SendTCB->tcb_rexmittimer)) {
            START_TCB_TIMER(SendTCB->tcb_rexmittimer, SendTCB->tcb_rexmit);
        }

        SendTCB->tcb_flags &= ~(NEED_ACK | ACK_DELAYED | FORCE_OUTPUT);
        STOP_TCB_TIMER(SendTCB->tcb_delacktimer);
        STOP_TCB_TIMER(SendTCB->tcb_swstimer);

        //
        // Add the buffers to the packet.
        //
        NdisChainBufferAtFront(Packet, FirstBuffer);

        //
        // Compute the TCP checksum.  It covers the entire TCP segment
        // starting with the TCP header, plus the IPv6 pseudo-header.
        //
        TCP->tcp_xsum = 0;
        TCP->tcp_xsum = ChecksumPacket(
            Packet, LinkOffset + sizeof *IP, NULL, AmountToSend,
            AlignAddr(&IP->Source), AlignAddr(&IP->Dest),
            IP_PROTOCOL_TCP);

        //
        // Capture and reference the RCE while we still hold the TCB lock.
        // The TCB's reference on this particular RCE might go away at any
        // point after we release the lock.
        //
        RCE = SendTCB->tcb_rce;
        AddRefRCE(RCE);

        //
        // Everything's ready.  Now send the packet.
        //
        // Note that IPv6Send does not return a status code.
        // Instead it *always* completes the packet
        // with an appropriate status code.
        //
        KeReleaseSpinLock(&SendTCB->tcb_lock, PreLockIrql);

        if (TCP->tcp_xsum == 0) {
            //
            // ChecksumPacket failed, so abort the transmission.
            //
            IPv6SendComplete(NULL, Packet, IP_NO_RESOURCES);

        } else {
            IPv6Send(Packet, LinkOffset, IP, AmountToSend, RCE, 0,
                     IP_PROTOCOL_TCP,
                     net_short(TCP->tcp_src),
                     net_short(TCP->tcp_dest));
        }

        //
        // Release reference and reacquire lock we dropped before sending.
        //
        ReleaseRCE(RCE);
        KeAcquireSpinLock(&SendTCB->tcb_lock, &PreLockIrql);
    }

    return;

    //
    // Common case error handling code for out of resource conditions.
    // Start the retransmit timer if it's not already running
    // (so that we try this again later), clean up and return.
    //
error_oor:
    if (!TCB_TIMER_RUNNING(SendTCB->tcb_rexmittimer)) {
        START_TCB_TIMER(SendTCB->tcb_rexmittimer, SendTCB->tcb_rexmit);
    }

    return;
}


//* TDISend - Send data on a connection.
//
//  The main TDI send entry point.  We take the input parameters, validate
//  them, allocate a send request, etc.  We then put the send request on the
//  queue.  If we have no other sends on the queue or Nagling is disabled we'll
//  call TCPSend to send the data.
//
TDI_STATUS  // Returns: Status of attempt to send.
TdiSend(
    PTDI_REQUEST Request,       // TDI request for the call.
    ushort Flags,               // Flags for this send.
    uint SendLength,            // Length in bytes of send.
PNDIS_BUFFER SendBuffer) // Buffer chain to be sent. { TCPConn *Conn; TCB *SendTCB; TCPSendReq *SendReq; KIRQL OldIrql; TDI_STATUS Error; uint EmptyQ; #if DBG uint RealSendSize; PNDIS_BUFFER Temp; // // Loop through the buffer chain, and make sure that the length matches // up with SendLength. // Temp = SendBuffer; RealSendSize = 0; do { ASSERT(Temp != NULL); RealSendSize += NdisBufferLength(Temp); Temp = NDIS_BUFFER_LINKAGE(Temp); } while (Temp != NULL); ASSERT(RealSendSize == SendLength); #endif // // Grab lock on Connection Table. Then get our connection info from // the TDI request, and our TCP control block from that. // Conn = GetConnFromConnID(PtrToUlong(Request->Handle.ConnectionContext), &OldIrql); if (Conn == NULL) { Error = TDI_INVALID_CONNECTION; goto abort; } CHECK_STRUCT(Conn, tc); SendTCB = Conn->tc_tcb; if (SendTCB == NULL) { Error = TDI_INVALID_STATE; KeReleaseSpinLock(&Conn->tc_ConnBlock->cb_lock, OldIrql); abort: return Error; } CHECK_STRUCT(SendTCB, tcb); // // Switch to a finer-grained lock: // Drop lock on the Connection Table in favor of one on our TCB. // KeAcquireSpinLockAtDpcLevel(&SendTCB->tcb_lock); KeReleaseSpinLockFromDpcLevel(&Conn->tc_ConnBlock->cb_lock); // // Make sure our TCB is in a send-able state. // if (!DATA_SEND_STATE(SendTCB->tcb_state) || CLOSING(SendTCB)) { Error = TDI_INVALID_STATE; goto abort2; } CheckTCBSends(SendTCB); // Just a debug check. // // If we've released our RCE for some reason, reacquire one. // if (SendTCB->tcb_rce == NULL) { InitRCE(SendTCB); if (SendTCB->tcb_rce == NULL) { Error = TDI_DEST_NET_UNREACH; goto abort2; } } // // Verify that the cached RCE is still valid. // SendTCB->tcb_rce = ValidateRCE(SendTCB->tcb_rce, SendTCB->tcb_nte); ASSERT(SendTCB->tcb_rce != NULL); if (IsDisconnectedAndNotLoopbackRCE(SendTCB->tcb_rce)) { // // Fail new send requests for TCBs with a disconnected // outgoing interface, except when the loopback route is used. 
// Error = TDI_DEST_NET_UNREACH; goto abort2; } if (SendLength == 0) { // // Wow, nothing to do! // // REVIEW: Can't we do this check earlier (like before we even grab the // REVIEW: Connection Table lock? The only reason I can think not to // REVIEW: would be if something cared about the return code if a bad // REVIEW: Tdi Request was given to us. // Error = TDI_SUCCESS; goto abort2; } // // We have a TCB, and it's valid. Allocate a send request now. // SendReq = GetSendReq(); if (SendReq == NULL) { Error = TDI_NO_RESOURCES; abort2: KeReleaseSpinLock(&SendTCB->tcb_lock, OldIrql); return Error; } // // Prepare a TCP send request based on the TDI request and the // passed in buffer chain. // SendReq->tsr_req.tr_rtn = Request->RequestNotifyObject; SendReq->tsr_req.tr_context = Request->RequestContext; SendReq->tsr_buffer = SendBuffer; SendReq->tsr_size = SendLength; SendReq->tsr_unasize = SendLength; SendReq->tsr_refcnt = 1; // ACK will decrement this ref SendReq->tsr_offset = 0; SendReq->tsr_lastbuf = NULL; SendReq->tsr_time = TCPTime; SendReq->tsr_flags = (Flags & TDI_SEND_EXPEDITED) ? TSR_FLAG_URG : 0; // // Check current status of our send queue. // EmptyQ = EMPTYQ(&SendTCB->tcb_sendq); // // Add this send request to our send queue. // SendTCB->tcb_unacked += SendLength; ENQUEUE(&SendTCB->tcb_sendq, &SendReq->tsr_req.tr_q); if (SendTCB->tcb_cursend == NULL) { // // No existing current send request, so make this new one // the current send. // // REVIEW: Is this always equivalent to EMPTYQ test above? // REVIEW: If so, why not just set EmptyQ flag here and save a test? // SendTCB->tcb_cursend = SendReq; SendTCB->tcb_sendbuf = SendBuffer; SendTCB->tcb_sendofs = 0; SendTCB->tcb_sendsize = SendLength; } // // See if we should try to send now. We attempt to do so if we weren't // already blocked, or if we were and either the Nagle Algorithm is turned // off or we now have at least one max segment worth of data to send. 
// if (EmptyQ || (!(SendTCB->tcb_flags & NAGLING) || (SendTCB->tcb_unacked - (SendTCB->tcb_sendmax - SendTCB->tcb_senduna)) >= SendTCB->tcb_mss)) { SendTCB->tcb_refcnt++; TCPSend(SendTCB, OldIrql); } else KeReleaseSpinLock(&SendTCB->tcb_lock, OldIrql); // // When TCPSend returns, we may or may not have already sent the data // associated with this particular request. // return TDI_PENDING; } #pragma BEGIN_INIT //* InitTCPSend - Initialize our send side. // // Called during init time to initialize our TCP send state. // int // Returns: TRUE if we inited, false if we didn't. InitTCPSend( void) // Nothing. { ExInitializeSListHead(&TCPSendReqFree); KeInitializeSpinLock(&TCPSendReqFreeLock); IPv6RegisterULProtocol(IP_PROTOCOL_TCP, TCPReceive, TCPControlReceive); return TRUE; } #pragma END_INIT //* UnloadTCPSend // // Cleanup and prepare for stack unload. // void UnloadTCPSend(void) { PSLIST_ENTRY BufferLink; while ((BufferLink = ExInterlockedPopEntrySList(&TCPSendReqFree, &TCPSendReqFreeLock)) != NULL) { Queue *QueuePtr = CONTAINING_RECORD(BufferLink, Queue, q_next); TCPReq *Req = CONTAINING_RECORD(QueuePtr, TCPReq, tr_q); TCPSendReq *SendReq = CONTAINING_RECORD(Req, TCPSendReq, tsr_req); CHECK_STRUCT(SendReq, tsr); ExFreePool(SendReq); } IPv6RegisterULProtocol(IP_PROTOCOL_TCP, NULL, NULL); }