/********************************************************************/ /** Microsoft LAN Manager **/ /** Copyright(c) Microsoft Corp., 1990-2000 **/ /********************************************************************/ /* :ts=4 */ //** TCPSEND.C - TCP send protocol code. // // This file contains the code for sending Data and Control segments. // #include "precomp.h" #include "addr.h" #include "tcp.h" #include "tcb.h" #include "tcpconn.h" #include "tcpsend.h" #include "tcprcv.h" #include "tlcommon.h" #include "info.h" #include "tcpcfg.h" #include "secfltr.h" #include "tcpipbuf.h" #include "mdlpool.h" #include "pplasl.h" #if GPC #include "qos.h" #include "traffic.h" #include "gpcifc.h" #include "ntddtc.h" extern GPC_HANDLE hGpcClient[GPC_CF_MAX]; extern ULONG GpcCfCounts[GPC_CF_MAX]; extern GPC_EXPORTED_CALLS GpcEntries; extern ULONG GPCcfInfo; #endif NTSTATUS GetIFAndLink(void *Rce, ULONG * IFIndex, IPAddr * NextHop); extern ulong DisableUserTOSSetting; uint MaxSendSegments = 64; #if MILLEN uint DisableLargeSendOffload = 1; #else // MILLEN uint DisableLargeSendOffload = 0; #endif // !MILLEN #if DBG ulong DbgDcProb = 0; ulong DbgTcpSendHwChksumCount = 0; #endif extern HANDLE TcpRequestPool; extern CTELock *pTWTCBTableLock; extern CACHE_LINE_KSPIN_LOCK RequestCompleteListLock; extern uint TcpHostOpts; extern uint TcpHostSendOpts; #define ALIGNED_SACK_OPT_SIZE 4+8*4 //Maximum 4 sack blocks of 2longword each+sack opt itself void ClassifyPacket(TCB *SendTCB); void TCPFastSend(TCB * SendTCB, PNDIS_BUFFER in_SendBuf, uint in_SendOfs, TCPSendReq * in_SendReq, uint in_SendSize, SeqNum NextSeq, int in_ToBeSent); void *TCPProtInfo; // TCP protocol info for IP. NDIS_HANDLE TCPSendBufferPool; USHORT TcpHeaderBufferSize; HANDLE TcpHeaderPool; extern IPInfo LocalNetInfo; // // All of the init code can be discarded. 
// int InitTCPSend(void); void UnInitTCPSend(void); #ifdef ALLOC_PRAGMA #pragma alloc_text(INIT, InitTCPSend) #pragma alloc_text(INIT, UnInitTCPSend) #endif extern void ResetSendNext(TCB * SeqTCB, SeqNum NewSeq); extern NTSTATUS TCPPnPPowerRequest(void *ipContext, IPAddr ipAddr, NDIS_HANDLE handle, PNET_PNP_EVENT netPnPEvent); extern void TCPElistChangeHandler(void); //* GetTCPHeader - Get a TCP header buffer. // // Called when we need to get a TCP header buffer. This routine is // specific to the particular environment (VxD or NT). All we // need to do is pop the buffer from the free list. // // Input: Nothing. // // Returns: Pointer to an NDIS buffer, or NULL is none. // PNDIS_BUFFER GetTCPHeaderAtDpcLevel(TCPHeader **Header) { PNDIS_BUFFER Buffer; #if DBG *Header = NULL; #endif Buffer = MdpAllocateAtDpcLevel(TcpHeaderPool, Header); if (Buffer) { ASSERT(*Header); NdisAdjustBufferLength(Buffer, sizeof(TCPHeader)); #if BACK_FILL ASSERT(Buffer->ByteOffset >= 40); *Header = (TCPHeader*)((ULONG_PTR)(*Header) + MAX_BACKFILL_HDR_SIZE); Buffer->MappedSystemVa = (PVOID)((ULONG_PTR)Buffer->MappedSystemVa + MAX_BACKFILL_HDR_SIZE); Buffer->ByteOffset += MAX_BACKFILL_HDR_SIZE; Buffer->MdlFlags |= MDL_NETWORK_HEADER; #endif } return Buffer; } #if MILLEN #define GetTCPHeader GetTCPHeaderAtDpcLevel #else __inline PNDIS_BUFFER GetTCPHeader(TCPHeader **Header) { KIRQL OldIrql; PNDIS_BUFFER Buffer; OldIrql = KeRaiseIrqlToDpcLevel(); Buffer = GetTCPHeaderAtDpcLevel(Header); KeLowerIrql(OldIrql); return Buffer; } #endif //* FreeTCPHeader - Free a TCP header buffer. // // Called to free a TCP header buffer. // // Input: Buffer to be freed. // // Returns: Nothing. 
// __inline VOID FreeTCPHeader(PNDIS_BUFFER Buffer) { NdisAdjustBufferLength(Buffer, TcpHeaderBufferSize); #if BACK_FILL Buffer->MappedSystemVa = (PVOID)((ULONG_PTR)Buffer->MappedSystemVa - MAX_BACKFILL_HDR_SIZE); Buffer->ByteOffset -= MAX_BACKFILL_HDR_SIZE; #endif MdpFree(Buffer); } //* FreeSendReq - Free a send request structure. // // Called to free a send request structure. // // Input: FreedReq - Connection request structure to be freed. // // Returns: Nothing. // __inline void FreeSendReq(TCPSendReq *Request) { PplFree(TcpRequestPool, Request); } //* GetSendReq - Get a send request structure. // // Called to get a send request structure. // // Input: Nothing. // // Returns: Pointer to SendReq structure, or NULL if none. // __inline TCPSendReq * GetSendReq(VOID) { TCPSendReq *Request; LOGICAL FromList; Request = PplAllocate(TcpRequestPool, &FromList); if (Request) { #if DBG Request->tsr_req.tr_sig = tr_signature; Request->tsr_sig = tsr_signature; #endif } return Request; } //* TCPSendComplete - Complete a TCP send. // // Called by IP when a send we've made is complete. We free the buffer, // and possibly complete some sends. Each send queued on a TCB has a ref. // count with it, which is the number of times a pointer to a buffer // associated with the send has been passed to the underlying IP layer. We // can't complete a send until that count it 0. If this send was actually // from a send of data, we'll go down the chain of send and decrement the // refcount on each one. If we have one going to 0 and the send has already // been acked we'll complete the send. If it hasn't been acked we'll leave // it until the ack comes in. // // NOTE: We aren't protecting any of this with locks. When we port this to // NT we'll need to fix this, probably with a global lock. See the comments // in ACKSend() in TCPRCV.C for more details. // // Input: Context - Context we gave to IP. // BufferChain - BufferChain for send. // // Returns: Nothing. 
//
//  SendStatus is part of the completion signature but is not examined by
//  this routine. Context may be NULL, in which case only the header buffer
//  at the front of BufferChain is freed.
//
void
TCPSendComplete(void *Context, PNDIS_BUFFER BufferChain, IP_STATUS SendStatus)
{
    BOOLEAN DoRcvComplete = FALSE;
    PNDIS_BUFFER CurrentBuffer;

    if (Context != NULL) {
        SendCmpltContext *SCContext = (SendCmpltContext *) Context;
        TCPSendReq *CurrentSend;
        uint i;

        CTEStructAssert(SCContext, scc);

        // A non-NULL scc_LargeSend identifies the TCB that issued this send
        // as a large send; handle any partially-sent data under its lock.
        if (SCContext->scc_LargeSend) {
            TCB *LargeSendTCB = SCContext->scc_LargeSend;
            CTELockHandle TCBHandle;

            CTEGetLock(&LargeSendTCB->tcb_lock, &TCBHandle);

            IF_TCPDBG(TCP_DEBUG_OFFLOAD) {
                KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"TCPSendComplete: tcb %x sent %d of %d una %u "
                         "next %u unacked %u\n", LargeSendTCB,
                         SCContext->scc_ByteSent, SCContext->scc_SendSize,
                         LargeSendTCB->tcb_senduna, LargeSendTCB->tcb_sendnext,
                         LargeSendTCB->tcb_unacked));
            }

            if (SCContext->scc_ByteSent < SCContext->scc_SendSize) {
                uint BytesNotSent = SCContext->scc_SendSize -
                SCContext->scc_ByteSent;
                SeqNum Next = LargeSendTCB->tcb_sendnext;

                IF_TCPDBG(TCP_DEBUG_OFFLOAD) {
                    KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"TCPSendComplete: unsent %d\n",
                             SCContext->scc_SendSize-SCContext->scc_ByteSent));
                }

                // Back sendnext up over the bytes that were not sent, but
                // only if the resulting sequence number lies in
                // [senduna, sendnext).
                if (SEQ_GTE((Next - BytesNotSent), LargeSendTCB->tcb_senduna) &&
                    SEQ_LT((Next - BytesNotSent), LargeSendTCB->tcb_sendnext)) {
                    ResetSendNext(LargeSendTCB, (Next - BytesNotSent));
                }
            }

#if DBG
            LargeSendTCB->tcb_LargeSend--;
#endif

            // Request more output if there is still unacknowledged data.
            if (LargeSendTCB->tcb_unacked)
                DelayAction(LargeSendTCB, NEED_OUTPUT);

            // NOTE(review): DerefTCB is handed the lock handle, so it
            // presumably releases tcb_lock - confirm against its definition.
            DerefTCB(LargeSendTCB, TCBHandle);
        }

        // First, loop through and free any NDIS buffers here that need to be
        // freed. We'll skip any 'user' buffers, and then free our buffers. We
        // need to do this before decrementing the reference count to avoid
        // destroying the buffer chain if we have to zap tsr_lastbuf->Next to
        // NULL.
        //
        // The first buffer in the chain is the TCP header; it is freed at the
        // end of this routine, so start with the buffer after it.
        CurrentBuffer = NDIS_BUFFER_LINKAGE(BufferChain);
        for (i = 0; i < (uint) SCContext->scc_ubufcount; i++) {
            ASSERT(CurrentBuffer != NULL);
            CurrentBuffer = NDIS_BUFFER_LINKAGE(CurrentBuffer);
        }

        for (i = 0; i < (uint) SCContext->scc_tbufcount; i++) {
            PNDIS_BUFFER TempBuffer;

            ASSERT(CurrentBuffer != NULL);

            // Advance before freeing so we never read a freed buffer's link.
            TempBuffer = CurrentBuffer;
            CurrentBuffer = NDIS_BUFFER_LINKAGE(CurrentBuffer);
            NdisFreeBuffer(TempBuffer);
        }

        // Walk the scc_count send requests referenced by this transmit,
        // starting at scc_firstsend, dropping one reference from each.
        CurrentSend = SCContext->scc_firstsend;

        i = 0;
        while (i < SCContext->scc_count) {
            Queue *TempQ;
            long Result;
            uint SendReqFlags;

            // Capture the next-queue link and the flags before touching the
            // refcount: the request may be freed below, after which
            // CurrentSend must not be dereferenced.
            TempQ = QNEXT(&CurrentSend->tsr_req.tr_q);
            SendReqFlags = CurrentSend->tsr_flags;

            CTEStructAssert(CurrentSend, tsr);

            Result = CTEInterlockedDecrementLong(&(CurrentSend->tsr_refcnt));

            ASSERT(Result >= 0);

            // Complete when the count reaches 0, or when it reaches 1 on a
            // request flagged TSR_FLAG_SEND_AND_DISC.
            if ((Result <= 0) ||
                ((SendReqFlags & TSR_FLAG_SEND_AND_DISC) && (Result == 1))) {
                TCPReq *Req;

                // Reference count has gone to 0 which means the send has
                // been ACK'd or cancelled. Complete it now.

                // If we've sent directly from this send, NULL out the next
                // pointer for the last buffer in the chain.
                if (CurrentSend->tsr_lastbuf != NULL) {
                    NDIS_BUFFER_LINKAGE(CurrentSend->tsr_lastbuf) = NULL;
                    CurrentSend->tsr_lastbuf = NULL;
                }

                // Report the full size only for a successful send; any other
                // status reports 0 bytes.
                Req = &CurrentSend->tsr_req;
                (*Req->tr_rtn)(Req->tr_context, Req->tr_status,
                               Req->tr_status == TDI_SUCCESS ? CurrentSend->tsr_size : 0);
                FreeSendReq(CurrentSend);
                DoRcvComplete = TRUE;
            }

            CurrentSend = STRUCT_OF(TCPSendReq, QSTRUCT(TCPReq, TempQ, tr_q), tsr_req);

            i++;
        }
    }

    // The header buffer at the front of the chain is always ours to free.
    FreeTCPHeader(BufferChain);

    // If at least one request was completed, run TCPRcvComplete at DPC level
    // (skipped when PartitionedDelayQ is set).
    if (DoRcvComplete && !PartitionedDelayQ) {
        KIRQL Irql = KeRaiseIrqlToDpcLevel();
        TCPRcvComplete();
        KeLowerIrql(Irql);
    }
}

//* RcvWin - Figure out the receive window to offer in an ack.
//
//  A routine to figure out what window to offer on a connection. We
//  take into account SWS avoidance, what the default connection window is,
//  and what the last window we offered is.
//
//  Input:  WinTCB          - TCB on which to perform calculations.
//
//  Returns: Window to be offered.
// uint RcvWin(TCB * WinTCB) { int CouldOffer; // The window size we could offer. CTEStructAssert(WinTCB, tcb); CheckRBList(WinTCB->tcb_pendhead, WinTCB->tcb_pendingcnt); ASSERT(WinTCB->tcb_rcvwin >= 0); CouldOffer = WinTCB->tcb_defaultwin - WinTCB->tcb_pendingcnt; ASSERT(CouldOffer >= 0); ASSERT(CouldOffer >= WinTCB->tcb_rcvwin); if ((CouldOffer - WinTCB->tcb_rcvwin) >= (int)MIN(WinTCB->tcb_defaultwin / 2, WinTCB->tcb_mss)) WinTCB->tcb_rcvwin = CouldOffer; return WinTCB->tcb_rcvwin; } //* SendSYNOnSynTCB - Send a SYN segment for syntcb // // This is called during connection establishment time to send a SYN // segment to the peer. We get a buffer if we can, and then fill // it in. There's a tricky part here where we have to build the MSS // option in the header - we find the MSS by finding the MSS offered // by the net for the local address. After that, we send it. // // Input: SYNTcb - TCB from which SYN is to be sent. // // Returns: Nothing. // void SendSYNOnSynTCB(SYNTCB * SYNTcb, CTELockHandle TCBHandle) { PNDIS_BUFFER HeaderBuffer; TCPHeader *SYNHeader; uchar *OptPtr; IP_STATUS SendStatus; ushort OptSize = 0, HdrSize = 0; BOOLEAN SackOpt = FALSE; IPOptInfo OptInfo; CTEStructAssert(SYNTcb, syntcb); HeaderBuffer = GetTCPHeaderAtDpcLevel(&SYNHeader); // Go ahead and set the retransmission timer now, in case we didn't get a // buffer. In the future we might want to queue the connection for // when we free a buffer. 
START_TCB_TIMER(SYNTcb->syntcb_rexmittimer, SYNTcb->syntcb_rexmit); // The Rexmit interval has to be doubled here SYNTcb->syntcb_rexmit = MIN(SYNTcb->syntcb_rexmit << 1, MAX_REXMIT_TO); if (HeaderBuffer != NULL) { ushort TempWin; ushort MSS; uchar FoundMSS; SYNHeader = (TCPHeader *) ((PUCHAR)SYNHeader + LocalNetInfo.ipi_hsize); NDIS_BUFFER_LINKAGE(HeaderBuffer) = NULL; if (SYNTcb->syntcb_tcpopts & TCP_FLAG_WS) { OptSize += WS_OPT_SIZE + 1; // 1 NOP for alignment } if (SYNTcb->syntcb_tcpopts & TCP_FLAG_TS) { OptSize += TS_OPT_SIZE + 2; // 2 NOPs for alignment } if (SYNTcb->syntcb_tcpopts & TCP_FLAG_SACK){ SackOpt = TRUE; OptSize += 4; // 2 NOPS, SACK kind and length field } NdisAdjustBufferLength(HeaderBuffer, sizeof(TCPHeader) + MSS_OPT_SIZE + OptSize); SYNHeader->tcp_src = SYNTcb->syntcb_sport; SYNHeader->tcp_dest = SYNTcb->syntcb_dport; SYNHeader->tcp_seq = net_long(SYNTcb->syntcb_sendnext); SYNTcb->syntcb_sendnext++; if (SYNTcb->syntcb_rexmitcnt == 0) { TCPSIncrementOutSegCount(); } else TStats.ts_retranssegs++; SYNHeader->tcp_ack = net_long(SYNTcb->syntcb_rcvnext); // Reuse OPt size for header size determination // default is MSS amd tcp header size HdrSize = 6; // set size field to reflect TS and WND scale option // tcp header + windowscale + Timestamp + pad if (SYNTcb->syntcb_tcpopts & TCP_FLAG_WS) { // WS: Add one more long word HdrSize += 1; } if (SYNTcb->syntcb_tcpopts & TCP_FLAG_TS) { // TS: Add 3 more long words HdrSize += 3; } if (SackOpt) { // SACK: Add 1 more long word HdrSize += 1; } SYNHeader->tcp_flags = MAKE_TCP_FLAGS(HdrSize, TCP_FLAG_SYN | TCP_FLAG_ACK); if (SYNTcb->syntcb_defaultwin <= TCP_MAXWIN) { TempWin = (ushort)SYNTcb->syntcb_defaultwin; } else { // Don't apply the scale-factor in a SYN segment. // Instead, advertise the largest window possible. 
TempWin = TCP_MAXWIN; } SYNHeader->tcp_window = net_short(TempWin); SYNHeader->tcp_urgent = 0; SYNHeader->tcp_xsum = 0; OptPtr = (uchar *) (SYNHeader + 1); FoundMSS = (*LocalNetInfo.ipi_getlocalmtu) (SYNTcb->syntcb_saddr, &MSS); if (!FoundMSS) { CTEFreeLock(&SYNTcb->syntcb_lock, TCBHandle); FreeTCPHeader(HeaderBuffer); return; } MSS -= sizeof(TCPHeader); SYNTcb->syntcb_mss = MSS; *OptPtr++ = TCP_OPT_MSS; *OptPtr++ = MSS_OPT_SIZE; **(ushort **) & OptPtr = net_short(MSS); OptPtr++; OptPtr++; if (SYNTcb->syntcb_tcpopts & TCP_FLAG_WS) { // Fill in the WS option headers and value *OptPtr++ = TCP_OPT_NOP; *OptPtr++ = TCP_OPT_WS; *OptPtr++ = WS_OPT_SIZE; //Initial window scale factor *OptPtr++ = (uchar) SYNTcb->syntcb_rcvwinscale; } if (SYNTcb->syntcb_tcpopts & TCP_FLAG_TS) { //Start loading time stamp option header and value *OptPtr++ = TCP_OPT_NOP; *OptPtr++ = TCP_OPT_NOP; *OptPtr++ = TCP_OPT_TS; *OptPtr++ = TS_OPT_SIZE; // Initialize TS value TSval *(long *)OptPtr = 0; OptPtr += 4; //Initialize TS Echo Reply TSecr *(long *)OptPtr = 0; OptPtr += 4; } if (SackOpt) { // Initialize with SACK_PERMITTED option *(long *)OptPtr = net_long(0x01010402); IF_TCPDBG(TCP_DEBUG_SACK) { KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"Sending SACK_OPT %x\n", SYNTcb)); } } SYNTcb->syntcb_refcnt++; // Account for Options. 
(*LocalNetInfo.ipi_initopts) (&OptInfo); OptInfo.ioi_ttl = SYNTcb->syntcb_ttl; SYNHeader->tcp_xsum = ~XsumSendChain(PHXSUM(SYNTcb->syntcb_saddr, SYNTcb->syntcb_daddr, PROTOCOL_TCP, 0) + (uint)net_short(sizeof(TCPHeader) + MSS_OPT_SIZE + OptSize), HeaderBuffer); CTEFreeLock(&SYNTcb->syntcb_lock, TCBHandle); SendStatus = (*LocalNetInfo.ipi_xmit)(TCPProtInfo, NULL, HeaderBuffer, sizeof(TCPHeader) + MSS_OPT_SIZE + OptSize, SYNTcb->syntcb_daddr, SYNTcb->syntcb_saddr, &OptInfo, NULL, PROTOCOL_TCP, NULL); if (SendStatus != IP_PENDING) { FreeTCPHeader(HeaderBuffer); } CTEGetLock(&SYNTcb->syntcb_lock, &TCBHandle); DerefSynTCB(SYNTcb, TCBHandle); } else { SYNTcb->syntcb_sendnext++; CTEFreeLock(&SYNTcb->syntcb_lock, TCBHandle); return; } } //* SendSYN - Send a SYN segment. // // This is called during connection establishment time to send a SYN // segment to the peer. We get a buffer if we can, and then fill // it in. There's a tricky part here where we have to build the MSS // option in the header - we find the MSS by finding the MSS offered // by the net for the local address. After that, we send it. // // Input: SYNTcb - TCB from which SYN is to be sent. // TCBHandle - Handle for lock on TCB. // // Returns: Nothing. // void SendSYN(TCB * SYNTcb, CTELockHandle TCBHandle) { PNDIS_BUFFER HeaderBuffer; TCPHeader *SYNHeader; uchar *OptPtr; IP_STATUS SendStatus; ushort OptSize = 0, HdrSize = 0, rfc1323opts = 0; BOOLEAN SackOpt = FALSE; CTEStructAssert(SYNTcb, tcb); HeaderBuffer = GetTCPHeaderAtDpcLevel(&SYNHeader); // Go ahead and set the retransmission timer now, in case we didn't get a // buffer. In the future we might want to queue the connection for // when we free a buffer. 
START_TCB_TIMER_R(SYNTcb, RXMIT_TIMER, SYNTcb->tcb_rexmit); if (HeaderBuffer != NULL) { ushort TempWin; ushort MSS; uchar FoundMSS; SYNHeader = (TCPHeader *) ((PUCHAR)SYNHeader + LocalNetInfo.ipi_hsize); NDIS_BUFFER_LINKAGE(HeaderBuffer) = NULL; // If we are doing active open, check if we are configured to do // window scaling and time stamp options if ((((TcpHostSendOpts & TCP_FLAG_WS) || SYNTcb->tcb_rcvwinscale) && SYNTcb->tcb_state == TCB_SYN_SENT) || (SYNTcb->tcb_tcpopts & TCP_FLAG_WS)) { rfc1323opts |= TCP_FLAG_WS; IF_TCPDBG(TCP_DEBUG_1323) { KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"Selected WS option TCB %x\n", SYNTcb)); } } if (((TcpHostSendOpts & TCP_FLAG_TS) && (SYNTcb->tcb_state == TCB_SYN_SENT)) || (SYNTcb->tcb_tcpopts & TCP_FLAG_TS)) { IF_TCPDBG(TCP_DEBUG_1323) { KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"Selected TS option TCB %x\n", SYNTcb)); } rfc1323opts |= TCP_FLAG_TS; } if (rfc1323opts & TCP_FLAG_WS) { OptSize += WS_OPT_SIZE + 1; // 1 NOP for alignment } if (rfc1323opts & TCP_FLAG_TS) { OptSize += TS_OPT_SIZE + 2; // 2 NOPs for alignment } if ((SYNTcb->tcb_tcpopts & TCP_FLAG_SACK) || ((SYNTcb->tcb_state == TCB_SYN_SENT) && (TcpHostOpts & TCP_FLAG_SACK))) { SackOpt = TRUE; OptSize += 4; // 2 NOPS, SACK kind and length field } NdisAdjustBufferLength(HeaderBuffer, sizeof(TCPHeader) + MSS_OPT_SIZE + OptSize); SYNHeader->tcp_src = SYNTcb->tcb_sport; SYNHeader->tcp_dest = SYNTcb->tcb_dport; SYNHeader->tcp_seq = net_long(SYNTcb->tcb_sendnext); SYNTcb->tcb_sendnext++; if (SEQ_GT(SYNTcb->tcb_sendnext, SYNTcb->tcb_sendmax)) { TCPSIncrementOutSegCount(); SYNTcb->tcb_sendmax = SYNTcb->tcb_sendnext; } else TStats.ts_retranssegs++; SYNHeader->tcp_ack = net_long(SYNTcb->tcb_rcvnext); // Reuse OPt size for header size determination // default is MSS amd tcp header size HdrSize = 6; // set size field to reflect TS and WND scale option // tcp header + windowscale + Timestamp + pad if (rfc1323opts & TCP_FLAG_WS) { // WS: Add one more long word HdrSize += 
1; } if (rfc1323opts & TCP_FLAG_TS) { // TS: Add 3 more long words HdrSize += 3; } if (SackOpt) { // SACK: Add 1 more long word HdrSize += 1; } if (SYNTcb->tcb_state == TCB_SYN_RCVD) { SYNHeader->tcp_flags = MAKE_TCP_FLAGS(HdrSize, TCP_FLAG_SYN | TCP_FLAG_ACK); } else { SYNHeader->tcp_flags = MAKE_TCP_FLAGS(HdrSize, TCP_FLAG_SYN); } SYNTcb->tcb_lastack = SYNTcb->tcb_rcvnext; if (SYNTcb->tcb_rcvwin <= TCP_MAXWIN) { TempWin = (ushort)SYNTcb->tcb_rcvwin; } else { // Don't apply the scale-factor in a SYN segment. // Instead, advertise the largest window possible. TempWin = TCP_MAXWIN; } SYNHeader->tcp_window = net_short(TempWin); SYNHeader->tcp_urgent = 0; SYNHeader->tcp_xsum = 0; OptPtr = (uchar *) (SYNHeader + 1); FoundMSS = (*LocalNetInfo.ipi_getlocalmtu) (SYNTcb->tcb_saddr, &MSS); if (!FoundMSS) { CTEFreeLock(&SYNTcb->tcb_lock, TCBHandle); FreeTCPHeader(HeaderBuffer); return; } MSS -= sizeof(TCPHeader); *OptPtr++ = TCP_OPT_MSS; *OptPtr++ = MSS_OPT_SIZE; **(ushort **) & OptPtr = net_short(MSS); OptPtr++; OptPtr++; if (rfc1323opts & TCP_FLAG_WS) { // Fill in the WS option headers and value *OptPtr++ = TCP_OPT_NOP; *OptPtr++ = TCP_OPT_WS; *OptPtr++ = WS_OPT_SIZE; // Initial window scale factor *OptPtr++ = (uchar) SYNTcb->tcb_rcvwinscale; } if (rfc1323opts & TCP_FLAG_TS) { // Start loading time stamp option header and value *OptPtr++ = TCP_OPT_NOP; *OptPtr++ = TCP_OPT_NOP; *OptPtr++ = TCP_OPT_TS; *OptPtr++ = TS_OPT_SIZE; // Initialize TS value TSval *(long *)OptPtr = 0; OptPtr += 4; // Initialize TS Echo Reply TSecr *(long *)OptPtr = 0; OptPtr += 4; } if (SackOpt) { // Initialize with SACK_PERMITTED option *(long *)OptPtr = net_long(0x01010402); IF_TCPDBG(TCP_DEBUG_SACK) { KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"Sending SACK_OPT %x\n", SYNTcb)); } } REFERENCE_TCB(SYNTcb); // Account for Options. 
SYNTcb->tcb_opt.ioi_TcpChksum = 0; SYNHeader->tcp_xsum = ~XsumSendChain(SYNTcb->tcb_phxsum + (uint)net_short(sizeof(TCPHeader) + MSS_OPT_SIZE + OptSize), HeaderBuffer); ClassifyPacket(SYNTcb); CTEFreeLock(&SYNTcb->tcb_lock, TCBHandle); SendStatus = (*LocalNetInfo.ipi_xmit)(TCPProtInfo, NULL, HeaderBuffer, sizeof(TCPHeader) + MSS_OPT_SIZE + OptSize, SYNTcb->tcb_daddr, SYNTcb->tcb_saddr, &SYNTcb->tcb_opt, SYNTcb->tcb_rce, PROTOCOL_TCP, NULL); SYNTcb->tcb_error = SendStatus; if (SendStatus != IP_PENDING) { FreeTCPHeader(HeaderBuffer); } CTEGetLock(&SYNTcb->tcb_lock, &TCBHandle); DerefTCB(SYNTcb, TCBHandle); } else { SYNTcb->tcb_sendnext++; if (SEQ_GT(SYNTcb->tcb_sendnext, SYNTcb->tcb_sendmax)) SYNTcb->tcb_sendmax = SYNTcb->tcb_sendnext; CTEFreeLock(&SYNTcb->tcb_lock, TCBHandle); return; } } //* SendKA - Send a keep alive segment. // // This is called when we want to send a keep alive. // // Input: KATcb - TCB from which keep alive is to be sent. // Handle - Handle for lock on TCB. // // Returns: Nothing. // void SendKA(TCB * KATcb, CTELockHandle Handle) { PNDIS_BUFFER HeaderBuffer; TCPHeader *Header; IP_STATUS SendStatus; CTEStructAssert(KATcb, tcb); HeaderBuffer = GetTCPHeaderAtDpcLevel(&Header); if (HeaderBuffer != NULL) { ushort TempWin; SeqNum TempSeq; Header = (TCPHeader *) ((PUCHAR) Header + LocalNetInfo.ipi_hsize); NDIS_BUFFER_LINKAGE(HeaderBuffer) = NULL; NdisAdjustBufferLength(HeaderBuffer, sizeof(TCPHeader) + 1); Header->tcp_src = KATcb->tcb_sport; Header->tcp_dest = KATcb->tcb_dport; TempSeq = KATcb->tcb_senduna - 1; Header->tcp_seq = net_long(TempSeq); TStats.ts_retranssegs++; Header->tcp_ack = net_long(KATcb->tcb_rcvnext); Header->tcp_flags = MAKE_TCP_FLAGS(5, TCP_FLAG_ACK); // Initialize the single byte that we're sending. 
*(uchar*)(Header + 1) = 0; // We need to scale the rcv window // Use temprary variable to workaround truncation // caused by net_short TempWin = (ushort) (RcvWin(KATcb) >> KATcb->tcb_rcvwinscale); Header->tcp_window = net_short(TempWin); Header->tcp_urgent = 0; KATcb->tcb_lastack = KATcb->tcb_rcvnext; Header->tcp_xsum = 0; KATcb->tcb_opt.ioi_TcpChksum = 0; Header->tcp_xsum = ~XsumSendChain(KATcb->tcb_phxsum + (uint)net_short(sizeof(TCPHeader) + 1), HeaderBuffer); KATcb->tcb_kacount++; ClassifyPacket(KATcb); REFERENCE_TCB(KATcb); CTEFreeLock(&KATcb->tcb_lock, Handle); SendStatus = (*LocalNetInfo.ipi_xmit)(TCPProtInfo, NULL, HeaderBuffer, sizeof(TCPHeader) + 1, KATcb->tcb_daddr, KATcb->tcb_saddr, &KATcb->tcb_opt, KATcb->tcb_rce, PROTOCOL_TCP, NULL); if (SendStatus != IP_PENDING) { FreeTCPHeader(HeaderBuffer); } CTEGetLock(&KATcb->tcb_lock, &Handle); DerefTCB(KATcb, Handle); } else { CTEFreeLock(&KATcb->tcb_lock, Handle); } } //* SendACK - Send an ACK segment. // // This is called whenever we need to send an ACK for some reason. Nothing // fancy, we just do it. // // Input: ACKTcb - TCB from which ACK is to be sent. // // Returns: Nothing. // void SendACK(TCB * ACKTcb) { PNDIS_BUFFER HeaderBuffer; TCPHeader *ACKHeader; IP_STATUS SendStatus; CTELockHandle TCBHandle; SeqNum SendNext; ushort SackLength = 0, i, hdrlen = 5; ulong *ts_opt; BOOLEAN HWChksum = FALSE; CTEStructAssert(ACKTcb, tcb); HeaderBuffer = GetTCPHeader(&ACKHeader); if (HeaderBuffer != NULL) { ushort TempWin; ushort Size; ACKHeader = (TCPHeader *) ((PUCHAR) ACKHeader + LocalNetInfo.ipi_hsize); CTEGetLock(&ACKTcb->tcb_lock, &TCBHandle); // Allow room for filling time stamp option. 
// Note that it is 12 bytes and will never ever change if (ACKTcb->tcb_tcpopts & TCP_FLAG_TS) { NdisAdjustBufferLength(HeaderBuffer, sizeof(TCPHeader) + ALIGNED_TS_OPT_SIZE); // Header length is multiple of 32bits hdrlen = 5 + 3; // standard header size + // header size requirement for TS option ACKTcb->tcb_lastack = ACKTcb->tcb_rcvnext; } if ((ACKTcb->tcb_tcpopts & TCP_FLAG_SACK) && ACKTcb->tcb_SackBlock && (ACKTcb->tcb_SackBlock->Mask[0] == 1)) { SackLength++; for (i = 1; i < 3; i++) { if (ACKTcb->tcb_SackBlock->Mask[i] == 1) SackLength++; } IF_TCPDBG(TCP_DEBUG_SACK) { KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"Sending SACKs!! %x %x\n", ACKTcb, SackLength)); } NdisAdjustBufferLength(HeaderBuffer, NdisBufferLength(HeaderBuffer) + SackLength * 8 + 4); // Sack block is of 2 long words (8 bytes) and 4 bytes // is for Sack option header. hdrlen += ((SackLength * 8 + 4) >> 2); } NDIS_BUFFER_LINKAGE(HeaderBuffer) = NULL; ACKHeader->tcp_src = ACKTcb->tcb_sport; ACKHeader->tcp_dest = ACKTcb->tcb_dport; ACKHeader->tcp_ack = net_long(ACKTcb->tcb_rcvnext); // If the remote peer is advertising a window of zero, we need to // send this ack with a seq. number of his rcv_next (which in that case // should be our senduna). We have code here ifdef'd out that makes // sure that we don't send outside the RWE, but this doesn't work. We // need to be able to send a pure ACK exactly at the RWE. 
if (ACKTcb->tcb_sendwin != 0) { SendNext = ACKTcb->tcb_sendnext; } else SendNext = ACKTcb->tcb_senduna; if ((ACKTcb->tcb_flags & FIN_SENT) && SEQ_EQ(SendNext, ACKTcb->tcb_sendmax - 1)) { ACKHeader->tcp_flags = MAKE_TCP_FLAGS(hdrlen, TCP_FLAG_FIN | TCP_FLAG_ACK); } else ACKHeader->tcp_flags = MAKE_TCP_FLAGS(hdrlen, TCP_FLAG_ACK); ACKHeader->tcp_seq = net_long(SendNext); TempWin = (ushort) (RcvWin(ACKTcb) >> ACKTcb->tcb_rcvwinscale); ACKHeader->tcp_window = net_short(TempWin); ACKHeader->tcp_urgent = 0; ACKHeader->tcp_xsum = 0; Size = sizeof(TCPHeader); // Point to a place beyond tcp header ts_opt = (ulong *)((uchar *) ACKHeader + 20); if (ACKTcb->tcb_tcpopts & TCP_FLAG_TS) { // Form time stamp header with 2 NOPs for alignment *ts_opt++ = net_long(0x0101080A); *ts_opt++ = net_long(TCPTime); *ts_opt++ = net_long(ACKTcb->tcb_tsrecent); // Add 12 more bytes to the size to account for TS Size += ALIGNED_TS_OPT_SIZE; } if ((ACKTcb->tcb_tcpopts & TCP_FLAG_SACK) && ACKTcb->tcb_SackBlock && (ACKTcb->tcb_SackBlock->Mask[0] == 1)) { ushort* UshortPtr; uchar* UcharPtr; UshortPtr = (ushort *)ts_opt; *UshortPtr = 0x0101; ts_opt = (ulong *)((uchar *)ts_opt + 2); UcharPtr = (uchar *)ts_opt; *UcharPtr = (uchar)0x05; ts_opt = (ulong *)((uchar *)ts_opt + 1); UcharPtr = (uchar *)ts_opt; *UcharPtr = (uchar) SackLength * 8 + 2; ts_opt = (ulong *)((uchar *)ts_opt + 1); // Sack option header + the block times times sack length! 
Size += 4 + SackLength * 8; for (i = 0; i < 3; i++) { if (ACKTcb->tcb_SackBlock->Mask[i] != 0) { *ts_opt++ = net_long(ACKTcb->tcb_SackBlock->Block[i].begin); *ts_opt++ = net_long(ACKTcb->tcb_SackBlock->Block[i].end); } } } if (ACKTcb->tcb_rce && (ACKTcb->tcb_rce->rce_OffloadFlags & TCP_XMT_CHECKSUM_OFFLOAD)) { HWChksum = TRUE; if ((Size > sizeof(TCPHeader)) && !(ACKTcb->tcb_rce->rce_OffloadFlags & TCP_CHECKSUM_OPT_OFFLOAD)) { HWChksum = FALSE; } } if (HWChksum) { uint PHXsum = ACKTcb->tcb_phxsum + (uint) net_short(Size); PHXsum = (((PHXsum << 16) | (PHXsum >> 16)) + PHXsum) >> 16; ACKHeader->tcp_xsum = (ushort) PHXsum; ACKTcb->tcb_opt.ioi_TcpChksum = 1; #if DBG DbgTcpSendHwChksumCount++; #endif } else { ACKHeader->tcp_xsum = ~XsumSendChain(ACKTcb->tcb_phxsum + (uint)net_short(Size), HeaderBuffer); ACKTcb->tcb_opt.ioi_TcpChksum = 0; } STOP_TCB_TIMER_R(ACKTcb, DELACK_TIMER); ACKTcb->tcb_rcvdsegs = 0; ACKTcb->tcb_flags &= ~(NEED_ACK | ACK_DELAYED); ClassifyPacket(ACKTcb); CTEFreeLock(&ACKTcb->tcb_lock, TCBHandle); TCPSIncrementOutSegCount(); if (ACKTcb->tcb_tcpopts) { SendStatus = (*LocalNetInfo.ipi_xmit)(TCPProtInfo, NULL, HeaderBuffer, Size, ACKTcb->tcb_daddr, ACKTcb->tcb_saddr, &ACKTcb->tcb_opt, ACKTcb->tcb_rce, PROTOCOL_TCP, NULL); } else { SendStatus = (*LocalNetInfo.ipi_xmit)(TCPProtInfo, NULL, HeaderBuffer, sizeof(TCPHeader), ACKTcb->tcb_daddr, ACKTcb->tcb_saddr, &ACKTcb->tcb_opt, ACKTcb->tcb_rce, PROTOCOL_TCP, NULL); } ACKTcb->tcb_error = SendStatus; if (SendStatus != IP_PENDING) FreeTCPHeader(HeaderBuffer); } return; } //* SendTWtcbACK- Send an ACK segment for a twtcb // // // Input: ACKTcb - TCB from which ACK is to be sent. // // Returns: Nothing. 
// void SendTWtcbACK(TWTCB *ACKTcb, uint Partition, CTELockHandle TCBHandle) { PNDIS_BUFFER HeaderBuffer; TCPHeader *ACKHeader; IP_STATUS SendStatus; SeqNum SendNext; ushort hdrlen = 5; uint phxsum; CTEStructAssert(ACKTcb, twtcb); HeaderBuffer = GetTCPHeaderAtDpcLevel(&ACKHeader); if (HeaderBuffer != NULL) { ushort Size; IPOptInfo NewInfo; ACKHeader = (TCPHeader *)((PUCHAR)ACKHeader + LocalNetInfo.ipi_hsize); NDIS_BUFFER_LINKAGE(HeaderBuffer) = NULL; ACKHeader->tcp_src = ACKTcb->twtcb_sport; ACKHeader->tcp_dest = ACKTcb->twtcb_dport; ACKHeader->tcp_ack = net_long(ACKTcb->twtcb_rcvnext); SendNext = ACKTcb->twtcb_sendnext; ACKHeader->tcp_flags = MAKE_TCP_FLAGS(hdrlen, TCP_FLAG_ACK); ACKHeader->tcp_seq = net_long(SendNext); // Window needs to be zero since we can not rcv anyway. ACKHeader->tcp_window = 0; ACKHeader->tcp_urgent = 0; Size = sizeof(TCPHeader); phxsum = PHXSUM(ACKTcb->twtcb_saddr, ACKTcb->twtcb_daddr, PROTOCOL_TCP, 0); ACKHeader->tcp_xsum = 0; ACKHeader->tcp_xsum = ~XsumSendChain(phxsum + (uint)net_short(Size), HeaderBuffer); //ACKTcb->tcb_opt.ioi_TcpChksum=0; CTEFreeLockFromDPC(&pTWTCBTableLock[Partition]); TCPSIncrementOutSegCount(); (*LocalNetInfo.ipi_initopts) (&NewInfo); SendStatus = (*LocalNetInfo.ipi_xmit)(TCPProtInfo, NULL, HeaderBuffer, sizeof(TCPHeader), ACKTcb->twtcb_daddr, ACKTcb->twtcb_saddr, &NewInfo, NULL, PROTOCOL_TCP, NULL); if (SendStatus != IP_PENDING) FreeTCPHeader(HeaderBuffer); (*LocalNetInfo.ipi_freeopts) (&NewInfo); } else { CTEFreeLockFromDPC(&pTWTCBTableLock[Partition]); } } //* SendRSTFromTCB - Send a RST from a TCB. // // This is called during close when we need to send a RST. // // Input: RSTTcb - TCB from which RST is to be sent. // RCE - Optional RCE to be used in sending. // // Returns: Nothing. 
// void SendRSTFromTCB(TCB * RSTTcb, RouteCacheEntry* RCE) { PNDIS_BUFFER HeaderBuffer; TCPHeader *RSTHeader; IP_STATUS SendStatus; CTEStructAssert(RSTTcb, tcb); ASSERT(RSTTcb->tcb_state == TCB_CLOSED); HeaderBuffer = GetTCPHeader(&RSTHeader); if (HeaderBuffer != NULL) { SeqNum RSTSeq; RSTHeader = (TCPHeader *) ((PUCHAR)RSTHeader + LocalNetInfo.ipi_hsize); NDIS_BUFFER_LINKAGE(HeaderBuffer) = NULL; RSTHeader->tcp_src = RSTTcb->tcb_sport; RSTHeader->tcp_dest = RSTTcb->tcb_dport; // If the remote peer has a window of 0, send with a seq. # equal // to senduna so he'll accept it. Otherwise send with send max. if (RSTTcb->tcb_sendwin != 0) RSTSeq = RSTTcb->tcb_sendmax; else RSTSeq = RSTTcb->tcb_senduna; RSTHeader->tcp_seq = net_long(RSTSeq); RSTHeader->tcp_ack = net_long(RSTTcb->tcb_rcvnext); RSTHeader->tcp_flags = MAKE_TCP_FLAGS(sizeof(TCPHeader) / sizeof(ulong), TCP_FLAG_RST | TCP_FLAG_ACK); RSTHeader->tcp_window = 0; RSTHeader->tcp_urgent = 0; RSTHeader->tcp_xsum = 0; // Recompute pseudo checksum as this will // not be valid when connection is disconnected // in pre-accept case. RSTHeader->tcp_xsum = ~XsumSendChain(PHXSUM(RSTTcb->tcb_saddr, RSTTcb->tcb_daddr, PROTOCOL_TCP, sizeof(TCPHeader)), HeaderBuffer); RSTTcb->tcb_opt.ioi_TcpChksum = 0; TCPSIncrementOutSegCount(); TStats.ts_outrsts++; SendStatus = (*LocalNetInfo.ipi_xmit)(TCPProtInfo, NULL, HeaderBuffer, sizeof(TCPHeader), RSTTcb->tcb_daddr, RSTTcb->tcb_saddr, &RSTTcb->tcb_opt, RCE, PROTOCOL_TCP, NULL); if (SendStatus != IP_PENDING) FreeTCPHeader(HeaderBuffer); } return; } //* SendRSTFromHeader - Send a RST back, based on a header. // // Called when we need to send a RST, but don't necessarily have a TCB. // // Input: TCPH - TCP header to be RST. // Length - Length of the incoming segment. // Dest - Destination IP address for RST. // Src - Source IP address for RST. // OptInfo - IP Options to use on RST. // // Returns: Nothing. 
// void SendRSTFromHeader(TCPHeader UNALIGNED * TCPH, uint Length, IPAddr Dest, IPAddr Src, IPOptInfo * OptInfo) { PNDIS_BUFFER Buffer; TCPHeader *RSTHdr; IPOptInfo NewInfo; IP_STATUS SendStatus; if (TCPH->tcp_flags & TCP_FLAG_RST) return; Buffer = GetTCPHeader(&RSTHdr); if (Buffer != NULL) { // Got a buffer. Fill in the header so as to make it believable to // the remote guy, and send it. RSTHdr = (TCPHeader *) ((PUCHAR)RSTHdr + LocalNetInfo.ipi_hsize); NDIS_BUFFER_LINKAGE(Buffer) = NULL; if (TCPH->tcp_flags & TCP_FLAG_SYN) Length++; if (TCPH->tcp_flags & TCP_FLAG_FIN) Length++; if (TCPH->tcp_flags & TCP_FLAG_ACK) { RSTHdr->tcp_seq = TCPH->tcp_ack; RSTHdr->tcp_ack = TCPH->tcp_ack; RSTHdr->tcp_flags = MAKE_TCP_FLAGS(sizeof(TCPHeader) / sizeof(ulong), TCP_FLAG_RST); } else { SeqNum TempSeq; RSTHdr->tcp_seq = 0; TempSeq = net_long(TCPH->tcp_seq); TempSeq += Length; RSTHdr->tcp_ack = net_long(TempSeq); RSTHdr->tcp_flags = MAKE_TCP_FLAGS(sizeof(TCPHeader) / sizeof(ulong), TCP_FLAG_RST | TCP_FLAG_ACK); } RSTHdr->tcp_window = 0; RSTHdr->tcp_urgent = 0; RSTHdr->tcp_dest = TCPH->tcp_src; RSTHdr->tcp_src = TCPH->tcp_dest; RSTHdr->tcp_xsum = 0; RSTHdr->tcp_xsum = ~XsumSendChain(PHXSUM(Src, Dest, PROTOCOL_TCP, sizeof(TCPHeader)), Buffer); (*LocalNetInfo.ipi_initopts) (&NewInfo); if (OptInfo->ioi_options != NULL) (*LocalNetInfo.ipi_updateopts)(OptInfo, &NewInfo, Dest, NULL_IP_ADDR); TCPSIncrementOutSegCount(); TStats.ts_outrsts++; SendStatus = (*LocalNetInfo.ipi_xmit)(TCPProtInfo, NULL, Buffer, sizeof(TCPHeader), Dest, Src, &NewInfo, NULL, PROTOCOL_TCP, NULL); if (SendStatus != IP_PENDING) FreeTCPHeader(Buffer); (*LocalNetInfo.ipi_freeopts) (&NewInfo); } } //* GoToEstab - Transition to the established state. // // Called when we are going to the established state and need to finish up // initializing things that couldn't be done until now. We assume the TCB // lock is held by the caller on the TCB we're called with. // // Input: EstabTCB - TCB to transition. 
//
//  Returns: Nothing.
//
void
GoToEstab(TCB * EstabTCB)
{
    // DType and MSS only receive ipi_openrce's out-values below; neither
    // is read afterwards in this routine.
    uchar DType;
    ushort MSS;

    // Initialize our slow start and congestion control variables:
    // congestion window of two segments, slow start threshold at maximum.
    EstabTCB->tcb_cwin = 2 * EstabTCB->tcb_mss;
    EstabTCB->tcb_ssthresh = 0xffffffff;

    EstabTCB->tcb_state = TCB_ESTAB;

    // When SynAttackProtect is set and the TCB has no route cache entry
    // yet, open one now.
    if (SynAttackProtect && EstabTCB->tcb_rce == NULL) {
        (*LocalNetInfo.ipi_openrce)(EstabTCB->tcb_daddr,
                                    EstabTCB->tcb_saddr,
                                    &EstabTCB->tcb_rce,
                                    &DType,
                                    &MSS,
                                    &EstabTCB->tcb_opt);
    }

    // We're in established. We'll subtract one from slow count for this fact,
    // and if the slowcount goes to 0 we'll move onto the fast path.
    if (--(EstabTCB->tcb_slowcount) == 0)
        EstabTCB->tcb_fastchk &= ~TCP_FLAG_SLOW;

    InterlockedIncrement((PLONG)&TStats.ts_currestab);

    EstabTCB->tcb_flags &= ~ACTIVE_OPEN;    // Turn off the active opening flag.

    // Start the Keep-Alive timer if necessary. The interval comes from the
    // connection object, so this is skipped when no connection is attached.
    if ((EstabTCB->tcb_flags & KEEPALIVE) && EstabTCB->tcb_conn) {
        START_TCB_TIMER_R(EstabTCB, KA_TIMER, EstabTCB->tcb_conn->tc_tcbkatime);
        EstabTCB->tcb_kacount = 0;
    }
}

//* InitSendState - Initialize the send state of a connection.
//
//  Called during connection establishment to initialize our send state.
//  (In this case, this refers to all information we'll put on the wire as
//  well as pure send state). We set senduna and sendmax from the TCB's
//  current sendnext, clear the saved error, compute the pseudo-header
//  checksum, set up the retransmit timer values and stop the retransmit
//  and delayed-ack timers. We assume the tcb_lock is held on the TCB when
//  we are called.
//
//  Input:  NewTCB      - TCB to be set up.
//
//  Returns: Nothing.
void
InitSendState(TCB * NewTCB)
{
    CTEStructAssert(NewTCB, tcb);

    // Nothing is outstanding yet: senduna and sendmax both start at the
    // value already stored in tcb_sendnext.
    NewTCB->tcb_senduna = NewTCB->tcb_sendnext;
    NewTCB->tcb_sendmax = NewTCB->tcb_sendnext;
    NewTCB->tcb_error = IP_SUCCESS;

    // Initialize pseudo-header xsum (with a length of 0; senders add the
    // segment length when they use it).
    NewTCB->tcb_phxsum = PHXSUM(NewTCB->tcb_saddr, NewTCB->tcb_daddr,
                                PROTOCOL_TCP, 0);

    // Initialize retransmit and delayed ack stuff. Defaults are 6000 ms
    // for tcb_delta and 3000 ms for tcb_rexmit.
    NewTCB->tcb_rexmitcnt = 0;
    NewTCB->tcb_rtt = 0;
    NewTCB->tcb_smrtt = 0;
    NewTCB->tcb_delta = MS_TO_TICKS(6000);
    NewTCB->tcb_rexmit = MS_TO_TICKS(3000);

    if (NewTCB->tcb_rce) {
        //
        // InitialRtt can be as low as 300msec to enable
        // certain scenarios to work correctly.
        //
        // A per-route initial RTT overrides the defaults only when it is
        // greater than 3; smaller values (including 0) are ignored. The
        // first test below is implied by the second.
        //
        if (NewTCB->tcb_rce->rce_TcpInitialRTT &&
            NewTCB->tcb_rce->rce_TcpInitialRTT > 3) {
            NewTCB->tcb_delta =
                MS_TO_TICKS(NewTCB->tcb_rce->rce_TcpInitialRTT * 2);
            NewTCB->tcb_rexmit =
                MS_TO_TICKS(NewTCB->tcb_rce->rce_TcpInitialRTT);
        }
    }
    STOP_TCB_TIMER_R(NewTCB, RXMIT_TIMER);
    STOP_TCB_TIMER_R(NewTCB, DELACK_TIMER);
}

//* TCPStatus - Handle a status indication.
//
//  This is the TCP status handler, called by IP when a status event
//  occurs. NET status is matched to a TCB: a protocol-unreachable closes
//  the connection, other unreachable/TTL/parameter errors are saved in
//  tcb_error, and MTU changes shrink the MSS and may trigger a
//  retransmit. Reconfig status controls security filtering. Any other
//  status type is treated as hardware/global status: address add/delete
//  updates the security filters and an MTU change is applied to TCBs via
//  TCBWalk.
//
//  NOTE(review): earlier text said severe events "mark the local address
//  as invalid"; nothing in this routine does that.
//
//  Entry:  StatusType      - Type of status (NET or HW). NET status
//                              is usually caused by a received ICMP
//                              message. HW status indicate a HW
//                              problem.
//          StatusCode      - Code identifying IP_STATUS.
//          OrigDest        - If this is NET status, the original dest. of
//                              DG that triggered it.
//          OrigSrc         - "   "    "  "    "   , the original src.
//          Src             - IP address of status originator (could be local
//                              or remote). Not referenced by this routine.
//          Param           - Additional information for status - i.e. the
//                              param field of an ICMP message.
//          Data            - Data pertaining to status - for NET status, this
//                              is the first 8 bytes of the original DG. For
//                              IP_ADDR_ADDED it is passed on as an
//                              NDIS_HANDLE.
//
//  Returns: Nothing
//
void
TCPStatus(uchar StatusType, IP_STATUS StatusCode, IPAddr OrigDest,
          IPAddr OrigSrc, IPAddr Src, ulong Param, void *Data)
{
    CTELockHandle TCBHandle;
    TCB *StatusTCB;
    TCPHeader UNALIGNED *Header = (TCPHeader UNALIGNED *) Data;
    SeqNum DropSeq;
    uint index;

    // Handle NET status codes differently from HW status codes.
    if (StatusType == IP_NET_STATUS) {
        // It's a NET code. Find a matching TCB. Data is dereferenced as a
        // TCP header here, so it must be valid for NET status.
        StatusTCB = FindTCB(OrigSrc, OrigDest, Header->tcp_dest,
                            Header->tcp_src, &TCBHandle, FALSE, &index);
        if (StatusTCB != NULL) {
            // Found one. Every exit path below releases tcb_lock without
            // this routine having acquired it, so FindTCB evidently
            // returns with the TCB lock held.
            CTEStructAssert(StatusTCB, tcb);

            // Make sure the TCB is in a state that is interesting.
            if (StatusTCB->tcb_state == TCB_CLOSED ||
                StatusTCB->tcb_state == TCB_TIME_WAIT ||
                CLOSING(StatusTCB)) {
                CTEFreeLock(&StatusTCB->tcb_lock, TCBHandle);
                return;
            }
            switch (StatusCode) {
                // Hard errors - Destination protocol unreachable. We treat
                // these as fatal errors. Close the connection now.
            case IP_DEST_PROT_UNREACHABLE:
                StatusTCB->tcb_error = StatusCode;
                // Hold a reference across the close so the TCB survives
                // until the DerefTCB below.
                REFERENCE_TCB(StatusTCB);
                // Presumably TryToCloseTCB releases the TCB lock (it is
                // handed the lock handle and the lock is re-acquired
                // below) - confirm against its definition.
                TryToCloseTCB(StatusTCB, TCB_CLOSE_UNREACH, TCBHandle);

                RemoveTCBFromConn(StatusTCB);
                NotifyOfDisc(StatusTCB, NULL,
                             MapIPError(StatusCode, TDI_DEST_UNREACHABLE),
                             NULL);
                CTEGetLock(&StatusTCB->tcb_lock, &TCBHandle);
                DerefTCB(StatusTCB, TCBHandle);
                return;
                // NOTE(review): this break is unreachable after the return.
                break;

                // Soft errors. Save the error in case the connection
                // times out.
            case IP_DEST_NET_UNREACHABLE:
            case IP_DEST_HOST_UNREACHABLE:
            case IP_DEST_PORT_UNREACHABLE:
            case IP_BAD_ROUTE:
            case IP_TTL_EXPIRED_TRANSIT:
            case IP_TTL_EXPIRED_REASSEM:
            case IP_PARAM_PROBLEM:
                StatusTCB->tcb_error = StatusCode;
                break;

            case IP_PACKET_TOO_BIG:
                // The new MTU is carried in the upper 16 bits of the ICMP
                // param field, in network byte order.
                Param = net_short(Param >> 16);
                StatusTCB->tcb_error = StatusCode;
                // Fall through to the MTU change code.

            case IP_SPEC_MTU_CHANGE:
                // A TCP datagram has triggered an MTU change. Figure out
                // which connection it is, and update him to retransmit the
                // segment. The Param value is the new MTU. We'll need to
                // retransmit if the new MTU is less than our existing MTU
                // and the sequence of the dropped packet is less than our
                // current send next.

                // Convert the MTU into a TCP payload size.
                // NOTE(review): Param is unsigned, so an MTU smaller than
                // the combined header size wraps to a huge value here, and
                // an MTU exactly equal to it yields an MSS of 0 (caught
                // only by the ASSERT below). Presumably the IP layer
                // validates the MTU before indicating it - confirm.
                Param = Param - (sizeof(TCPHeader) +
                                 StatusTCB->tcb_opt.ioi_optlength +
                                 sizeof(IPHeader));
                DropSeq = net_long(Header->tcp_seq);

                // NOTE(review): the ushort pun reads only the first two
                // bytes of Param, i.e. its low 16 bits on a little-endian
                // machine.
                if (*(ushort *) & Param <= StatusTCB->tcb_mss &&
                    (SEQ_GTE(DropSeq, StatusTCB->tcb_senduna) &&
                     SEQ_LT(DropSeq, StatusTCB->tcb_sendnext))) {

                    // Need to initiate a retransmit.
                    ResetSendNext(StatusTCB, DropSeq);
                    // Set the congestion window to allow only one packet.
                    // This may prevent us from sending anything if we
                    // didn't just set sendnext to senduna. This is OK,
                    // we'll retransmit later, or send when we get an ack.
                    StatusTCB->tcb_cwin = Param;
                    DelayAction(StatusTCB, NEED_OUTPUT);
                    PartitionDelayQProcessing(FALSE);
                }
                // The new MSS never exceeds what the remote side offered.
                StatusTCB->tcb_mss =
                    (ushort) MIN(Param, (ulong) StatusTCB->tcb_remmss);

                ASSERT(StatusTCB->tcb_mss > 0);
                ValidateMSS(StatusTCB);

                //
                // Reset the Congestion Window if necessary
                //
                if (StatusTCB->tcb_cwin < StatusTCB->tcb_mss) {
                    StatusTCB->tcb_cwin = StatusTCB->tcb_mss;

                    //
                    // Make sure the slow start threshold is at least
                    // 2 segments
                    //
                    if (StatusTCB->tcb_ssthresh <
                        ((uint) StatusTCB->tcb_mss * 2)
                        ) {
                        StatusTCB->tcb_ssthresh = StatusTCB->tcb_mss * 2;
                    }
                }
                break;

                // Source quench. Historically this reinitiated slow start
                // by resetting the congestion window and adjusting the
                // slow start threshold.
            case IP_SOURCE_QUENCH:
                //
                // Code is removed, since source quench messages can be
                // misused to cause DoS attack.
                //
                break;

            default:
                ASSERT(0);
                break;
            }

            CTEFreeLock(&StatusTCB->tcb_lock, TCBHandle);
        } else {
            // Couldn't find a matching TCB. Nothing is released here;
            // presumably FindTCB holds no TCB lock when it returns NULL -
            // confirm against its definition.
        }
    } else if (StatusType == IP_RECONFIG_STATUS) {

        // Reconfiguration: only the security filter setting is handled.
        if (StatusCode == IP_RECONFIG_SECFLTR) {
            ControlSecurityFiltering(Param);
        }
    } else {
        uint NewMTU;

        // 'Hardware' or 'global' status. Figure out what to do.
        switch (StatusCode) {
        case IP_ADDR_DELETED:
            // Local address has gone away. OrigDest is the IPAddr which is
            // gone.

            //
            // Delete any security filters associated with this address
            //
            DeleteProtocolSecurityFilter(OrigDest, PROTOCOL_TCP);

            break;

        case IP_ADDR_ADDED:

            //
            // An address has materialized. OrigDest identifies the address.
            // Data is a handle to the IP configuration information for the
            // interface on which the address is instantiated.
            //
            AddProtocolSecurityFilter(OrigDest, PROTOCOL_TCP,
                                      (NDIS_HANDLE) Data);

            break;

        case IP_MTU_CHANGE:
            // Param is the new MTU; only the TCP header size is removed
            // here before SetTCBMTU is run over the TCBs via TCBWalk.
            NewMTU = Param - sizeof(TCPHeader);
            TCBWalk(SetTCBMTU, &OrigDest, &OrigSrc, &NewMTU);
            break;
        default:
            ASSERT(0);
            break;
        }
    }
}

//* FillTCPHeader - Fill the TCP header in.
//
//  A utility routine to fill in the TCP header: ports, sequence and ack
//  numbers, flags, a zeroed checksum and urgent pointer, the timestamp
//  option when the TCB has TCP_FLAG_TS set, and the advertised window.
//
//  Input:  SendTCB         - TCB to fill from.
//          Header          - Header to fill into. When the timestamp
//                              option is in use, the 12 bytes following
//                              the 20-byte header are written as well, so
//                              the caller must provide that space.
//
//  Returns: Nothing.
//
void
FillTCPHeader(TCB * SendTCB, TCPHeader * Header)
{
    ushort S;
    ulong L;

    Header->tcp_src = SendTCB->tcb_sport;
    Header->tcp_dest = SendTCB->tcb_dport;
    // The values are copied into a local before net_long is applied.
    L = SendTCB->tcb_sendnext;
    Header->tcp_seq = net_long(L);
    L = SendTCB->tcb_rcvnext;
    Header->tcp_ack = net_long(L);
    // Precomputed flags word; presumably the little-endian image of the
    // on-the-wire bytes 0x50 0x10 (data offset 5 words, ACK set) - confirm
    // against MAKE_TCP_FLAGS.
    Header->tcp_flags = 0x1050;
    Header->tcp_xsum = 0;
    Header->tcp_urgent = 0;

    if (SendTCB->tcb_tcpopts & TCP_FLAG_TS) {

        ulong *ts_opt;

        // The option area starts right after the 20-byte fixed header.
        ts_opt = (ulong *)((uchar *) Header + 20);
        // 0x0101080A: NOP, NOP, option kind 8 (timestamp), length 10.
        *ts_opt++ = net_long(0x0101080A);
        // Our timestamp value, then the echo of the peer's latest one.
        *ts_opt++ = net_long(TCPTime);
        *ts_opt = net_long(SendTCB->tcb_tsrecent);

        // Now the header is 32 bytes!! Same flags word as above, but with
        // a data offset of 8 words.
        Header->tcp_flags = 0x1080;
    }

    // Advertise the receive window scaled down by our window scale.
    S = (ushort) (RcvWin(SendTCB) >> SendTCB->tcb_rcvwinscale);
    Header->tcp_window = net_short(S);
}

//* ClassifyPacket - Classifies packets for GPC flow.
//
//
//  Input:  SendTCB         - TCB of data/control packet to classify.
//
//  Returns: Nothing.
// void ClassifyPacket( TCB *SendTCB ) { #if GPC // // clear the precedence bits and get ready to be set // according to the service type // if (DisableUserTOSSetting) SendTCB->tcb_opt.ioi_tos &= TOS_MASK; if (SendTCB->tcb_rce && GPCcfInfo) { struct QosCfTransportInfo TransportInfo = {0, 0}; GPC_STATUS status = STATUS_SUCCESS; GPC_IP_PATTERN Pattern; IF_TCPDBG(TCP_DEBUG_GPC) KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"TCPSend: Classifying packet TCP %x\n", SendTCB)); Pattern.SrcAddr = SendTCB->tcb_saddr; Pattern.DstAddr = SendTCB->tcb_daddr; Pattern.ProtocolId = PROTOCOL_TCP; Pattern.gpcSrcPort = SendTCB->tcb_sport; Pattern.gpcDstPort = SendTCB->tcb_dport; if (SendTCB->tcb_GPCCachedRTE != (void *)SendTCB->tcb_rce->rce_rte) { // // first time we use this RTE, or it has been changed // since the last send // if (GetIFAndLink(SendTCB->tcb_rce, &SendTCB->tcb_GPCCachedIF, (IPAddr *) & SendTCB->tcb_GPCCachedLink) == STATUS_SUCCESS) { SendTCB->tcb_GPCCachedRTE = (void *)SendTCB->tcb_rce->rce_rte; } // // invaludate the classification handle // SendTCB->tcb_opt.ioi_GPCHandle = 0; } Pattern.InterfaceId.InterfaceId = SendTCB->tcb_GPCCachedIF; Pattern.InterfaceId.LinkId = SendTCB->tcb_GPCCachedLink; IF_TCPDBG(TCP_DEBUG_GPC) KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"TCPSend: IF=%x Link=%x\n", Pattern.InterfaceId.InterfaceId, Pattern.InterfaceId.LinkId)); if (!SendTCB->tcb_opt.ioi_GPCHandle) { IF_TCPDBG(TCP_DEBUG_GPC) KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"TCPsend: Classification Handle is NULL, getting one now.\n")); status = GpcEntries.GpcClassifyPatternHandler( (GPC_HANDLE)hGpcClient[GPC_CF_QOS], GPC_PROTOCOL_TEMPLATE_IP, &Pattern, NULL, // context (PCLASSIFICATION_HANDLE)&SendTCB->tcb_opt.ioi_GPCHandle, 0, NULL, FALSE); } // Only if QOS patterns exist, we get the TOS bits out. 
if (NT_SUCCESS(status) && GpcCfCounts[GPC_CF_QOS]) { status = GpcEntries.GpcGetUlongFromCfInfoHandler( (GPC_HANDLE) hGpcClient[GPC_CF_QOS], SendTCB->tcb_opt.ioi_GPCHandle, FIELD_OFFSET(CF_INFO_QOS, TransportInformation), (PULONG)&TransportInfo); // It is likely that the pattern has gone by now // and the handle that we are caching is INVALID. // We need to pull up a new handle and get the // TOS bit again. if (STATUS_INVALID_HANDLE == status) { IF_TCPDBG(TCP_DEBUG_GPC) KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"TCPsend: Classification Handle is NULL, " "getting one now.\n")); SendTCB->tcb_opt.ioi_GPCHandle = 0; status = GpcEntries.GpcClassifyPatternHandler( (GPC_HANDLE) hGpcClient[GPC_CF_QOS], GPC_PROTOCOL_TEMPLATE_IP, &Pattern, NULL, // context (PCLASSIFICATION_HANDLE)&SendTCB->tcb_opt.ioi_GPCHandle, 0, NULL, FALSE); // // Only if QOS patterns exist, we get the TOS bits out. // if (NT_SUCCESS(status)) { status = GpcEntries.GpcGetUlongFromCfInfoHandler( (GPC_HANDLE) hGpcClient[GPC_CF_QOS], SendTCB->tcb_opt.ioi_GPCHandle, FIELD_OFFSET(CF_INFO_QOS, TransportInformation), (PULONG)&TransportInfo); } } // // Perhaps something needs to be done if GPC_CF_IPSEC has non-zero patterns. // // // Set the TOS bit now. // IF_TCPDBG(TCP_DEBUG_GPC) KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"TCPsend: ServiceType(%d)=%d\n", FIELD_OFFSET(CF_INFO_QOS, TransportInformation))); if (status == STATUS_SUCCESS) { // // Get the TOS value and the types of allowed offloads. // SendTCB->tcb_opt.ioi_tos |= TransportInfo.ToSValue; SendTCB->tcb_allowedoffloads = (USHORT)TransportInfo.AllowedOffloads; // // We are guaranteed for now that the other kind of offloads are // never disabled, and hence, we won't check them on a per // connection basis. 
// ASSERT((TransportInfo.AllowedOffloads | TCP_LARGE_SEND_OFFLOAD | TCP_LARGE_SEND_TCPOPT_OFFLOAD | TCP_LARGE_SEND_IPOPT_OFFLOAD) == TCP_IP_OFFLOAD_TYPES); } IF_TCPDBG(TCP_DEBUG_GPC) KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"TCPsend: TOS set to 0x%x\n", SendTCB->tcb_opt.ioi_tos)); } } #endif } BOOLEAN ProcessSend(TCB *SendTCB, SendCmpltContext *SCC, uint *pSendLength, uint AmtUnsent, TCPHeader *Header, int SendWin, PNDIS_BUFFER CurrentBuffer) { TCPSendReq *CurSend = SCC->scc_firstsend; long Result; uint AmountLeft = *pSendLength; ulong PrevFlags; Queue *Next; SeqNum OldSeq; if (*pSendLength != 0) { do { BOOLEAN DirectSend = FALSE; ASSERT(CurSend->tsr_refcnt > 0); Result = CTEInterlockedIncrementLong(&(CurSend->tsr_refcnt)); ASSERT(Result > 0); SCC->scc_count++; if (SendTCB->tcb_sendofs == 0 && (SendTCB->tcb_sendsize <= AmountLeft) && (SCC->scc_tbufcount == 0) && (CurSend->tsr_lastbuf == NULL)) { ulong length = 0; PNDIS_BUFFER tmp = SendTCB->tcb_sendbuf; while (tmp) { length += NdisBufferLength(tmp); tmp = NDIS_BUFFER_LINKAGE(tmp); } // If the requested length is // more than in this mdl chain // we can use fast path if (AmountLeft >= length) { DirectSend = TRUE; } } if (DirectSend) { NDIS_BUFFER_LINKAGE(CurrentBuffer) = SendTCB->tcb_sendbuf; do { SCC->scc_ubufcount++; CurrentBuffer = NDIS_BUFFER_LINKAGE(CurrentBuffer); } while (NDIS_BUFFER_LINKAGE(CurrentBuffer) != NULL); CurSend->tsr_lastbuf = CurrentBuffer; AmountLeft -= SendTCB->tcb_sendsize; SendTCB->tcb_sendsize = 0; } else { uint AmountToDup; PNDIS_BUFFER NewBuf, Buf; uint Offset; NDIS_STATUS NStatus; uint Length; // Either the current send has more data than // or the offset is not zero. // In either case we'll need to loop // through the current send, allocating buffers. Buf = SendTCB->tcb_sendbuf; Offset = SendTCB->tcb_sendofs; do { ASSERT(Buf != NULL); Length = NdisBufferLength(Buf); ASSERT((Offset < Length) || (Offset == 0 && Length == 0)); // Adjust the length for the offset into // this buffer. 
Length -= Offset; AmountToDup = MIN(AmountLeft, Length); NdisCopyBuffer(&NStatus, &NewBuf, TCPSendBufferPool, Buf, Offset, AmountToDup); if (NStatus == NDIS_STATUS_SUCCESS) { SCC->scc_tbufcount++; NDIS_BUFFER_LINKAGE(CurrentBuffer) = NewBuf; CurrentBuffer = NewBuf; if (AmountToDup >= Length) { // Exhausted this buffer. Buf = NDIS_BUFFER_LINKAGE(Buf); Offset = 0; } else { Offset += AmountToDup; ASSERT(Offset < NdisBufferLength(Buf)); } SendTCB->tcb_sendsize -= AmountToDup; AmountLeft -= AmountToDup; } else { // Couldn't allocate a buffer. If // the packet is already partly built, // send what we've got, otherwise // bail out. if (SCC->scc_tbufcount == 0 && SCC->scc_ubufcount == 0) { return FALSE; } *pSendLength -= AmountLeft; AmountLeft = 0; } } while (AmountLeft && SendTCB->tcb_sendsize); SendTCB->tcb_sendbuf = Buf; SendTCB->tcb_sendofs = Offset; } if (CurSend->tsr_flags & TSR_FLAG_URG) { ushort UP; // This send is urgent data. We need to figure // out what the urgent data pointer should be. // We know sendnext is the starting sequence // number of the frame, and that at the top of // this do loop sendnext identified a byte in // the CurSend at that time. We advanced CurSend // at the same rate we've decremented // AmountLeft (AmountToSend - AmountLeft == // AmountBuilt), so sendnext + // (AmountToSend - AmountLeft) identifies a byte // in the current value of CurSend, and that // quantity plus tcb_sendsize is the sequence // number one beyond the current send. UP = (ushort) (*pSendLength - AmountLeft) + (ushort) SendTCB->tcb_sendsize - ((SendTCB->tcb_flags & BSD_URGENT) ? 0 : 1); Header->tcp_urgent = net_short(UP); Header->tcp_flags |= TCP_FLAG_URG; } if (SendTCB->tcb_sendsize == 0) { // We've exhausted this send. Set the PUSH bit. 
Header->tcp_flags |= TCP_FLAG_PUSH; PrevFlags = CurSend->tsr_flags; Next = QNEXT(&CurSend->tsr_req.tr_q); if (Next != QEND(&SendTCB->tcb_sendq)) { CurSend = STRUCT_OF(TCPSendReq, QSTRUCT(TCPReq, Next, tr_q), tsr_req); CTEStructAssert(CurSend, tsr); SendTCB->tcb_sendsize = CurSend->tsr_unasize; SendTCB->tcb_sendofs = CurSend->tsr_offset; SendTCB->tcb_sendbuf = CurSend->tsr_buffer; SendTCB->tcb_cursend = CurSend; // Check the urgent flags. We can't combine // new urgent data on to the end of old // non-urgent data. if ((PrevFlags & TSR_FLAG_URG) && ! (CurSend->tsr_flags & TSR_FLAG_URG)) break; } else { ASSERT(AmountLeft == 0); SendTCB->tcb_cursend = NULL; SendTCB->tcb_sendbuf = NULL; } } } while (AmountLeft != 0); } // Update the sequence numbers, and start a RTT // measurement if needed. // Adjust for what we're really going to send. *pSendLength -= AmountLeft; OldSeq = SendTCB->tcb_sendnext; SendTCB->tcb_sendnext += *pSendLength; if (SEQ_EQ(OldSeq, SendTCB->tcb_sendmax)) { // We're sending entirely new data. // We can't advance sendmax once FIN_SENT is set. ASSERT(!(SendTCB->tcb_flags & FIN_SENT)); SendTCB->tcb_sendmax = SendTCB->tcb_sendnext; // We've advanced sendmax, so we must be sending // some new data, so bump the outsegs counter. TCPSIncrementOutSegCount(); if (SendTCB->tcb_rtt == 0) { // No RTT running, so start one. SendTCB->tcb_rtt = TCPTime; SendTCB->tcb_rttseq = OldSeq; } } else { // We have at least some retransmission. if ((SendTCB->tcb_sendmax - OldSeq) > 1) { TStats.ts_retranssegs++; } if (SEQ_GT(SendTCB->tcb_sendnext, SendTCB->tcb_sendmax)) { // But we also have some new data, so check the rtt stuff. TCPSIncrementOutSegCount(); ASSERT(!(SendTCB->tcb_flags & FIN_SENT)); SendTCB->tcb_sendmax = SendTCB->tcb_sendnext; if (SendTCB->tcb_rtt == 0) { // No RTT running, so start one. SendTCB->tcb_rtt = TCPTime; SendTCB->tcb_rttseq = OldSeq; } } } // We've built the frame entirely. If we've send // everything we have and there is a FIN pending, // OR it in. 
if (AmtUnsent == *pSendLength) { if (SendTCB->tcb_flags & FIN_NEEDED) { ASSERT(!(SendTCB->tcb_flags & FIN_SENT) || (SendTCB->tcb_sendnext == (SendTCB->tcb_sendmax - 1))); // See if we still have room in the window for a FIN. if (SendWin > (int)*pSendLength) { Header->tcp_flags |= TCP_FLAG_FIN; SendTCB->tcb_sendnext++; SendTCB->tcb_sendmax = SendTCB->tcb_sendnext; SendTCB->tcb_flags |= (FIN_SENT | FIN_OUTSTANDING); SendTCB->tcb_flags &= ~FIN_NEEDED; } } } return TRUE; } //* TCPSend - Send data from a TCP connection. // // This is the main 'send data' routine. We go into a loop, trying // to send data until we can't for some reason. First we compute // the useable window, use it to figure the amount we could send. If // the amount we could send meets certain criteria we'll build a frame // and send it, after setting any appropriate control bits. We assume // the caller has put a reference on the TCB. // // Input: SendTCB - TCB to be sent from. // TCBHandle - Lock handle for TCB. // // Returns: Nothing. // void TCPSend(TCB * SendTCB, CTELockHandle TCBHandle) { int SendWin; // Useable send window. uint AmountToSend; // Amount to send this time. uint AmountLeft; TCPHeader *Header; // TCP header for a send. PNDIS_BUFFER FirstBuffer, CurrentBuffer; TCPSendReq *CurSend; SendCmpltContext *SCC; SeqNum OldSeq; IP_STATUS SendStatus; uint AmtOutstanding, AmtUnsent; int ForceWin; // Window we're force to use. 
BOOLEAN FullSegment; BOOLEAN MoreToSend = FALSE; uint SegmentsSent = 0; BOOLEAN LargeSendOffload = FALSE; BOOLEAN LargeSendFailed = FALSE; uint MSS; uint LargeSend, SentBytes; void *Irp; CTEStructAssert(SendTCB, tcb); ASSERT(SendTCB->tcb_refcnt != 0); ASSERT(*(int *)&SendTCB->tcb_sendwin >= 0); ASSERT(*(int *)&SendTCB->tcb_cwin >= SendTCB->tcb_mss); ASSERT(!(SendTCB->tcb_flags & FIN_OUTSTANDING) || (SendTCB->tcb_sendnext == SendTCB->tcb_sendmax)); if (!(SendTCB->tcb_flags & IN_TCP_SEND) && !(SendTCB->tcb_fastchk & TCP_FLAG_IN_RCV)) { SendTCB->tcb_flags |= IN_TCP_SEND; // We'll continue this loop until we send a FIN, or we break out // internally for some other reason. while (!(SendTCB->tcb_flags & FIN_OUTSTANDING)) { CheckTCBSends(SendTCB); SegmentsSent++; if (SegmentsSent > MaxSendSegments) { // We are throttled by max segments that can be sent in // this loop. Comeback later MoreToSend = TRUE; break; } AmtOutstanding = (uint) (SendTCB->tcb_sendnext - SendTCB->tcb_senduna); AmtUnsent = SendTCB->tcb_unacked - AmtOutstanding; ASSERT(*(int *)&AmtUnsent >= 0); SendWin = (int)(MIN(SendTCB->tcb_sendwin, SendTCB->tcb_cwin) - AmtOutstanding); // if this send is after the fast recovery // and sendwin is zero because of amt outstanding // then, at least force 1 segment to prevent delayed // ack timeouts from the remote if (SendTCB->tcb_force) { SendTCB->tcb_force = 0; if (SendWin < SendTCB->tcb_mss) { SendWin = SendTCB->tcb_mss; } } // Since the window could have shrank, need to get it to zero at // least. ForceWin = (int)((SendTCB->tcb_flags & FORCE_OUTPUT) >> FORCE_OUT_SHIFT); SendWin = MAX(SendWin, ForceWin); LargeSend = MIN((uint) SendWin, AmtUnsent); LargeSend = MIN(LargeSend, SendTCB->tcb_mss * MaxSendSegments); AmountToSend = MIN(MIN((uint) SendWin, AmtUnsent), SendTCB->tcb_mss); ASSERT(SendTCB->tcb_mss > 0); // Time stamp option addition might force us to cut the data // to be sent by 12 bytes. 
FullSegment = FALSE; if ((SendTCB->tcb_tcpopts & TCP_FLAG_TS) && (AmountToSend + ALIGNED_TS_OPT_SIZE >= SendTCB->tcb_mss)) { AmountToSend = SendTCB->tcb_mss - ALIGNED_TS_OPT_SIZE; FullSegment = TRUE; } else { if (AmountToSend == SendTCB->tcb_mss) FullSegment = TRUE; } // We will send a segment if // // 1. The segment size == mss // 2. This is the only segment to be sent // 3. FIN is set and this is the last segment // 4. FORCE_OUTPUT is set // 5. Amount to be sent is >= MSS/2 if (FullSegment || (AmountToSend != 0 && AmountToSend == AmtUnsent) || (SendWin != 0 && (((SendTCB->tcb_flags & FIN_NEEDED) && (AmtUnsent <= SendTCB->tcb_mss)) || (SendTCB->tcb_flags & FORCE_OUTPUT) || AmountToSend >= (SendTCB->tcb_maxwin / 2)))) { // // Set MSS first. // if (SendTCB->tcb_tcpopts & TCP_FLAG_TS) { MSS = SendTCB->tcb_mss - ALIGNED_TS_OPT_SIZE; } else { MSS = SendTCB->tcb_mss; } // It's OK to send something. Try to get a header buffer now. FirstBuffer = GetTCPHeaderAtDpcLevel(&Header); if (FirstBuffer != NULL) { // Got a header buffer. Loop through the sends on the TCB, // building a frame. CurrentBuffer = FirstBuffer; CurSend = SendTCB->tcb_cursend; Header = (TCPHeader *)((PUCHAR)Header + LocalNetInfo.ipi_hsize); // allow room for filling time stamp options (12 bytes) if (SendTCB->tcb_tcpopts & TCP_FLAG_TS) { NdisAdjustBufferLength(FirstBuffer, sizeof(TCPHeader) + ALIGNED_TS_OPT_SIZE); SCC = (SendCmpltContext *) (Header + 1); SCC = (SendCmpltContext *) ((uchar *) SCC + ALIGNED_TS_OPT_SIZE); } else { SCC = (SendCmpltContext *) (Header + 1); } SCC = ALIGN_UP_POINTER(SCC, PVOID); #if DBG SCC->scc_sig = scc_signature; #endif FillTCPHeader(SendTCB, Header); SCC->scc_ubufcount = 0; SCC->scc_tbufcount = 0; SCC->scc_count = 0; SCC->scc_LargeSend = 0; // Check if RCE has large send capability and, if so, // attempt to offload segmentation to the hardware. // * only offload if there is more than 1 segment's worth // of data. 
// * only offload if the number of segments is greater than // the minimum number of segments the adapter is willing // to offload. // * only offload if it is allowed by all the entities of // known classification families. // * ( i.e. if TCP or IP options need to be // offloaded, we only offload if the adapter supports it) // if (!DisableLargeSendOffload && SendTCB->tcb_rce && (SendTCB->tcb_rce->rce_OffloadFlags & TCP_LARGE_SEND_OFFLOAD) && (SendTCB->tcb_allowedoffloads & TCP_LARGE_SEND_OFFLOAD) && (!(SendTCB->tcb_tcpopts & TCP_FLAG_TS) || (SendTCB->tcb_rce->rce_OffloadFlags & TCP_LARGE_SEND_TCPOPT_OFFLOAD)) && (!SendTCB->tcb_opt.ioi_options || (SendTCB->tcb_rce->rce_OffloadFlags & TCP_LARGE_SEND_IPOPT_OFFLOAD)) && !LargeSendFailed && (MSS < LargeSend) && (CurSend && (CurSend->tsr_lastbuf == NULL)) && !(CurSend->tsr_flags & TSR_FLAG_URG)) { uint PartialSegment; LargeSendOffload = TRUE; LargeSend = MIN(SendTCB->tcb_rce->rce_TcpLargeSend.MaxOffLoadSize, LargeSend); // // Adjust LargeSend to make LSO path // conform sender side silly window avoidance: // 1) it is multiple of MSS // 2) We are sending out everything we have // 3) FORCE_OUTPUT is set // 4) Amount to be sent is >= maximum window size /2 // PartialSegment = LargeSend % MSS; if ((PartialSegment != 0) && (LargeSend != AmtUnsent) && (!(SendTCB->tcb_flags & FORCE_OUTPUT)) && (PartialSegment < (SendTCB->tcb_maxwin / 2))) { LargeSend -= PartialSegment; } // // Offload only if the segments we have is greater than // the minimum segment requirement of the NIC. // if (SendTCB->tcb_rce->rce_TcpLargeSend.MinSegmentCount > (LargeSend + MSS - 1) / MSS ) { LargeSendOffload = FALSE; } // // LargeSend can not be zero. 
// if (LargeSend == 0) { LargeSendOffload = FALSE; } } else { LargeSendOffload = FALSE; } if (LargeSendOffload) { IF_TCPDBG(TCP_DEBUG_OFFLOAD) { KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"TCPSend: tcb %x offload %d bytes at " "seq %u ack %u win %u\n", SendTCB, LargeSend, SendTCB->tcb_sendnext, SendTCB->tcb_rcvnext, SendWin)); } OldSeq = SendTCB->tcb_sendnext; CTEStructAssert(CurSend, tsr); SCC->scc_firstsend = CurSend; if (!ProcessSend(SendTCB, SCC, &LargeSend, AmtUnsent, Header, SendWin, CurrentBuffer)) { goto error_oor1; } { uint PHXsum = SendTCB->tcb_phxsum; PHXsum = (((PHXsum << 16) | (PHXsum >> 16)) + PHXsum) >> 16; Header->tcp_xsum = (ushort) PHXsum; } SCC->scc_SendSize = LargeSend; SCC->scc_ByteSent = 0; SCC->scc_LargeSend = SendTCB; REFERENCE_TCB(SendTCB); #if DBG SendTCB->tcb_LargeSend++; #endif SendTCB->tcb_rcvdsegs = 0; if (SendTCB->tcb_tcpopts & TCP_FLAG_TS) { LargeSend += sizeof(TCPHeader) + ALIGNED_TS_OPT_SIZE; } else { LargeSend += sizeof(TCPHeader); } IF_TCPDBG(TCP_DEBUG_OFFLOAD) { KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,"TCPSend: tcb %x large-send %d seq %u\n", SendTCB, LargeSend, OldSeq)); } ClassifyPacket(SendTCB); CTEFreeLock(&SendTCB->tcb_lock, TCBHandle); SendStatus = (*LocalNetInfo.ipi_largexmit)(TCPProtInfo, SCC, FirstBuffer, LargeSend, SendTCB->tcb_daddr, SendTCB->tcb_saddr, &SendTCB->tcb_opt, SendTCB->tcb_rce, PROTOCOL_TCP, &SentBytes, MSS); SendTCB->tcb_error = SendStatus; if (SendStatus != IP_PENDING) { // Let TCPSendComplete hanlde partial sends SCC->scc_ByteSent = SentBytes; TCPSendComplete(SCC, FirstBuffer, IP_SUCCESS); } CTEGetLock(&SendTCB->tcb_lock, &TCBHandle); if (SendStatus == IP_GENERAL_FAILURE) { if (SEQ_GTE(OldSeq, SendTCB->tcb_senduna) && SEQ_LT(OldSeq, SendTCB->tcb_sendnext)) { ResetSendNext(SendTCB, OldSeq); } LargeSendFailed = TRUE; continue; } if (SendStatus == IP_PACKET_TOO_BIG) { SeqNum NewSeq = OldSeq + SentBytes; //Not everything got sent. 
//Adjust for what is sent if (SEQ_GTE(NewSeq, SendTCB->tcb_senduna) && SEQ_LT(NewSeq, SendTCB->tcb_sendnext)) { ResetSendNext(SendTCB, NewSeq); } } if (!TCB_TIMER_RUNNING_R(SendTCB, RXMIT_TIMER)) { START_TCB_TIMER_R(SendTCB, RXMIT_TIMER, SendTCB->tcb_rexmit); } SendTCB->tcb_flags &= ~(IN_TCP_SEND | NEED_OUTPUT | FORCE_OUTPUT | SEND_AFTER_RCV); DerefTCB(SendTCB, TCBHandle); return; } // Normal path AmountLeft = AmountToSend; if (AmountToSend != 0) { CTEStructAssert(CurSend, tsr); SCC->scc_firstsend = CurSend; } else { // We're in the loop, but AmountToSend is 0. This // should happen only when we're sending a FIN. Check // this, and return if it's not true. ASSERT(AmtUnsent == 0); if (!(SendTCB->tcb_flags & FIN_NEEDED)) { FreeTCPHeader(FirstBuffer); break; } SCC->scc_firstsend = NULL; NDIS_BUFFER_LINKAGE(FirstBuffer) = NULL; } OldSeq = SendTCB->tcb_sendnext; if (!ProcessSend(SendTCB, SCC, &AmountToSend, AmtUnsent, Header, SendWin, CurrentBuffer)) { goto error_oor1; } AmountToSend += sizeof(TCPHeader); SendTCB->tcb_flags &= ~(NEED_ACK | ACK_DELAYED | FORCE_OUTPUT); STOP_TCB_TIMER_R(SendTCB, DELACK_TIMER); STOP_TCB_TIMER_R(SendTCB, SWS_TIMER); SendTCB->tcb_rcvdsegs = 0; if ( (SendTCB->tcb_flags & KEEPALIVE) && ( SendTCB->tcb_conn != NULL) ) START_TCB_TIMER_R(SendTCB, KA_TIMER, SendTCB->tcb_conn->tc_tcbkatime); SendTCB->tcb_kacount = 0; // We're all set. Xsum it and send it. 
ClassifyPacket(SendTCB); // Account for time stamp options if (SendTCB->tcb_tcpopts & TCP_FLAG_TS) { if (SendTCB->tcb_rce && (SendTCB->tcb_rce->rce_OffloadFlags & TCP_XMT_CHECKSUM_OFFLOAD) && (SendTCB->tcb_rce->rce_OffloadFlags & TCP_CHECKSUM_OPT_OFFLOAD)) { uint PHXsum = SendTCB->tcb_phxsum + (uint)net_short(AmountToSend + ALIGNED_TS_OPT_SIZE); PHXsum = (((PHXsum << 16) | (PHXsum >> 16)) + PHXsum) >> 16; Header->tcp_xsum = (ushort) PHXsum; SendTCB->tcb_opt.ioi_TcpChksum = 1; #if DBG DbgTcpSendHwChksumCount++; #endif } else { Header->tcp_xsum = ~XsumSendChain( SendTCB->tcb_phxsum + (uint)net_short(AmountToSend + ALIGNED_TS_OPT_SIZE), FirstBuffer); SendTCB->tcb_opt.ioi_TcpChksum = 0; } CTEFreeLock(&SendTCB->tcb_lock, TCBHandle); Irp = NULL; if (SCC->scc_firstsend) { Irp = SCC->scc_firstsend->tsr_req.tr_context; } SendStatus = (*LocalNetInfo.ipi_xmit)(TCPProtInfo, SCC, FirstBuffer, AmountToSend + ALIGNED_TS_OPT_SIZE, SendTCB->tcb_daddr, SendTCB->tcb_saddr, &SendTCB->tcb_opt, SendTCB->tcb_rce, PROTOCOL_TCP, Irp ); } else { if (SendTCB->tcb_rce && (SendTCB->tcb_rce->rce_OffloadFlags & TCP_XMT_CHECKSUM_OFFLOAD)) { uint PHXsum = SendTCB->tcb_phxsum + (uint)net_short(AmountToSend); PHXsum = (((PHXsum << 16) | (PHXsum >> 16)) + PHXsum) >> 16; Header->tcp_xsum = (ushort) PHXsum; SendTCB->tcb_opt.ioi_TcpChksum = 1; #if DBG DbgTcpSendHwChksumCount++; #endif } else { Header->tcp_xsum = ~XsumSendChain(SendTCB->tcb_phxsum + (uint)net_short(AmountToSend), FirstBuffer); SendTCB->tcb_opt.ioi_TcpChksum = 0; } CTEFreeLock(&SendTCB->tcb_lock, TCBHandle); Irp = NULL; if(SCC->scc_firstsend) { Irp = SCC->scc_firstsend->tsr_req.tr_context; } SendStatus = (*LocalNetInfo.ipi_xmit)(TCPProtInfo, SCC, FirstBuffer, AmountToSend, SendTCB->tcb_daddr, SendTCB->tcb_saddr, &SendTCB->tcb_opt, SendTCB->tcb_rce, PROTOCOL_TCP, Irp ); } SendTCB->tcb_error = SendStatus; if (SendStatus != IP_PENDING) { TCPSendComplete(SCC, FirstBuffer, IP_SUCCESS); if (SendStatus != IP_SUCCESS) { 
CTEGetLock(&SendTCB->tcb_lock, &TCBHandle); // This packet didn't get sent. If nothing's // changed in the TCB, put sendnext back to // what we just tried to send. Depending on // the error, we may try again. if (SEQ_GTE(OldSeq, SendTCB->tcb_senduna) && SEQ_LT(OldSeq, SendTCB->tcb_sendnext)) ResetSendNext(SendTCB, OldSeq); // We know this packet didn't get sent. Start // the retransmit timer now, if it's not already // runnimg, in case someone came in while we // were in IP and stopped it. if (!TCB_TIMER_RUNNING_R(SendTCB, RXMIT_TIMER)) { START_TCB_TIMER_R(SendTCB, RXMIT_TIMER, SendTCB->tcb_rexmit); } // If it failed because of an MTU problem, get // the new MTU and try again. if (SendStatus == IP_PACKET_TOO_BIG) { uint NewMTU; // The MTU has changed. Update it, and try // again. // if ipsec is adjusting the mtu, rce_newmtu // will contain the newmtu. if (SendTCB->tcb_rce) { if (!SendTCB->tcb_rce->rce_newmtu) { SendStatus = (*LocalNetInfo.ipi_getpinfo)( SendTCB->tcb_daddr, SendTCB->tcb_saddr, &NewMTU, NULL, SendTCB->tcb_rce); } else { NewMTU = SendTCB->tcb_rce->rce_newmtu; SendStatus = IP_SUCCESS; } } else { SendStatus = (*LocalNetInfo.ipi_getpinfo)( SendTCB->tcb_daddr, SendTCB->tcb_saddr, &NewMTU, NULL, SendTCB->tcb_rce); } if (SendStatus != IP_SUCCESS) break; // We have a new MTU. Make sure it's big enough // to use. If not, correct this and turn off // MTU discovery on this TCB. Otherwise use the // new MTU. if (NewMTU <= (sizeof(TCPHeader) + SendTCB->tcb_opt.ioi_optlength)) { // The new MTU is too small to use. Turn off // PMTU discovery on this TCB, and drop to // our off net MTU size. SendTCB->tcb_opt.ioi_flags &= ~IP_FLAG_DF; SendTCB->tcb_mss = MIN((ushort)MAX_REMOTE_MSS, SendTCB->tcb_remmss); } else { // The new MTU is adequate. Adjust it for // the header size and options length, and // use it. 
NewMTU -= sizeof(TCPHeader) - SendTCB->tcb_opt.ioi_optlength; SendTCB->tcb_mss = MIN((ushort) NewMTU, SendTCB->tcb_remmss); } ASSERT(SendTCB->tcb_mss > 0); ValidateMSS(SendTCB); continue; } break; } } //Start it now, since we know that mac driver accepted it. CTEGetLock(&SendTCB->tcb_lock, &TCBHandle); if (!TCB_TIMER_RUNNING_R(SendTCB, RXMIT_TIMER)) { START_TCB_TIMER_R(SendTCB, RXMIT_TIMER, SendTCB->tcb_rexmit); } continue; } else // FirstBuffer != NULL. goto error_oor; } else { // We've decided we can't send anything now. Figure out why, and // see if we need to set a timer. if (SendTCB->tcb_sendwin == 0) { if (!(SendTCB->tcb_flags & FLOW_CNTLD)) { ushort tmp; SendTCB->tcb_flags |= FLOW_CNTLD; SendTCB->tcb_rexmitcnt = 0; tmp = MIN(MAX(REXMIT_TO(SendTCB), MIN_RETRAN_TICKS), MAX_REXMIT_TO); START_TCB_TIMER_R(SendTCB, RXMIT_TIMER, tmp); SendTCB->tcb_slowcount++; SendTCB->tcb_fastchk |= TCP_FLAG_SLOW; } else if (!TCB_TIMER_RUNNING_R(SendTCB, RXMIT_TIMER)) START_TCB_TIMER_R(SendTCB, RXMIT_TIMER, SendTCB->tcb_rexmit); } else if (AmountToSend != 0) // We have something to send, but we're not sending // it, presumably due to SWS avoidance. if (!TCB_TIMER_RUNNING_R(SendTCB, SWS_TIMER)) START_TCB_TIMER_R(SendTCB, SWS_TIMER, SWS_TO); break; } } // while (!FIN_OUTSTANDING) // We're done sending, so we don't need the output flags set. SendTCB->tcb_flags &= ~(IN_TCP_SEND | NEED_OUTPUT | FORCE_OUTPUT | SEND_AFTER_RCV); if (MoreToSend) { //just indicate that we need to send more DelayAction(SendTCB, NEED_OUTPUT); PartitionDelayQProcessing(FALSE); } // This is for TS algo SendTCB->tcb_lastack = SendTCB->tcb_rcvnext; } else SendTCB->tcb_flags |= SEND_AFTER_RCV; DerefTCB(SendTCB, TCBHandle); return; // Common case error handling code for out of resource conditions. Start the // retransmit timer if it's not already running (so that we try this again // later), clean up and return. 
error_oor:
    if (!TCB_TIMER_RUNNING_R(SendTCB, RXMIT_TIMER)) {
        ushort tmp;

        tmp = MIN(MAX(REXMIT_TO(SendTCB), MIN_RETRAN_TICKS), MAX_REXMIT_TO);
        START_TCB_TIMER_R(SendTCB, RXMIT_TIMER, tmp);
    }
    // We had an out of resource problem, so clear the OUTPUT flags.
    SendTCB->tcb_flags &= ~(IN_TCP_SEND | NEED_OUTPUT | FORCE_OUTPUT);
    DerefTCB(SendTCB, TCBHandle);
    return;

error_oor1:
    if (!TCB_TIMER_RUNNING_R(SendTCB, RXMIT_TIMER)) {
        ushort tmp;

        tmp = MIN(MAX(REXMIT_TO(SendTCB), MIN_RETRAN_TICKS), MAX_REXMIT_TO);
        START_TCB_TIMER_R(SendTCB, RXMIT_TIMER, tmp);
    }
    // We had an out of resource problem, so clear the OUTPUT flags.
    SendTCB->tcb_flags &= ~(IN_TCP_SEND | NEED_OUTPUT | FORCE_OUTPUT);
    DerefTCB(SendTCB, TCBHandle);
    // Unlike error_oor, a frame was (partly) built here, so hand it to the
    // send-complete routine.
    TCPSendComplete(SCC, FirstBuffer, IP_SUCCESS);
    return;
}

//* ResetAndFastSend - Fast retransmit from a sequence number and set cwin.
//
//  Called to handle fast retransmit of the segment which the receiver
//  is asking for. Data is resent through TCPFastSend; this routine does not
//  itself assign tcb_sendnext.
//
//  When the connection is in a synchronized state other than TIME_WAIT and
//  the send queue is not empty, the retransmit timer is stopped, tcb_rtt is
//  zeroed, and retransmission starts from the request at the head of the
//  send queue:
//    - If TCP_FLAG_SACK is set, the first transmission covers the hole
//      between NewSeq and the first entry on tcb_SackRcvd (or one MSS when
//      there is no such hole); after that the hole following each SACK entry
//      is retransmitted while it lies between senduna and sendnext.
//    - Otherwise one MSS is retransmitted at NewSeq.
//  Finally tcb_cwin is set to NewCWin and DerefTCB is called.
//
//  We assume the caller has put a reference on the TCB, and the TCB is locked
//  on entry. The reference is dropped and the lock released before returning.
//
//  Input:  SeqTCB      - Pointer to TCB to be updated.
//          NewSeq      - Sequence number to start retransmitting from.
//          NewCWin     - new value for congestion window.
//
//  Returns: Nothing.
//
void
ResetAndFastSend(TCB * SeqTCB, SeqNum NewSeq, uint NewCWin)
{
    TCPSendReq *SendReq;
    uint AmtForward;
    Queue *CurQ;
    PNDIS_BUFFER Buffer;
    uint Offset;
    uint SendSize;
    CTELockHandle TCBHandle;
    int ToBeSent;

    CTEStructAssert(SeqTCB, tcb);
    ASSERT(SEQ_GTE(NewSeq, SeqTCB->tcb_senduna));

    // The new seq must be less than send max, or NewSeq, senduna, sendnext,
    // and sendmax must all be equal. (The latter case happens when we're
    // called exiting TIME_WAIT, or possibly when we're retransmitting
    // during a flow controlled situation).
    ASSERT(SEQ_LT(NewSeq, SeqTCB->tcb_sendmax) ||
           (SEQ_EQ(SeqTCB->tcb_senduna, SeqTCB->tcb_sendnext) &&
            SEQ_EQ(SeqTCB->tcb_senduna, SeqTCB->tcb_sendmax) &&
            SEQ_EQ(SeqTCB->tcb_senduna, NewSeq)));

    if (SYNC_STATE(SeqTCB->tcb_state) &&
        SeqTCB->tcb_state != TCB_TIME_WAIT) {
        // In these states we need to update the send queue.

        if (!EMPTYQ(&SeqTCB->tcb_sendq)) {

            // Stop the retransmit timer only if we are sure there are going
            // to be retransmissions.
            STOP_TCB_TIMER_R(SeqTCB, RXMIT_TIMER);
            SeqTCB->tcb_rtt = 0;

            CurQ = QHEAD(&SeqTCB->tcb_sendq);
            SendReq = (TCPSendReq *) STRUCT_OF(TCPReq, CurQ, tr_q);

            // SendReq points to the first send request on the send queue.
            // SendSize, Buffer and Offset are taken from that request's
            // unacknowledged size, buffer chain and offset.
            // NOTE(review): these describe the head of the queue, and
            // AmtForward below is also measured from NewSeq starting at the
            // queue head; this presumably relies on NewSeq == tcb_senduna
            // (only NewSeq >= senduna is asserted) - confirm.
            SendSize = SendReq->tsr_unasize;
            Buffer = SendReq->tsr_buffer;
            Offset = SendReq->tsr_offset;

            // Call the fast retransmit send now

            if ((SeqTCB->tcb_tcpopts & TCP_FLAG_SACK)) {
                SackListEntry *Prev, *Current;
                // CurBegin remembers the 'begin' of the SACK entry we are
                // working from so it can be found again after TCPFastSend.
                // CurEnd is assigned below but not read in this function.
                SeqNum CurBegin = 0, CurEnd;
                BOOLEAN UseSackList = TRUE;

                Prev = STRUCT_OF(SackListEntry, &SeqTCB->tcb_SackRcvd, next);
                Current = Prev->next;

                // There is a hole from NewSeq to Current->begin;
                // try to retransmit the whole hole size.
                if (Current && SEQ_LT(NewSeq, Current->begin)) {
                    ToBeSent = Current->begin - NewSeq;
                    CurBegin = Current->begin;
                    CurEnd = Current->end;
                } else {
                    // No usable first SACK entry: send one MSS and do not
                    // walk the SACK list afterwards.
                    UseSackList = FALSE;
                    ToBeSent = SeqTCB->tcb_mss;
                }

                IF_TCPDBG(TCP_DEBUG_SACK) {
                    KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,
                               "In Sack Reset and send rexmiting %d %d\n",
                               NewSeq, SendSize));
                }

                TCPFastSend(SeqTCB, Buffer, Offset, SendReq, SendSize, NewSeq,
                            ToBeSent);

                // If we have not been already acked for the missing segments
                // and if we know where to start retransmitting do so now.
                // Also, re-validate the SACK list entry.

                Prev = STRUCT_OF(SackListEntry, &SeqTCB->tcb_SackRcvd, next);
                Current = Prev->next;
                if (!UseSackList ||
                    (Current && Current->begin != CurBegin)) {
                    // The SACK list changed while we were in a transmission.
                    // Just bail out, and wait for the next ACK to continue
                    // if necessary.
                    Current = NULL;
                }

                while (Current && Current->next &&
                       (SEQ_GTE(NewSeq, SeqTCB->tcb_senduna)) &&
                       (SEQ_LT(SeqTCB->tcb_senduna, Current->next->end))) {
                    SeqNum NextSeq;

                    ASSERT(SEQ_LTE(Current->begin, Current->end));

                    // There can be multiple dropped packets till
                    // Current->begin.

                    IF_TCPDBG(TCP_DEBUG_SACK) {
                        KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,
                                   "Scanning after Current %d %d\n",
                                   Current->begin, Current->end));
                    }

                    // The next hole starts right after this SACKed block.
                    NextSeq = Current->end;
                    CurBegin = Current->begin;

                    ASSERT(SEQ_LT(NextSeq, SeqTCB->tcb_sendmax));

                    // If we have not yet sent the segment keep quiet now.
                    if (SEQ_GTE(NextSeq, SeqTCB->tcb_sendnext) ||
                        (SEQ_LTE(NextSeq, SeqTCB->tcb_senduna))) {
                        break;
                    }

                    // Position the send request/buffer/offset by skipping
                    // the following number of bytes from the queue head.
                    AmtForward = NextSeq - NewSeq;

                    if (!EMPTYQ(&SeqTCB->tcb_sendq)) {
                        CurQ = QHEAD(&SeqTCB->tcb_sendq);
                        SendReq = (TCPSendReq *) STRUCT_OF(TCPReq, CurQ, tr_q);

                        // First skip whole send requests.
                        while (AmtForward) {
                            if (AmtForward >= SendReq->tsr_unasize) {
                                AmtForward -= SendReq->tsr_unasize;
                                CurQ = QNEXT(CurQ);
                                SendReq = (TCPSendReq *)STRUCT_OF(TCPReq, CurQ, tr_q);
                                ASSERT(CurQ != QEND(&SeqTCB->tcb_sendq));
                            } else {
                                break;
                            }
                        }

                        SendSize = SendReq->tsr_unasize - AmtForward;
                        Buffer = SendReq->tsr_buffer;
                        Offset = SendReq->tsr_offset;

                        // Then skip whole buffers within that request.
                        while (AmtForward) {
                            uint Length;

                            ASSERT((Offset < NdisBufferLength(Buffer)) ||
                                   ((Offset == 0) &&
                                    (NdisBufferLength(Buffer) == 0)));

                            Length = NdisBufferLength(Buffer) - Offset;

                            if (AmtForward >= Length) {
                                // We're moving past this one. Skip over him,
                                // and 0 the Offset we're keeping.
                                AmtForward -= Length;
                                Offset = 0;
                                Buffer = NDIS_BUFFER_LINKAGE(Buffer);
                                ASSERT(Buffer != NULL);
                            } else {
                                break;
                            }
                        }

                        // Whatever is left lands inside the current buffer.
                        Offset = Offset + AmtForward;

                        // Okay. Now retransmit this seq too.
                        // (Current->next is non-NULL here by the loop
                        // condition, so the hole size is the gap up to the
                        // next SACKed block.)
                        if (Current->next) {
                            ToBeSent = Current->next->begin - Current->end;
                        } else {
                            ToBeSent = SeqTCB->tcb_mss;
                        }

                        IF_TCPDBG(TCP_DEBUG_SACK) {
                            KdPrintEx((DPFLTR_TCPIP_ID, DPFLTR_INFO_LEVEL,
                                       "SACK inner loop rexmiting %d %d %d\n",
                                       Current->end, SendSize, ToBeSent));
                        }

                        TCPFastSend(SeqTCB, Buffer, Offset, SendReq, SendSize,
                                    NextSeq, ToBeSent);
                    } else {
                        break;
                    }

                    // Also, re-validate the current SACK list entry: the
                    // list may have changed while in TCPFastSend, so search
                    // from the head for the entry whose 'begin' is CurBegin.
                    Prev = STRUCT_OF(SackListEntry, &SeqTCB->tcb_SackRcvd, next);
                    Current = Prev->next;

                    while (Current && Current->begin != CurBegin) {
                        // Not the entry we were working from; keep looking.
                        Current = Current->next;
                    }

                    if (Current) {
                        // Found it: advance to the following SACK entry.
                        Current = Current->next;
                    } else {
                        // The entry is gone. Just bail out.
                        break;
                    }
                }
            } else {
                // No SACK on this connection: retransmit a single MSS.
                ToBeSent = SeqTCB->tcb_mss;

                TCPFastSend(SeqTCB, Buffer, Offset, SendReq, SendSize, NewSeq,
                            ToBeSent);
            }
        } else {
            ASSERT(SeqTCB->tcb_cursend == NULL);
        }
    }

    SeqTCB->tcb_cwin = NewCWin;

    // Make sure there is nothing outstanding or the retransmit timer is
    // running or we are in the process of sending a segment (and yet to
    // start the timer).
    ASSERT((SeqTCB->tcb_sendnext == SeqTCB->tcb_senduna) ||
           TCB_TIMER_RUNNING_R(SeqTCB, RXMIT_TIMER) ||
           (SeqTCB->tcb_flags & IN_TCP_SEND));

    TCBHandle = DISPATCH_LEVEL;
    DerefTCB(SeqTCB, TCBHandle);
    return;
}

//* TCPFastSend - To send a segment without changing TCB state
//
//  Called to handle fast retransmit of the segment.
//  tcb_lock will be held while entering (called by TCPRcv). The lock is
//  dropped around each call into IP and reacquired afterwards, and it is
//  held again on return.
//
//  Input:  SendTCB     - Pointer to TCB
//          in_SendBuf  - Pointer to ndis_buffer
//          in_SendOfs  - Send Offset
//          in_SendReq  - current send request
//          in_SendSize - size of this send
//          NextSeq     - Sequence number placed in the first segment sent.
//          in_ToBeSent - Upper bound on the number of bytes to send; the
//                        amount is further limited by in_SendSize and
//                        tcb_sendwin.
//
//  Returns: Nothing.
//
void
TCPFastSend(TCB * SendTCB, PNDIS_BUFFER in_SendBuf, uint in_SendOfs,
            TCPSendReq * in_SendReq, uint in_SendSize, SeqNum NextSeq,
            int in_ToBeSent)
{
    uint AmountToSend;          // Amount to send this time.
    uint AmountLeft;
    TCPHeader *Header;          // TCP header for a send.
PNDIS_BUFFER FirstBuffer, CurrentBuffer;
    TCPSendReq *CurSend;
    SendCmpltContext *SCC;
    SeqNum OldSeq;              // assigned below but not read afterwards
    SeqNum SendNext;
    IP_STATUS SendStatus;
    uint AmtOutstanding, AmtUnsent; // AmtOutstanding is computed but not read
    CTELockHandle TCBHandle;
    void *Irp;
    uint TSLen=0;               // timestamp option bytes added to the header
    // Working copies of the caller's position; they advance as frames are
    // built.
    uint SendOfs = in_SendOfs;
    uint SendSize = in_SendSize;
    PNDIS_BUFFER SendBuf = in_SendBuf;

    SendNext = NextSeq;
    CurSend = in_SendReq;

    TCBHandle = DISPATCH_LEVEL;

    CTEStructAssert(SendTCB, tcb);
    ASSERT(SendTCB->tcb_refcnt != 0);

    ASSERT(*(int *)&SendTCB->tcb_sendwin >= 0);
    ASSERT(*(int *)&SendTCB->tcb_cwin >= SendTCB->tcb_mss);

    ASSERT(!(SendTCB->tcb_flags & FIN_OUTSTANDING) ||
           (SendTCB->tcb_sendnext == SendTCB->tcb_sendmax));

    AmtOutstanding = (uint) (SendTCB->tcb_sendnext - SendTCB->tcb_senduna);

    // Send no more than the caller asked for, than this request holds, or
    // than the send window allows.
    // NOTE(review): the MIN is evaluated on int casts and stored in a uint;
    // presumably in_ToBeSent is never negative - confirm against callers.
    AmtUnsent = MIN(MIN(in_ToBeSent, (int)SendSize), (int)SendTCB->tcb_sendwin);

    while (AmtUnsent > 0) {

        if (SEQ_GT(SendTCB->tcb_senduna, SendNext)) {

            // Since tcb_lock is released in this loop
            // it is possible that delayed ack acked
            // what we are trying to retransmit.

            goto error_oor;
        }

        // This was minimum of sendwin and amtunsent
        AmountToSend = MIN(AmtUnsent, SendTCB->tcb_mss);

        // Time stamp option addition might force us to cut the data
        // to be sent by 12 bytes.
        // NOTE(review): if AmountToSend is smaller than ALIGNED_TS_OPT_SIZE
        // here the unsigned subtraction would wrap; presumably tcb_mss is
        // always large enough to prevent that - confirm.
        if ((SendTCB->tcb_tcpopts & TCP_FLAG_TS) &&
            (AmountToSend + ALIGNED_TS_OPT_SIZE >= SendTCB->tcb_mss)) {
            AmountToSend -= ALIGNED_TS_OPT_SIZE;
        }

        // See if we have enough to send. We'll send if we have at least a
        // segment, or if we really have some data to send and we can send
        // all that we have, or the send window is > 0 and we need to force
        // output or send a FIN (note that if we need to force output
        // SendWin will be at least 1 from the check above), or if we can
        // send an amount == to at least half the maximum send window
        // we've seen.
        // (The paragraph above describes a test that is not performed in
        // this routine; only the assertion below remains.)

        ASSERT((int)AmtUnsent >= 0);

        // It's OK to send something. Try to get a header buffer now.
        // Mark the TCB for debugging.
        // This should be removed for shipping version.

        FirstBuffer = GetTCPHeaderAtDpcLevel(&Header);

        if (FirstBuffer != NULL) {

            // Got a header buffer. Loop through the sends on the TCB,
            // building a frame.

            CurrentBuffer = FirstBuffer;
            // Skip the space reserved at the front of the buffer
            // (LocalNetInfo.ipi_hsize bytes).
            Header = (TCPHeader *) ((PUCHAR)Header + LocalNetInfo.ipi_hsize);

            // allow room for filling time stamp options.
            // The send-complete context (SCC) is placed after the TCP header
            // (and after the timestamp option when present), pointer-aligned.

            if (SendTCB->tcb_tcpopts & TCP_FLAG_TS) {

                // Account for time stamp options

                TSLen = ALIGNED_TS_OPT_SIZE;

                NdisAdjustBufferLength(FirstBuffer,
                                       sizeof(TCPHeader) + ALIGNED_TS_OPT_SIZE);

                SCC = ALIGN_UP_POINTER((SendCmpltContext *) (Header + 1),PVOID);
                SCC = (SendCmpltContext *)((uchar *) SCC + ALIGNED_TS_OPT_SIZE);

            } else {
                SCC = (SendCmpltContext *) (Header + 1);
            }

            SCC = ALIGN_UP_POINTER(SCC, PVOID);
#if DBG
            SCC->scc_sig = scc_signature;
#endif
            FillTCPHeader(SendTCB, Header);
            {
                // Overwrite the sequence number with the one being
                // retransmitted, in network byte order.
                ulong L = SendNext;
                Header->tcp_seq = net_long(L);
            }

            SCC->scc_ubufcount = 0;
            SCC->scc_tbufcount = 0;
            SCC->scc_count = 0;
            SCC->scc_LargeSend = 0;

            AmountLeft = AmountToSend;

            if (AmountToSend != 0) {
                long Result;

                CTEStructAssert(CurSend, tsr);
                SCC->scc_firstsend = CurSend;

                do {

                    BOOLEAN DirectSend = FALSE;

                    // Each send request that contributes to this frame gets
                    // an extra reference, counted in scc_count.
                    ASSERT(CurSend->tsr_refcnt > 0);
                    Result = CTEInterlockedIncrementLong(&(CurSend->tsr_refcnt));

                    ASSERT(Result > 0);
                    SCC->scc_count++;

                    // If the current send offset is 0 and the current
                    // send is less than or equal to what we have left
                    // to send, we haven't already put a transport
                    // buffer on this send, and nobody else is using
                    // the buffer chain directly, just use the input
                    // buffers. We check for other people using them
                    // by looking at tsr_lastbuf. If it's NULL,
                    // nobody else is using the buffers. If it's not
                    // NULL, somebody is.

                    if (SendOfs == 0 &&
                        (SendSize <= AmountLeft) &&
                        (SCC->scc_tbufcount == 0) &&
                        CurSend->tsr_lastbuf == NULL) {

                        ulong length = 0;
                        PNDIS_BUFFER tmp = SendBuf;

                        while (tmp) {
                            length += NdisBufferLength(tmp);
                            tmp = NDIS_BUFFER_LINKAGE(tmp);
                        }

                        // If sum of mdl lengths is > request length
                        // use slow path.

                        if (AmountLeft >= length) {
                            DirectSend = TRUE;
                        }
                    }

                    if (DirectSend) {

                        // Chain the caller's buffers directly behind the
                        // header and count them as user buffers.
                        NDIS_BUFFER_LINKAGE(CurrentBuffer) = SendBuf;

                        do {
                            SCC->scc_ubufcount++;
                            CurrentBuffer = NDIS_BUFFER_LINKAGE(CurrentBuffer);
                        } while (NDIS_BUFFER_LINKAGE(CurrentBuffer) != NULL);

                        // Mark the chain as in use (see the tsr_lastbuf
                        // check above).
                        CurSend->tsr_lastbuf = CurrentBuffer;
                        AmountLeft -= SendSize;
                        SendSize = 0;
                    } else {
                        uint AmountToDup;
                        PNDIS_BUFFER NewBuf, Buf;
                        uint Offset;
                        NDIS_STATUS NStatus;
                        uchar *VirtualAddress;
                        uint Length;

                        // Either the current send has more data than
                        // we want to send, or the starting offset is
                        // not 0. In either case we'll need to loop
                        // through the current send, allocating buffers.

                        Buf = SendBuf;
                        Offset = SendOfs;

                        do {
                            ASSERT(Buf != NULL);

                            TcpipQueryBuffer(Buf, &VirtualAddress, &Length,
                                             NormalPagePriority);

                            if (VirtualAddress == NULL) {
                                // Could not get an address for this buffer.
                                // If nothing has been chained yet, give up;
                                // otherwise send what has been built so far.

                                if (SCC->scc_tbufcount == 0 &&
                                    SCC->scc_ubufcount == 0) {
                                    //TCPSendComplete(SCC, FirstBuffer,IP_SUCCESS);
                                    goto error_oor1;
                                }
                                AmountToSend -= AmountLeft;
                                AmountLeft = 0;
                                break;

                            }

                            ASSERT((Offset < Length) ||
                                   (Offset == 0 && Length == 0));

                            // Adjust the length for the offset into
                            // this buffer.

                            Length -= Offset;

                            AmountToDup = MIN(AmountLeft, Length);

                            NdisAllocateBuffer(&NStatus, &NewBuf,
                                               TCPSendBufferPool,
                                               VirtualAddress + Offset,
                                               AmountToDup);

                            if (NStatus == NDIS_STATUS_SUCCESS) {

                                // Chain the new transport buffer.
                                SCC->scc_tbufcount++;

                                NDIS_BUFFER_LINKAGE(CurrentBuffer) = NewBuf;

                                CurrentBuffer = NewBuf;

                                if (AmountToDup >= Length) {

                                    // Exhausted this buffer.

                                    Buf = NDIS_BUFFER_LINKAGE(Buf);
                                    Offset = 0;

                                } else {

                                    Offset += AmountToDup;
                                    ASSERT(Offset < NdisBufferLength(Buf));
                                }

                                SendSize -= AmountToDup;
                                AmountLeft -= AmountToDup;

                            } else {

                                // Couldn't allocate a buffer. If
                                // the packet is already partly built,
                                // send what we've got, otherwise
                                // bail out.

                                if (SCC->scc_tbufcount == 0 &&
                                    SCC->scc_ubufcount == 0) {
                                    goto error_oor1;
                                }
                                // Zeroing AmountLeft also ends this loop.
                                AmountToSend -= AmountLeft;
                                AmountLeft = 0;
                            }

                        } while (AmountLeft && SendSize);

                        // Remember where we stopped for the next frame.
                        SendBuf = Buf;
                        SendOfs = Offset;
                    }

                    if (CurSend->tsr_flags & TSR_FLAG_URG) {
                        ushort UP;

                        // This send is urgent data. We need to figure
                        // out what the urgent data pointer should be.
                        // We know sendnext is the starting sequence
                        // number of the frame, and that at the top of
                        // this do loop sendnext identified a byte in
                        // the CurSend at that time. We advanced CurSend
                        // at the same rate we've decremented
                        // AmountLeft (AmountToSend - AmountLeft ==
                        // AmountBuilt), so sendnext +
                        // (AmountToSend - AmountLeft) identifies a byte
                        // in the current value of CurSend, and that
                        // quantity plus tcb_sendsize is the sequence
                        // number one beyond the current send.
                        // (In this routine the local SendSize plays the
                        // role of tcb_sendsize.)

                        UP = (ushort) (AmountToSend - AmountLeft) +
                            (ushort)SendSize -
                            ((SendTCB->tcb_flags & BSD_URGENT) ? 0 : 1);

                        Header->tcp_urgent = net_short(UP);

                        Header->tcp_flags |= TCP_FLAG_URG;
                    }

                    // See if we've exhausted this send. If we have,
                    // set the PUSH bit in this frame and move on to
                    // the next send. We also need to check the
                    // urgent data bit.

                    if (SendSize == 0) {
                        Queue *Next;
                        ulong PrevFlags;

                        // We've exhausted this send. Set the PUSH bit.
                        Header->tcp_flags |= TCP_FLAG_PUSH;
                        PrevFlags = CurSend->tsr_flags;
                        Next = QNEXT(&CurSend->tsr_req.tr_q);
                        if (Next != QEND(&SendTCB->tcb_sendq)) {
                            CurSend = STRUCT_OF(TCPSendReq,
                                                QSTRUCT(TCPReq, Next, tr_q),
                                                tsr_req);
                            CTEStructAssert(CurSend, tsr);
                            SendSize = CurSend->tsr_unasize;
                            SendOfs = CurSend->tsr_offset;
                            SendBuf = CurSend->tsr_buffer;

                            // Check the urgent flags. We can't combine
                            // new urgent data on to the end of old
                            // non-urgent data.
                            // (As written, the test stops the frame when
                            // the previous send was urgent and the next one
                            // is not.)
                            if ((PrevFlags & TSR_FLAG_URG) && !
                                (CurSend->tsr_flags & TSR_FLAG_URG))
                                break;
                        } else {
                            // Ran off the end of the send queue.
                            ASSERT(AmountLeft == 0);
                            CurSend = NULL;
                            SendBuf = NULL;
                        }
                    }
                } while (AmountLeft != 0);

            } else {

                // Amt to send is 0.
                // Just bail out and start timer.

                if (!TCB_TIMER_RUNNING_R(SendTCB, RXMIT_TIMER)) {
                    START_TCB_TIMER_R(SendTCB, RXMIT_TIMER, SendTCB->tcb_rexmit);
                }

                FreeTCPHeader(FirstBuffer);
                return;

            }

            // Adjust for what we're really going to send.

            AmountToSend -= AmountLeft;

            OldSeq = SendNext;
            SendNext += AmountToSend;
            AmtUnsent -= AmountToSend;

            TStats.ts_retranssegs++;

            // We've built the frame entirely. If we've sent everything
            // we have and there's a FIN pending, OR it in.
            // (No FIN handling follows in this routine.)

            AmountToSend += sizeof(TCPHeader);

            // A data segment carries the ACK, so any pending/delayed ACK
            // state is cleared along with the SWS timer.
            SendTCB->tcb_flags &= ~(NEED_ACK | ACK_DELAYED | FORCE_OUTPUT);

            STOP_TCB_TIMER_R(SendTCB, DELACK_TIMER);
            STOP_TCB_TIMER_R(SendTCB, SWS_TIMER);
            SendTCB->tcb_rcvdsegs = 0;

            if ( (SendTCB->tcb_flags & KEEPALIVE) && (SendTCB->tcb_conn != NULL) )
                START_TCB_TIMER_R(SendTCB, KA_TIMER, SendTCB->tcb_conn->tc_tcbkatime);
            SendTCB->tcb_kacount = 0;

            // Drop the TCB lock for the call into IP.
            // NOTE(review): Header->tcp_xsum and tcb_opt.ioi_TcpChksum are
            // written below after the lock has been released - confirm this
            // is intended.
            CTEFreeLock(&SendTCB->tcb_lock, TCBHandle);

            Irp = NULL;
            if (SCC->scc_firstsend) {
                Irp = SCC->scc_firstsend->tsr_req.tr_context;
            }

            // We're all set. Xsum it and send it.
            // When the route's offload flags include both transmit checksum
            // offload and checksum-with-options offload, only the folded
            // pseudo-header sum is stored and ioi_TcpChksum is set to 1;
            // otherwise the full checksum is computed here over the chain.

            if (SendTCB->tcb_rce &&
                (SendTCB->tcb_rce->rce_OffloadFlags & TCP_XMT_CHECKSUM_OFFLOAD) &&
                (SendTCB->tcb_rce->rce_OffloadFlags & TCP_CHECKSUM_OPT_OFFLOAD) ){
                uint PHXsum = SendTCB->tcb_phxsum + (uint)net_short(AmountToSend + TSLen);

                // Fold the 32-bit sum into 16 bits.
                PHXsum = (((PHXsum << 16) | (PHXsum >> 16)) + PHXsum) >> 16;
                Header->tcp_xsum = (ushort) PHXsum;
                SendTCB->tcb_opt.ioi_TcpChksum = 1;
            } else {

                Header->tcp_xsum =
                    ~XsumSendChain(
                        SendTCB->tcb_phxsum + (uint)net_short(AmountToSend + TSLen),
                        FirstBuffer);
                SendTCB->tcb_opt.ioi_TcpChksum = 0;
            }

            SendStatus = (*LocalNetInfo.ipi_xmit)(TCPProtInfo, SCC,
                                                  FirstBuffer,
                                                  AmountToSend + TSLen,
                                                  SendTCB->tcb_daddr,
                                                  SendTCB->tcb_saddr,
                                                  &SendTCB->tcb_opt,
                                                  SendTCB->tcb_rce,
                                                  PROTOCOL_TCP,
                                                  Irp);

            // Reacquire Lock to keep DerefTCB happy
            // Bug #63904

            // If IP did not pend the send, complete it here ourselves.
            if (SendStatus != IP_PENDING) {
                TCPSendComplete(SCC, FirstBuffer, IP_SUCCESS);
            }

            CTEGetLock(&SendTCB->tcb_lock, &TCBHandle);
            SendTCB->tcb_error = SendStatus;

            if (!TCB_TIMER_RUNNING_R(SendTCB, RXMIT_TIMER)) {
                START_TCB_TIMER_R(SendTCB, RXMIT_TIMER, SendTCB->tcb_rexmit);
            }

        } else {                // FirstBuffer == NULL: no header buffer.
            goto error_oor;
        }
    }                           // while AmtUnsent > 0

    return;

    // Common case error handling code for out of resource conditions. Start the
    // retransmit timer if it's not already running (so that we try this again
    // later), clean up and return.

error_oor:
    if (!TCB_TIMER_RUNNING_R(SendTCB, RXMIT_TIMER)) {
        ushort tmp;

        tmp = MIN(MAX(REXMIT_TO(SendTCB), MIN_RETRAN_TICKS), MAX_REXMIT_TO);
        START_TCB_TIMER_R(SendTCB, RXMIT_TIMER, tmp);
    }
    return;

    // Same as error_oor, but a header buffer/SCC already exists and is
    // handed to TCPSendComplete.
error_oor1:
    if (!TCB_TIMER_RUNNING_R(SendTCB, RXMIT_TIMER)) {
        ushort tmp;

        tmp = MIN(MAX(REXMIT_TO(SendTCB), MIN_RETRAN_TICKS), MAX_REXMIT_TO);
        START_TCB_TIMER_R(SendTCB, RXMIT_TIMER, tmp);
    }

    TCPSendComplete(SCC, FirstBuffer, IP_SUCCESS);

    return;

}

//* TdiSend - Send data on a connection.
//
//  The main TDI send entry point. We take the input parameters, validate them,
//  allocate a send request, etc. We then put the send request on the queue.
//  If we have no other sends on the queue or Nagling is disabled we'll
//  call TCPSend to send the data.
//
//  Input:  Request     - The TDI request for the call.
//          Flags       - Flags for this send (TDI_SEND_EXPEDITED and
//                        TDI_SEND_AND_DISCONNECT are examined).
//          SendLength  - Length in bytes of send.
//          SendBuffer  - Pointer to buffer chain to be sent.
//
//  Returns: Status of attempt to send:
//          TDI_PENDING            - the request was queued.
//          TDI_SUCCESS            - SendLength was 0; nothing was queued.
//          TDI_INVALID_PARAMETER  - tcb_unacked + SendLength would reach
//                                   MAXULONG.
//          TDI_NO_RESOURCES       - no send request could be allocated.
//          TDI_INVALID_STATE      - no TCB, or the TCB cannot send data.
//          TDI_INVALID_CONNECTION - the connection ID lookup failed.
//
TDI_STATUS
TdiSend(PTDI_REQUEST Request, ushort Flags, uint SendLength,
        PNDIS_BUFFER SendBuffer)
{
    TCPConn *Conn;
    TCB *SendTCB;
    TCPSendReq *SendReq;
    CTELockHandle ConnTableHandle;
    TDI_STATUS Error;
    uint EmptyQ;

#if DBG_VALIDITY_CHECK

    // Check for Mdl sanity in send requests
    // Should be removed for RTM

    uint RealSendSize;
    PNDIS_BUFFER Temp;

    // Loop through the buffer chain, and make sure that the length matches
    // up with SendLength.
Temp = SendBuffer;
    RealSendSize = 0;
    if (Temp != NULL) {
        do {
            RealSendSize += NdisBufferLength(Temp);
            Temp = NDIS_BUFFER_LINKAGE(Temp);
        } while (Temp != NULL);

        // The chain holds fewer bytes than the caller claims: report the
        // offending IRP and break into the debugger.
        if (RealSendSize < SendLength) {
            PIRP Irp = (PIRP)Request->RequestContext;
            PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation(Irp);
            DbgPrint("Invalid TDI_SEND request issued to \\\\Device\\\\Tcp.\n");
            DbgPrint("Irp: %p Mdl: %p CompletionRoutine: %p\n",
                     Irp, Irp->MdlAddress, IrpSp->CompletionRoutine);
            DbgPrint("This is not a bug in tcpip.sys.\n");
            DbgPrint("Please notify the originator of this IRP.\n");
            DbgBreakPoint();
        }
    }
#endif

    //CTEGetLock(&ConnTableLock, &ConnTableHandle);

    // Look up the connection. On success the connection block lock
    // (tc_ConnBlock->cb_lock) is released by this routine below, so the
    // lookup presumably returns with it held and the previous IRQL stored
    // in ConnTableHandle - confirm against GetConnFromConnID.
    Conn = GetConnFromConnID(PtrToUlong(Request->Handle.ConnectionContext), &ConnTableHandle);

    if (Conn != NULL) {
        CTEStructAssert(Conn, tc);

        SendTCB = Conn->tc_tcb;
        if (SendTCB != NULL) {
            CTEStructAssert(SendTCB, tcb);
            // Hand over from the connection block lock to the TCB lock.
            // The TCB lock is later released with ConnTableHandle.
            CTEGetLockAtDPC(&SendTCB->tcb_lock);
            CTEFreeLock(&(Conn->tc_ConnBlock->cb_lock), DISPATCH_LEVEL);
            if (DATA_SEND_STATE(SendTCB->tcb_state) && !CLOSING(SendTCB)) {
                // We have a TCB, and it's valid. Get a send request now.

                CheckTCBSends(SendTCB);

                if (SendLength == 0) {
                    // Nothing to queue for a zero-length send.
                    Error = TDI_SUCCESS;
                } else if (((ULONG64)SendTCB->tcb_unacked + SendLength) >= MAXULONG) {
                    // Queuing this would overflow the unacked byte count.
                    Error = TDI_INVALID_PARAMETER;
                } else {

                    SendReq = GetSendReq();
                    if (SendReq != NULL) {
                        // Fill in the send request from the TDI request.
                        SendReq->tsr_req.tr_rtn = Request->RequestNotifyObject;
                        SendReq->tsr_req.tr_context = Request->RequestContext;
                        SendReq->tsr_buffer = SendBuffer;
                        SendReq->tsr_size = SendLength;
                        SendReq->tsr_unasize = SendLength;
                        SendReq->tsr_refcnt = 1;    // ACK will decrement this ref
                        SendReq->tsr_offset = 0;
                        SendReq->tsr_lastbuf = NULL;
                        SendReq->tsr_time = TCPTime;
                        SendReq->tsr_flags = (Flags & TDI_SEND_EXPEDITED) ?
                            TSR_FLAG_URG : 0;
                        SendTCB->tcb_unacked += SendLength;

                        if (Flags & TDI_SEND_AND_DISCONNECT) {

                            // Move the state to fin_wait (or last_ack) and
                            // mark the tcb for send and disconnect.
                            if (SendTCB->tcb_state == TCB_ESTAB) {
                                SendTCB->tcb_state = TCB_FIN_WAIT1;
                            } else {
                                ASSERT(SendTCB->tcb_state == TCB_CLOSE_WAIT);
                                SendTCB->tcb_state = TCB_LAST_ACK;
                            }
                            SendTCB->tcb_slowcount++;
                            SendTCB->tcb_fastchk |= TCP_FLAG_SLOW;
                            SendTCB->tcb_fastchk |= TCP_FLAG_SEND_AND_DISC;
                            SendTCB->tcb_flags |= FIN_NEEDED;
                            SendReq->tsr_flags |= TSR_FLAG_SEND_AND_DISC;

                            // Extra reference to make sure that
                            // this request will not be completed until the
                            // connection is closed.
                            SendReq->tsr_refcnt++;
                            InterlockedDecrement((PLONG)&TStats.ts_currestab);
                        }

                        // Sample emptiness before enqueuing, so EmptyQ means
                        // "this is the only send now on the queue".
                        EmptyQ = EMPTYQ(&SendTCB->tcb_sendq);
                        ENQUEUE(&SendTCB->tcb_sendq, &SendReq->tsr_req.tr_q);
                        if (SendTCB->tcb_cursend == NULL) {
                            // No current send: this request becomes it.
                            SendTCB->tcb_cursend = SendReq;
                            SendTCB->tcb_sendbuf = SendBuffer;
                            SendTCB->tcb_sendofs = 0;
                            SendTCB->tcb_sendsize = SendLength;
                        }
                        // Send right away if the queue was empty, if
                        // Nagling is off, or if at least one MSS of unsent
                        // data has accumulated. TCPSend is handed
                        // ConnTableHandle together with the locked TCB;
                        // otherwise the TCB lock is released here.
                        if (EmptyQ) {
                            REFERENCE_TCB(SendTCB);
                            TCPSend(SendTCB, ConnTableHandle);
                        } else if (!(SendTCB->tcb_flags & NAGLING) ||
                                   (SendTCB->tcb_unacked -
                                    (SendTCB->tcb_sendmax -
                                     SendTCB->tcb_senduna)) >= SendTCB->tcb_mss) {
                            REFERENCE_TCB(SendTCB);
                            TCPSend(SendTCB, ConnTableHandle);
                        } else
                            CTEFreeLock(&SendTCB->tcb_lock,
                                        ConnTableHandle);

                        return TDI_PENDING;
                    } else
                        Error = TDI_NO_RESOURCES;
                }
            } else
                Error = TDI_INVALID_STATE;

            // Every non-pending outcome with a TCB releases its lock here.
            CTEFreeLock(&SendTCB->tcb_lock, ConnTableHandle);
            return Error;
        } else {
            // Connection exists but has no TCB attached.
            CTEFreeLock(&(Conn->tc_ConnBlock->cb_lock), ConnTableHandle);
            Error = TDI_INVALID_STATE;
        }
    } else
        Error = TDI_INVALID_CONNECTION;

    //CTEFreeLock(&ConnTableLock, ConnTableHandle);
    return Error;

}

#pragma BEGIN_INIT

// Registers (or, with NULL handlers, unregisters) a transport protocol's
// handlers; see its use in InitTCPSend and UnInitTCPSend.
extern void *TLRegisterProtocol(uchar Protocol, void *RcvHandler,
                                void *XmitHandler, void *StatusHandler,
                                void *RcvCmpltHandler, void *PnPHandler,
                                void *ElistHandler);

// TCP receive handler passed to TLRegisterProtocol.
extern IP_STATUS TCPRcv(void *IPContext, IPAddr Dest, IPAddr Src,
                        IPAddr LocalAddr, IPAddr SrcAddr,
                        IPHeader UNALIGNED * IPH, uint IPHLength,
                        IPRcvBuf * RcvBuf,
uint Size, uchar IsBCast, uchar Protocol, IPOptInfo * OptInfo); extern void TCPRcvComplete(void); uchar SendInited = FALSE; //* InitTCPSend - Initialize our send side. // // Called during init time to initialize our TCP send state. // // Input: Nothing. // // Returns: TRUE if we inited, false if we didn't. // int InitTCPSend(void) { NDIS_STATUS Status; TcpHeaderBufferSize = (USHORT)(ALIGN_UP(LocalNetInfo.ipi_hsize,PVOID) + ALIGN_UP((sizeof(TCPHeader) + ALIGNED_TS_OPT_SIZE + ALIGNED_SACK_OPT_SIZE),PVOID) + ALIGN_UP(MAX(MSS_OPT_SIZE, sizeof(SendCmpltContext)),PVOID)); #if BACK_FILL TcpHeaderBufferSize += MAX_BACKFILL_HDR_SIZE; #endif TcpHeaderPool = MdpCreatePool (TcpHeaderBufferSize, 'thCT'); if (!TcpHeaderPool) { return FALSE; } NdisAllocateBufferPool(&Status, &TCPSendBufferPool, NUM_TCP_BUFFERS); if (Status != NDIS_STATUS_SUCCESS) { MdpDestroyPool(TcpHeaderPool); return FALSE; } TCPProtInfo = TLRegisterProtocol(PROTOCOL_TCP, TCPRcv, TCPSendComplete, TCPStatus, TCPRcvComplete, TCPPnPPowerRequest, TCPElistChangeHandler); if (TCPProtInfo == NULL) { MdpDestroyPool(TcpHeaderPool); NdisFreeBufferPool(TCPSendBufferPool); return FALSE; } SendInited = TRUE; return TRUE; } //* UnInitTCPSend - UnInitialize our send side. // // Called during init time if we're going to fail to initialize. // // Input: Nothing. // // Returns: TRUE if we inited, false if we didn't. // void UnInitTCPSend(void) { if (!SendInited) return; TLRegisterProtocol(PROTOCOL_TCP, NULL, NULL, NULL, NULL, NULL, NULL); MdpDestroyPool(TcpHeaderPool); NdisFreeBufferPool(TCPSendBufferPool); } #pragma END_INIT