/* Updated for OS X 10.3.x by Felix Kronlage (fkr@opendarwin.org) * Port to OS X 10.2.x by Rob Braun (bbraun@synack.net) * * Copyright (c) 1996, 2001 Portland State University * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer as * the first lines of this file unmodified. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR/S ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL Portland State University or the authors BE * LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * SACK and FACK implementation in FreeBSD Release-4.3. * * Dr. Suresh Singh, Shiv Saxena and Harkirat Singh * Portland State University * Computer Science Dept. 
- Aug 16, 2001 * * * email: {singh, saxenas, harkirat}@cs.pdx.edu * project page: http://www.cs.pdx.edu/~singh/pacman.html * * */ diff -urN xnu-517.7.21/bsd/kern/sysctl_init.c xnu-517.7.21-SACK/bsd/kern/sysctl_init.c --- xnu-517.7.21/bsd/kern/sysctl_init.c Tue Aug 3 00:22:22 2004 +++ xnu-517.7.21-SACK/bsd/kern/sysctl_init.c Sat Sep 25 06:28:28 2004 @@ -174,6 +174,7 @@ extern struct sysctl_oid sysctl__net_inet_tcp_slowstart_flightsize; extern struct sysctl_oid sysctl__net_inet_tcp_local_slowstart_flightsize; extern struct sysctl_oid sysctl__net_inet_tcp_newreno; +extern struct sysctl_oid sysctl__net_inet_tcp_sack; extern struct sysctl_oid sysctl__net_inet_tcp_tcbhashsize; extern struct sysctl_oid sysctl__net_inet_tcp_do_tcpdrain; extern struct sysctl_oid sysctl__net_inet_tcp_icmp_may_rst; @@ -580,6 +581,7 @@ ,&sysctl__net_inet_tcp_delacktime ,&sysctl__net_inet_tcp_isn_reseed_interval ,&sysctl__net_inet_tcp_msl + ,&sysctl__net_inet_tcp_sack #if TCP_DROP_SYNFIN ,&sysctl__net_inet_tcp_drop_synfin #endif Binary files xnu-517.7.21/bsd/netinet/.tcp.h.rej.swp and xnu-517.7.21-SACK/bsd/netinet/.tcp.h.rej.swp differ diff -urN xnu-517.7.21/bsd/netinet/tcp.h xnu-517.7.21-SACK/bsd/netinet/tcp.h --- xnu-517.7.21/bsd/netinet/tcp.h Tue Aug 3 00:22:22 2004 +++ xnu-517.7.21-SACK/bsd/netinet/tcp.h Sat Sep 25 07:44:52 2004 @@ -107,12 +107,19 @@ #define TCPOPT_SACK_PERMITTED 4 /* Experimental */ #define TCPOLEN_SACK_PERMITTED 2 #define TCPOPT_SACK 5 /* Experimental */ +#define TCPOLEN_SACK 8 /* len of sack block */ #define TCPOPT_TIMESTAMP 8 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ #define TCPOPT_TSTAMP_HDR \ (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP) +#define TCPOPT_SACK_PERMIT_HDR \ + (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED) +#define TCPOPT_SACK_HDR (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8) +#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at sender side */ 
+#define TCP_MAX_SACK 3 /* MAX # SACKs sent in any segment */ + #define TCPOPT_CC 11 /* CC options: RFC-1644 */ #define TCPOPT_CCNEW 12 #define TCPOPT_CCECHO 13 @@ -166,6 +173,7 @@ #define TCP_MAXSEG 0x02 /* set maximum segment size */ #define TCP_NOPUSH 0x04 /* don't push last block of write */ #define TCP_NOOPT 0x08 /* don't use TCP options */ +#define TCP_SACK_DISABLE 0x300 /* disable SACKs (if enabled by def.) */ #define TCP_KEEPALIVE 0x10 /* idle time used when SO_KEEPALIVE is enabled */ #endif diff -urN xnu-517.7.21/bsd/netinet/tcp_input.c xnu-517.7.21-SACK/bsd/netinet/tcp_input.c --- xnu-517.7.21/bsd/netinet/tcp_input.c Tue Aug 3 00:22:22 2004 +++ xnu-517.7.21-SACK/bsd/netinet/tcp_input.c Sat Sep 25 07:42:49 2004 @@ -123,7 +123,7 @@ #define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8)) #define DBG_FNC_TCP_NEWCONN NETDBG_CODE(DBG_NETTCP, (7 << 8)) -static int tcprexmtthresh = 3; +int tcprexmtthresh = 3; tcp_cc tcp_ccgen; extern int apple_hwcksum_rx; @@ -1036,6 +1036,9 @@ } } + if (!tp->sack_disable) + tcp_del_sackholes(tp, th); /* Delete stale SACK holes */ + /* * Segment received on connection. * Reset idle time and keep-alive timer. @@ -1051,6 +1054,11 @@ if (tp->t_state != TCPS_LISTEN && optp) tcp_dooptions(tp, optp, optlen, th, &to); + if (!tp->sack_disable) { + tp->rcv_laststart = th->th_seq; + tp->rcv_lastend = th->th_seq + tlen; + } + /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has @@ -1127,6 +1135,9 @@ tcpstat.tcps_rcvackbyte += acked; sbdrop(&so->so_snd, acked); tp->snd_una = th->th_ack; + tp->snd_last = tp->snd_una; + tp->snd_fack = tp->snd_una; + tp->retran_data = 0; m_freem(m); ND6_HINT(tp); /* some progress has been done */ @@ -1158,6 +1169,8 @@ * with nothing on the reassembly queue and * we have enough buffer space to take it. 
*/ + if (!tp->sack_disable && tp->rcv_numsacks ) + tcp_clean_sackreport(tp); ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; tcpstat.tcps_rcvpack++; @@ -1323,6 +1336,9 @@ bzero(taop, sizeof(*taop)); } tcp_dooptions(tp, optp, optlen, th, &to); + if (!tp->sack_disable) + if ((tp->t_flags & TF_SACK_PERMIT) == 0) + tp->sack_disable = 1; if (iss) tp->iss = iss; else { @@ -1330,6 +1346,10 @@ } tp->irs = th->th_seq; tcp_sendseqinit(tp); + tp->snd_last = tp->snd_una; + tp->snd_fack = tp->snd_una; + tp->retran_data = 0; + tp->snd_awnd = 0; tcp_rcvseqinit(tp); tp->snd_recover = tp->snd_una; /* @@ -1505,6 +1525,9 @@ } } else tp->t_flags &= ~TF_RCVD_CC; + if (!tp->sack_disable) + if ((tp->t_flags & TF_SACK_PERMIT) == 0) + tp->sack_disable = 1; tcpstat.tcps_connects++; soisconnected(so); /* Do window scaling on this connection? */ @@ -2028,25 +2051,30 @@ if (tp->t_timer[TCPT_REXMT] == 0 || th->th_ack != tp->snd_una) tp->t_dupacks = 0; - else if (++tp->t_dupacks == tcprexmtthresh) { + else if (++tp->t_dupacks == tcprexmtthresh || + ((SEQ_GT(tp->snd_fack, tcprexmtthresh * + tp->t_maxseg + tp->snd_una)) && + SEQ_GT(tp->snd_una, tp->snd_last))) { tcp_seq onxt = tp->snd_nxt; u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; - if (tcp_do_newreno && SEQ_LT(th->th_ack, - tp->snd_recover)) { - /* False retransmit, should not - * cut window - */ - tp->snd_cwnd += tp->t_maxseg; + if (SEQ_LT(th->th_ack, tp->snd_last)) { tp->t_dupacks = 0; - (void) tcp_output(tp); goto drop; } if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; - tp->snd_recover = tp->snd_max; + tp->snd_last = tp->snd_max; + if (!tp->sack_disable) { + tp->t_timer[TCPT_REXMT] = 0; + tp->t_rtttime = 0; + tp->t_dupacks = tcprexmtthresh; + (void) tcp_output(tp); + tp->snd_cwnd = tp->snd_ssthresh; + goto drop; + } tp->t_timer[TCPT_REXMT] = 0; tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; @@ -2058,6 +2086,10 @@ tp->snd_nxt = onxt; goto drop; } else if (tp->t_dupacks > tcprexmtthresh) { + if (!tp->sack_disable) { 
+ if (tp->snd_awnd < tp->snd_cwnd) + tcp_output(tp); + } tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; @@ -2070,27 +2102,33 @@ * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (tcp_do_newreno == 0) { - if (tp->t_dupacks >= tcprexmtthresh && - tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; - tp->t_dupacks = 0; - } else if (tp->t_dupacks >= tcprexmtthresh && - !tcp_newreno(tp, th)) { - /* - * Window inflation should have left us with approx. - * snd_ssthresh outstanding data. But in case we - * would be inclined to send a burst, better to do - * it via the slow start mechanism. - */ - if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) - tp->snd_cwnd = - tp->snd_max - th->th_ack + tp->t_maxseg; - else - tp->snd_cwnd = tp->snd_ssthresh; - tp->t_dupacks = 0; - } - + if (!tp->sack_disable) { + if (tp->t_dupacks >= tcprexmtthresh) { + if (tcp_sack_partialack(tp, th)) { + if (tp->snd_awnd < tp->snd_cwnd) + needoutput = 1; + } else { + tp->snd_cwnd = tp->snd_ssthresh; + if (tcp_seq_subtract(tp->snd_max, + th->th_ack) < tp->snd_ssthresh) + tp->snd_cwnd = + tcp_seq_subtract(tp->snd_max, th->th_ack); + tp->t_dupacks = 0; + if (SEQ_GT(th->th_ack, tp->snd_fack)) + tp->snd_fack = th->th_ack; + } + } + } else { + if (tp->t_dupacks >= tcprexmtthresh && + !tcp_newreno(tp, th)) { + tp->snd_cwnd = tp->snd_ssthresh; + if (tcp_seq_subtract(tp->snd_max, th->th_ack) < + tp->snd_ssthresh) + tp->snd_cwnd = tcp_seq_subtract(tp->snd_max, + th->th_ack); + tp->t_dupacks = 0; + } + } if (tp->t_dupacks < tcprexmtthresh) tp->t_dupacks = 0; @@ -2191,7 +2229,7 @@ * in NewReno fast recovery mode, so we leave the congestion * window alone. 
*/ - if (tcp_do_newreno == 0 || tp->t_dupacks == 0) + if (tp->t_dupacks < tcprexmtthresh) tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale); } if (acked > so->so_snd.sb_cc) { @@ -2206,6 +2244,11 @@ tp->snd_una = th->th_ack; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; + if (SEQ_GT(tp->snd_una, tp->snd_fack)) { + tp->snd_fack = tp->snd_una; + tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, + tp->snd_fack) + tp->retran_data; + } sowwakeup(so); switch (tp->t_state) { @@ -2430,6 +2473,10 @@ } } + + if (!tp->sack_disable) + tcp_update_sack_list(tp); + /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's @@ -2739,12 +2786,417 @@ (char *)&to->to_ccecho, sizeof(to->to_ccecho)); NTOHL(to->to_ccecho); break; + case TCPOPT_SACK_PERMITTED: + if (tp->sack_disable || optlen!=TCPOLEN_SACK_PERMITTED) + continue; + if (th->th_flags & TH_SYN) + tp->t_flags |= TF_SACK_PERMIT; + break; + case TCPOPT_SACK: + if (tcp_sack_option(tp, th, cp, optlen)) + continue; + break; } } if (th->th_flags & TH_SYN) tcp_mss(tp, mss); /* sets t_maxseg */ } +u_long +tcp_seq_subtract(a, b) + u_long a, b; +{ + return ((long)(a - b)); +} + +/* + * This function is called upon receipt of new valid data (while not in header + * prediction mode), and it updates the ordered list of sacks. + */ +void +tcp_update_sack_list(tp) + struct tcpcb *tp; +{ + /* + * First reported block MUST be the most recent one. Subsequent + * blocks SHOULD be in the order in which they arrived at the + * receiver. These two conditions make the implementation fully + * compliant with RFC 2018. 
+ */ + int i, j = 0, count = 0, lastpos = -1; + struct sackblk sack, firstsack, temp[MAX_SACK_BLKS]; + + /* First clean up current list of sacks */ + for (i = 0; i < tp->rcv_numsacks; i++) { + sack = tp->sackblks[i]; + if (sack.start == 0 && sack.end == 0) { + count++; /* count = number of blocks to be discarded */ + continue; + } + if (SEQ_LEQ(sack.end, tp->rcv_nxt)) { + tp->sackblks[i].start = tp->sackblks[i].end = 0; + count++; + } else { + temp[j].start = tp->sackblks[i].start; + temp[j++].end = tp->sackblks[i].end; + } + } + tp->rcv_numsacks -= count; + if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */ + tcp_clean_sackreport(tp); + if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) { + /* ==> need first sack block */ + tp->sackblks[0].start = tp->rcv_laststart; + tp->sackblks[0].end = tp->rcv_lastend; + tp->rcv_numsacks = 1; + } + return; + } + /* Otherwise, sack blocks are already present. */ + for (i = 0; i < tp->rcv_numsacks; i++) + tp->sackblks[i] = temp[i]; /* first copy back sack list */ + if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend)) + return; /* sack list remains unchanged */ + /* + * From here, segment just received should be (part of) the 1st sack. + * Go through list, possibly coalescing sack block entries. + */ + firstsack.start = tp->rcv_laststart; + firstsack.end = tp->rcv_lastend; + for (i = 0; i < tp->rcv_numsacks; i++) { + sack = tp->sackblks[i]; + if (SEQ_LT(sack.end, firstsack.start) || + SEQ_GT(sack.start, firstsack.end)) + continue; /* no overlap */ + if (sack.start == firstsack.start && sack.end == firstsack.end){ + /* + * identical block; delete it here since we will + * move it to the front of the list. 
+ */ + tp->sackblks[i].start = tp->sackblks[i].end = 0; + lastpos = i; /* last posn with a zero entry */ + continue; + } + if (SEQ_LEQ(sack.start, firstsack.start)) + firstsack.start = sack.start; /* merge blocks */ + if (SEQ_GEQ(sack.end, firstsack.end)) + firstsack.end = sack.end; /* merge blocks */ + tp->sackblks[i].start = tp->sackblks[i].end = 0; + lastpos = i; /* last posn with a zero entry */ + } + if (lastpos != -1) { /* at least one merge */ + for (i = 0, j = 1; i < tp->rcv_numsacks; i++) { + sack = tp->sackblks[i]; + if (sack.start == 0 && sack.end == 0) + continue; + temp[j++] = sack; + } + tp->rcv_numsacks = j; /* including first blk (added later) */ + for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */ + tp->sackblks[i] = temp[i]; + } else { /* no merges -- shift sacks by 1 */ + if (tp->rcv_numsacks < MAX_SACK_BLKS) + tp->rcv_numsacks++; + for (i = tp->rcv_numsacks-1; i > 0; i--) + tp->sackblks[i] = tp->sackblks[i-1]; + } + tp->sackblks[0] = firstsack; + return; +} + +/* + * Process the TCP SACK option. Returns 1 if tcp_dooptions() should continue, + * and 0 otherwise, if the option was fine. tp->snd_holes is an ordered list + * of holes (oldest to newest, in terms of the sequence space). 
+ */ +int +tcp_sack_option(tp, th, cp, optlen) + struct tcpcb *tp; + struct tcphdr *th; + u_char *cp; + int optlen; +{ + int tmp_olen; + u_char *tmp_cp; + struct sackhole *cur, *p, *temp; + + if (tp->sack_disable) + return 1; + + /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ + if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) + return 1; + tmp_cp = cp + 2; + tmp_olen = optlen - 2; + if (tp->snd_numholes < 0) + tp->snd_numholes = 0; + if (tp->t_maxseg == 0) + panic("tcp_sack_option"); /* Should never happen */ + while (tmp_olen > 0) { + struct sackblk sack; + + bcopy((char *) tmp_cp, (char *) &(sack.start), sizeof(tcp_seq)); + NTOHL(sack.start); + bcopy((char *) tmp_cp + sizeof(tcp_seq), + (char *) &(sack.end), sizeof(tcp_seq)); + NTOHL(sack.end); + tmp_olen -= TCPOLEN_SACK; + tmp_cp += TCPOLEN_SACK; + if (SEQ_LEQ(sack.end, sack.start)) + continue; /* bad SACK fields */ + if (SEQ_LEQ(sack.end, tp->snd_una)) + continue; /* old block */ + /* Updates snd_fack. */ + if (SEQ_GEQ(sack.end, tp->snd_fack)) + tp->snd_fack = sack.end; + if (SEQ_GT(th->th_ack, tp->snd_una)) { + if (SEQ_LT(sack.start, th->th_ack)) + continue; + } else { + if (SEQ_LT(sack.start, tp->snd_una)) + continue; + } + if (SEQ_GT(sack.end, tp->snd_max)) + continue; + if (tp->snd_holes == 0) { /* first hole */ + tp->snd_holes = (struct sackhole *) + _MALLOC(sizeof(struct sackhole), M_PCB, M_NOWAIT); + if (tp->snd_holes == NULL) { + /* ENOBUFS, so ignore SACKed block for now*/ + continue; + } + cur = tp->snd_holes; + cur->start = th->th_ack; + cur->end = sack.start; + cur->rxmit = cur->start; + cur->next = 0; + tp->snd_numholes = 1; + tp->rcv_lastsack = sack.end; + /* + * dups is at least one. If more data has been + * SACKed, it can be greater than one. 
+ */ + cur->dups = min(tcprexmtthresh, + ((sack.end - cur->end)/tp->t_maxseg)); + if (cur->dups < 1) + cur->dups = 1; + continue; /* with next sack block */ + } + /* Go thru list of holes: p = previous, cur = current */ + p = cur = tp->snd_holes; + while (cur) { + if (SEQ_LEQ(sack.end, cur->start)) + /* SACKs data before the current hole */ + break; /* no use going through more holes */ + if (SEQ_GEQ(sack.start, cur->end)) { + /* SACKs data beyond the current hole */ + cur->dups++; + if ( ((sack.end - cur->end)/tp->t_maxseg) >= + tcprexmtthresh) + cur->dups = tcprexmtthresh; + p = cur; + cur = cur->next; + continue; + } + if (SEQ_LEQ(sack.start, cur->start)) { + /* Data acks at least the beginning of hole */ + if (SEQ_GT(sack.end, cur->rxmit)) + tp->retran_data -= + tcp_seq_subtract(cur->rxmit, + cur->start); + else + tp->retran_data -= + tcp_seq_subtract(sack.end, + cur->start); + if (SEQ_GEQ(sack.end,cur->end)){ + /* Acks entire hole, so delete hole */ + if (p != cur) { + p->next = cur->next; + FREE(cur, M_PCB); + cur = p->next; + } else { + cur=cur->next; + FREE(p, M_PCB); + p = cur; + tp->snd_holes = p; + } + tp->snd_numholes--; + continue; + } + /* otherwise, move start of hole forward */ + cur->start = sack.end; + cur->rxmit = max (cur->rxmit, cur->start); + p = cur; + cur = cur->next; + continue; + } + /* move end of hole backward */ + if (SEQ_GEQ(sack.end, cur->end)) { + if (SEQ_GT(cur->rxmit, sack.start)) + tp->retran_data -= + tcp_seq_subtract(cur->rxmit, + sack.start); + cur->end = sack.start; + cur->rxmit = min (cur->rxmit, cur->end); + cur->dups++; + if ( ((sack.end - cur->end)/tp->t_maxseg) >= + tcprexmtthresh) + cur->dups = tcprexmtthresh; + p = cur; + cur = cur->next; + continue; + } + if (SEQ_LT(cur->start, sack.start) && + SEQ_GT(cur->end, sack.end)) { + /* + * ACKs some data in middle of a hole; need to + * split current hole + */ + temp = (struct sackhole *)_MALLOC(sizeof(*temp), + M_PCB,M_NOWAIT); + if (temp == NULL) + break; /* ENOBUFS; cur is unchanged, "continue" would spin */ + 
if (SEQ_GT(cur->rxmit, sack.end)) + tp->retran_data -= + tcp_seq_subtract(sack.end, + sack.start); + else if (SEQ_GT(cur->rxmit, sack.start)) + tp->retran_data -= + tcp_seq_subtract(cur->rxmit, + sack.start); + temp->next = cur->next; + temp->start = sack.end; + temp->end = cur->end; + temp->dups = cur->dups; + temp->rxmit = max (cur->rxmit, temp->start); + cur->end = sack.start; + cur->rxmit = min (cur->rxmit, cur->end); + cur->dups++; + if ( ((sack.end - cur->end)/tp->t_maxseg) >= + tcprexmtthresh) + cur->dups = tcprexmtthresh; + cur->next = temp; + p = temp; + cur = p->next; + tp->snd_numholes++; + } + } + /* At this point, p points to the last hole on the list */ + if (SEQ_LT(tp->rcv_lastsack, sack.start)) { + /* + * Need to append new hole at end. + * Last hole is p (and it's not NULL). + */ + temp = (struct sackhole *) _MALLOC(sizeof(*temp), + M_PCB, M_NOWAIT); + if (temp == NULL) + continue; /* ENOBUFS */ + temp->start = tp->rcv_lastsack; + temp->end = sack.start; + temp->dups = min(tcprexmtthresh, + ((sack.end - sack.start)/tp->t_maxseg)); + if (temp->dups < 1) + temp->dups = 1; + temp->rxmit = temp->start; + temp->next = 0; + p->next = temp; + tp->rcv_lastsack = sack.end; + tp->snd_numholes++; + } + } + /* + * Update retran_data and snd_awnd. Go through the list of + * holes. Increment retran_data by (hole->rxmit - hole->start). + */ + tp->retran_data = 0; + cur = tp->snd_holes; + while (cur) { + tp->retran_data += cur->rxmit - cur->start; + cur = cur->next; + } + tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) + + tp->retran_data; + + return 0; +} + +/* + * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if + * it is completely acked; otherwise, tcp_sack_option(), called from + * tcp_dooptions(), will fix up the hole. 
+ */ +void +tcp_del_sackholes(tp, th) + struct tcpcb *tp; + struct tcphdr *th; +{ + if (!tp->sack_disable && tp->t_state != TCPS_LISTEN) { + /* max because this could be an older ack just arrived */ + tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? + th->th_ack : tp->snd_una; + struct sackhole *cur = tp->snd_holes; + struct sackhole *prev = cur; + while (cur) + if (SEQ_LEQ(cur->end, lastack)) { + cur = cur->next; + FREE(prev, M_PCB); + prev = cur; + tp->snd_numholes--; + } else if (SEQ_LT(cur->start, lastack)) { + cur->start = lastack; + if (SEQ_LT(cur->rxmit, cur->start)) + cur->rxmit = cur->start; + break; + } else + break; + tp->snd_holes = cur; + } +} + +/* + * Delete all receiver-side SACK information. + */ +void +tcp_clean_sackreport(tp) + struct tcpcb *tp; +{ + int i; + + tp->rcv_numsacks = 0; + for (i = 0; i < MAX_SACK_BLKS; i++) + tp->sackblks[i].start = tp->sackblks[i].end=0; + +} + +/* + * Checks for partial ack. If partial ack arrives, turn off retransmission + * timer, deflate the window, do not clear tp->t_dupacks, and return 1. + * If the ack advances at least to tp->snd_last, return 0. + */ +int +tcp_sack_partialack(tp, th) + struct tcpcb *tp; + struct tcphdr *th; +{ + if (SEQ_LT(th->th_ack, tp->snd_last)) { + /* Turn off retx. timer (will start again next segment) */ + tp->t_timer[TCPT_REXMT] = 0; + tp->t_rtttime = 0; + /* + * Partial window deflation. This statement relies on the + * fact that tp->snd_una has not been updated yet. In FACK + * hold snd_cwnd constant during fast recovery. + */ + if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) { + tp->snd_cwnd -= th->th_ack - tp->snd_una; + tp->snd_cwnd += tp->t_maxseg; + } else + tp->snd_cwnd = tp->t_maxseg; + return 1; + } + return 0; +} + /* * Pull out of band byte out of a segment so * it doesn't appear in the user's data queue. 
@@ -3172,7 +3624,7 @@ struct tcpcb *tp; struct tcphdr *th; { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (SEQ_LT(th->th_ack, tp->snd_last)) { tcp_seq onxt = tp->snd_nxt; u_long ocwnd = tp->snd_cwnd; #ifdef __APPLE__ diff -urN xnu-517.7.21/bsd/netinet/tcp_output.c xnu-517.7.21-SACK/bsd/netinet/tcp_output.c --- xnu-517.7.21/bsd/netinet/tcp_output.c Tue Aug 3 00:22:22 2004 +++ xnu-517.7.21-SACK/bsd/netinet/tcp_output.c Sat Sep 25 07:41:16 2004 @@ -133,6 +133,95 @@ extern u_long route_generation; +extern int tcprexmtthresh; + +#ifdef TCP_SACK_DEBUG +void +tcp_print_holes(tp) +struct tcpcb *tp; +{ + struct sackhole *p = tp->snd_holes; + if (p == 0) + return; + printf("Hole report: start--end dups rxmit\n"); + while (p) { + printf("%d--%d d %d r %d\n", p->start, p->end, p->dups, + p->rxmit); + p = p->next; + } + printf("\n"); +} +#endif /* TCP_SACK_DEBUG */ + + /* + * Returns pointer to a sackhole if there are any pending retransmissions; + * NULL otherwise. + */ +struct sackhole * +tcp_sack_output(tp) +register struct tcpcb *tp; +{ + struct sackhole *p; + if (tp->sack_disable) + return 0; + p = tp->snd_holes; + while (p) { + /* In FACK, if p->dups is less than tcprexmtthresh, but + * snd_fack advances more than tcprextmtthresh * tp->t_maxseg, + * tcp_input() will try fast retransmit. This forces output. + */ + if ((p->dups >= tcprexmtthresh || + tp->t_dupacks == tcprexmtthresh) && + SEQ_LT(p->rxmit, p->end)) { + if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ + p = p->next; + continue; + } +#ifdef TCP_SACK_DEBUG + if (p) + tcp_print_holes(tp); +#endif + return p; + } + p = p->next; + } + return 0; +} + +/* + * After a timeout, the SACK list may be rebuilt. This SACK information + * should be used to avoid retransmitting SACKed data. This function + * traverses the SACK list to see if snd_nxt should be moved forward. 
+ */ +void +tcp_sack_adjust(tp) + struct tcpcb *tp; +{ + struct sackhole *cur = tp->snd_holes; + if (cur == 0) + return; /* No holes */ + if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack)) + return; /* We're already beyond any SACKed blocks */ + /* + * Two cases for which we want to advance snd_nxt: + * i) snd_nxt lies between end of one hole and beginning of another + * ii) snd_nxt lies between end of last hole and rcv_lastsack + */ + while (cur->next) { + if (SEQ_LT(tp->snd_nxt, cur->end)) + return; + if (SEQ_GEQ(tp->snd_nxt, cur->next->start)) + cur = cur->next; + else { + tp->snd_nxt = cur->next->start; + return; + } + } + if (SEQ_LT(tp->snd_nxt, cur->end)) + return; + tp->snd_nxt = tp->rcv_lastsack; + return; +} /* * Tcp output routine: figure out what should be sent and send it. */ @@ -160,6 +249,8 @@ int m_off; struct mbuf *m_last = 0; struct mbuf *m_head = 0; + int i, sack_rxmit = 0; + struct sackhole *p; #if INET6 int isipv6 = tp->t_inpcb->inp_vflag & INP_IPV6 ; #endif @@ -258,8 +349,13 @@ } } sendalot = 0; + if (!tp->sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max)) + tcp_sack_adjust(tp); off = tp->snd_nxt - tp->snd_una; - win = min(tp->snd_wnd, tp->snd_cwnd); + if (!tp->sack_disable && (tp->t_dupacks > tcprexmtthresh)) + win = tp->snd_wnd; + else + win = min(tp->snd_wnd, tp->snd_cwnd); if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) win = min(win, slowlink_wsize); @@ -273,6 +369,16 @@ if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; + if (!tp->sack_disable && !sendalot) { + if (tp->t_dupacks >= tcprexmtthresh && + (p = tcp_sack_output(tp))) { + off = p->rxmit - tp->snd_una; + sack_rxmit = 1; + len = min(tp->t_maxseg, p->end - p->rxmit); + } + } + sendalot = 0; + /* * If in persist timeout with window of 0, send 1 byte. 
* Otherwise, if window is small but nonzero @@ -306,7 +412,12 @@ } } - len = (long)ulmin(so->so_snd.sb_cc, win) - off; + if (!sack_rxmit) { + + len = (long)ulmin(so->so_snd.sb_cc, win) - off; + if (!tp->sack_disable && len && SEQ_GT(tp->snd_last, tp->snd_una) && (tp->snd_awnd >= tp->snd_cwnd)) + len = 0; + } if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { taop = &tao_noncached; @@ -397,6 +508,8 @@ goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) goto send; + if (sack_rxmit) + goto send; } /* @@ -440,6 +553,11 @@ ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; + if (SEQ_GT(tp->snd_max, tp->snd_una) && !tp->t_timer[TCPT_REXMT] && + !tp->t_timer[TCPT_PERSIST]) { + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + } + /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window @@ -501,6 +619,13 @@ (void)memcpy(opt + 2, &mss, sizeof(mss)); optlen = TCPOLEN_MAXSEG; + if (!tp->sack_disable && ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_SACK_PERMIT))) { + *((u_int32_t *) (opt + optlen)) = + htonl(TCPOPT_SACK_PERMIT_HDR); + optlen += 4; + } + if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_SCALE))) { @@ -532,6 +657,28 @@ optlen += TCPOLEN_TSTAMP_APPA; } + if (!tp->sack_disable && tp->t_state == TCPS_ESTABLISHED && + (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT && + tp->rcv_numsacks) { + + u_int32_t *lp = (u_int32_t *)(opt + optlen); + u_int32_t *olp = lp++; + int count = 0; /* actual number of SACKs inserted */ + int maxsack = (TCP_MAXOLEN - (optlen + 4))/TCPOLEN_SACK; + + maxsack = min(maxsack, TCP_MAX_SACK); + for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) { + struct sackblk sack = tp->sackblks[i]; + if (sack.start == 0 && sack.end == 0) + continue; + *lp++ = htonl(sack.start); + *lp++ = htonl(sack.end); + count++; + } + *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2)); + optlen += TCPOLEN_SACK*count + 4; 
/* including leading NOPs */ + } + /* * Send `CC-family' options if our side wants to use them (TF_REQ_CC), * options are allowed (!TF_NOOPT) and it's not a RST. @@ -839,6 +986,13 @@ th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); + if (sack_rxmit) { + if (sendalot) + sendalot = 0; + th->th_seq = htonl(p->rxmit); + p->rxmit += len; + tp->retran_data += len; + } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); @@ -921,6 +1075,11 @@ tp->t_flags |= TF_SENTFIN; } } + if (!tp->sack_disable) { + if (sack_rxmit && (p->rxmit != tp->snd_nxt)) { + goto timer; + } + } tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; @@ -943,6 +1102,17 @@ * Initialize shift counter which is used for backoff * of retransmit time. */ +timer: + if (!tp->sack_disable && sack_rxmit && + tp->t_timer[TCPT_REXMT] == 0 && + tp->snd_nxt != tp->snd_una) { + tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + if (tp->t_timer[TCPT_PERSIST]) { + tp->t_timer[TCPT_PERSIST] = 0; + tp->t_rxtshift = 0; + } + } + /* This seems to duplicate the above... 
--bbraun */ if (tp->t_timer[TCPT_REXMT] == 0 && tp->snd_nxt != tp->snd_una) { tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; @@ -1053,6 +1223,8 @@ error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, (so->so_options & SO_DONTROUTE), 0); } + tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) + + tp->retran_data; if (error) { /* diff -urN xnu-517.7.21/bsd/netinet/tcp_subr.c xnu-517.7.21-SACK/bsd/netinet/tcp_subr.c --- xnu-517.7.21/bsd/netinet/tcp_subr.c Tue Aug 3 00:22:22 2004 +++ xnu-517.7.21-SACK/bsd/netinet/tcp_subr.c Sat Sep 25 06:28:28 2004 @@ -180,6 +180,10 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); +int tcp_do_sack = 1; +SYSCTL_INT(_net_inet_tcp, TCPCTL_SACK, sack, CTLFLAG_RW, &tcp_do_sack, 0, + "Experimental Sack"); + static void tcp_cleartaocache __P((void)); static void tcp_notify __P((struct inpcb *, int)); @@ -628,6 +632,8 @@ callout_init(tp->tt_delack = &it->inp_tp_delack); #endif + tp->sack_disable = tcp_do_sack ? 
0 : 1; + if (tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (tcp_do_rfc1644) @@ -703,6 +709,7 @@ register struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; + struct sackhole *p, *q_sack; #if INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ @@ -868,6 +875,13 @@ no_valid_rt: /* free the reassembly queue, if any */ (void) tcp_freeq(tp); + + q_sack = p = tp->snd_holes; + while (p != 0) { + q_sack = p->next; + FREE(p, M_PCB); + p = q_sack; + } #ifdef __APPLE__ if (so->cached_in_sock_layer) diff -urN xnu-517.7.21/bsd/netinet/tcp_timer.c xnu-517.7.21-SACK/bsd/netinet/tcp_timer.c --- xnu-517.7.21/bsd/netinet/tcp_timer.c Tue Aug 3 00:22:22 2004 +++ xnu-517.7.21-SACK/bsd/netinet/tcp_timer.c Sat Sep 25 06:28:28 2004 @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -256,6 +257,7 @@ splx(s); return; } + /* * Search through tcb's and update active timers. */ @@ -367,6 +369,7 @@ register int rexmt; struct socket *so_tmp; struct tcptemp *t_template; + struct sackhole *p, *q; #if TCPDEBUG int ostate; @@ -386,6 +389,16 @@ * control block. Otherwise, check again in a bit. */ case TCPT_2MSL: + q = p = tp->snd_holes; + while (p != 0) { + q = p->next; + FREE(p, M_PCB); + p = q; + } + tp->snd_holes = 0; + tp->snd_fack = tp->snd_una; + tp->retran_data = 0; + tp->snd_awnd = 0; if (tp->t_state != TCPS_TIME_WAIT && tp->t_rcvtime <= tcp_maxidle) { tp->t_timer[TCPT_2MSL] = tcp_keepintvl; @@ -401,6 +414,16 @@ * to a longer retransmit interval and retransmit one segment. 
*/ case TCPT_REXMT: + q = p = tp->snd_holes; + while (p != 0) { + q = p->next; + FREE(p, M_PCB); + p = q; + } + tp->snd_holes = 0; + tp->snd_fack = tp->snd_una; + tp->retran_data = 0; + tp->snd_awnd = 0; if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; tcpstat.tcps_timeoutdrop++; @@ -466,7 +489,7 @@ * Note: We overload snd_recover to function also as the * snd_last variable described in RFC 2582 */ - tp->snd_recover = tp->snd_max; + tp->snd_last = tp->snd_max; /* * Force a segment to be sent. */ diff -urN xnu-517.7.21/bsd/netinet/tcp_usrreq.c xnu-517.7.21-SACK/bsd/netinet/tcp_usrreq.c --- xnu-517.7.21/bsd/netinet/tcp_usrreq.c Tue Aug 3 00:22:22 2004 +++ xnu-517.7.21-SACK/bsd/netinet/tcp_usrreq.c Sat Sep 25 06:28:28 2004 @@ -797,6 +797,11 @@ tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); + tp->snd_last = tp->snd_una; + tp->snd_fack = tp->snd_una; + tp->retran_data = 0; + tp->snd_awnd = 0; + /* * Generate a CC value for this connection and * check whether CC or CCnew should be used. @@ -1022,6 +1027,9 @@ break; case TCP_NOPUSH: optval = tp->t_flags & TF_NOPUSH; + break; + case TCP_SACK_DISABLE: + optval = tp->sack_disable; break; default: error = ENOPROTOOPT; diff -urN xnu-517.7.21/bsd/netinet/tcp_var.h xnu-517.7.21-SACK/bsd/netinet/tcp_var.h --- xnu-517.7.21/bsd/netinet/tcp_var.h Tue Aug 3 00:22:22 2004 +++ xnu-517.7.21-SACK/bsd/netinet/tcp_var.h Sat Sep 25 07:42:43 2004 @@ -63,6 +63,19 @@ #define N_TIME_WAIT_SLOTS 128 /* must be power of 2 */ +struct sackblk { + tcp_seq start; /* start seq no. of sack block */ + tcp_seq end; /* end seq no. */ +}; + +struct sackhole { + tcp_seq start; /* start seq no. of hole */ + tcp_seq end; /* end seq no. */ + int dups; /* number of dup(s)acks for this hole */ + tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ + struct sackhole *next; /* next in list */ +}; + /* * Kernel variables for tcp. 
*/ @@ -151,6 +164,20 @@ u_long rcv_wnd; /* receive window */ tcp_seq rcv_up; /* receive urgent pointer */ + int sack_disable; /* disable SACK for this connection */ + int snd_numholes; /* number of holes seen by sender */ + struct sackhole *snd_holes; /* linked list of holes (sorted) */ + tcp_seq snd_fack; /* for FACK congestion control */ + u_long snd_awnd; /* snd_nxt - snd_fack + */ + /* retransmitted data */ + int retran_data; /* amount of outstanding retx. data */ + tcp_seq snd_last; /* for use in fast recovery */ + tcp_seq rcv_laststart; /* start of last segment recd. */ + tcp_seq rcv_lastend; /* end of ... */ + tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/ + int rcv_numsacks; /* # distinct sack blks present */ + struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ + u_long snd_wnd; /* send window */ u_long snd_cwnd; /* congestion-controlled window */ u_long snd_ssthresh; /* snd_cwnd size threshold for @@ -493,7 +520,8 @@ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ -#define TCPCTL_MAXID 14 +#define TCPCTL_SACK 14 +#define TCPCTL_MAXID 15 #define TCPCTL_NAMES { \ { 0, 0 }, \ @@ -510,6 +538,7 @@ { "pcblist", CTLTYPE_STRUCT }, \ { "delacktime", CTLTYPE_INT }, \ { "v6mssdflt", CTLTYPE_INT }, \ + { "sack", CTLTYPE_INT }, \ } #ifdef __APPLE_API_PRIVATE @@ -522,6 +551,7 @@ extern struct inpcbinfo tcbinfo; extern struct tcpstat tcpstat; /* tcp statistics */ extern int tcp_mssdflt; /* XXX */ +extern int tcp_do_sack; extern int tcp_minmss; extern int tcp_delack_enabled; extern int tcp_do_newreno; @@ -567,6 +597,17 @@ tcp_timers __P((struct tcpcb *, int)); void tcp_trace __P((int, int, struct tcpcb *, void *, struct tcphdr *, int)); +int tcp_sack_option __P((struct tcpcb *,struct tcphdr *,u_char *,int)); +void tcp_update_sack_list __P((struct tcpcb *tp)); +void tcp_del_sackholes __P((struct tcpcb *, 
struct tcphdr *)); +void tcp_clean_sackreport __P((struct tcpcb *tp)); +void tcp_sack_adjust __P((struct tcpcb *tp)); +struct sackhole * + tcp_sack_output __P((struct tcpcb *tp)); +int tcp_sack_partialack __P((struct tcpcb *, struct tcphdr *)); + +u_long tcp_seq_subtract __P((u_long, u_long )); + extern struct pr_usrreqs tcp_usrreqs; extern u_long tcp_sendspace;