diff -u -r -x *.[ao] -x *CVS* -x Makefile -x *.flags -x *.depend linux-2.3.10-cvs/include/linux/sysctl.h linux/include/linux/sysctl.h --- linux-2.3.10-cvs/include/linux/sysctl.h Mon Jul 12 12:29:05 1999 +++ linux/include/linux/sysctl.h Mon Jul 12 13:18:59 1999 @@ -221,7 +221,11 @@ NET_IPV4_ICMP_PARAMPROB_RATE=62, NET_IPV4_ICMP_ECHOREPLY_RATE=63, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES=64, - NET_IPV4_IGMP_MAX_MEMBERSHIPS=65 + NET_IPV4_IGMP_MAX_MEMBERSHIPS=65, + NET_IPV4_TCP_VEGAS = 66, + NET_IPV4_TCP_VEGAS_ALPHA = 67, + NET_IPV4_TCP_VEGAS_BETA = 68, + NET_IPV4_TCP_VEGAS_GAMMA = 69 }; enum { diff -u -r -x *.[ao] -x *CVS* -x Makefile -x *.flags -x *.depend linux-2.3.10-cvs/include/net/sock.h linux/include/net/sock.h --- linux-2.3.10-cvs/include/net/sock.h Mon Jul 12 12:29:10 1999 +++ linux/include/net/sock.h Tue Jul 13 13:53:35 1999 @@ -306,6 +306,23 @@ struct open_request **syn_wait_last; int syn_backlog; /* Backlog of received SYNs */ + + /* Vegas variables */ + __u32 v_beg_snd_nxt; /* saves right edge of bytes sent during last RTT */ + __u32 v_beg_snd_una; /* saves the left edge of bytes sent during last RTT */ + __u32 v_beg_snd_cwnd; /* saves snd_cwnd as of the start of the current RTT */ + + __u8 v_do_vegas; /* if true, do vegas for this connection */ + __u8 v_doing_vegas_now;/* if true, do vegas for this RTT */ + + __u16 v_cntRTT; /* # of RTT samples taken within last RTT */ + __u32 v_minRTT; /* min of RTTs measured within last RTT (in usec) */ + + __u32 v_baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ + + /* for Vegas DEBUGGING only -- remove when done */ + __u32 v_isn; /* initial seq. number for this connection */ + /* end for Vegas DEBUGGING only */ }; @@ -350,8 +367,22 @@ #define SOCK_DEBUGGING #ifdef SOCK_DEBUGGING #define SOCK_DEBUG(sk, msg...) do { if((sk) && ((sk)->debug)) printk(KERN_DEBUG ## msg); } while (0) +#define SOCK_INSTRUMENT(sk, msg...) 
do { \ + if((sk) && ((sk)->debug)) { \ + struct timeval tv; \ + get_fast_time(&tv); \ + printk(" %lu.%06lu: ", \ + tv.tv_sec, tv.tv_usec); \ + printk(msg); \ + } \ + } while (0) +#define TCP_INSTRUMENT(tp, msg...) do { \ + if ((tp)->send_head)\ + SOCK_INSTRUMENT(((tp)->send_head)->sk, msg); } while (0) #else +#define SOCK_INSTRUMENT(sk, msg...) do { } while (0) #define SOCK_DEBUG(sk, msg...) do { } while (0) +#define TCP_INSTRUMENT(tp, msg...) do { } while (0) #endif /* This is the per-socket lock. The spinlock provides a synchronization diff -u -r -x *.[ao] -x *CVS* -x Makefile -x *.flags -x *.depend linux-2.3.10-cvs/include/net/tcp.h linux/include/net/tcp.h --- linux-2.3.10-cvs/include/net/tcp.h Mon Jul 12 12:29:10 1999 +++ linux/include/net/tcp.h Mon Jul 19 23:51:37 1999 @@ -1084,4 +1084,84 @@ } +extern __inline__ u8 vegas_is_enabled(struct tcp_opt *tp) +{ + /* Should we be taking Vegas samples right now? */ + return tp->v_doing_vegas_now; +} + +extern __inline__ void disable_vegas(struct tcp_opt *tp) +{ + /* Stop taking Vegas samples for now. */ + TCP_INSTRUMENT(tp, "disable_vegas: Disabling vegas\n"); + + tp->v_doing_vegas_now = 0; +} + +extern __inline__ void enable_vegas(struct tcp_opt *tp) +{ + /* There are several situations when we must "re-start" Vegas: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding unacknowledged data + * (restarting an idle connection) + * + * In these circumstances we cannot do a Vegas calculation at the end of + * the first RTT, because any calculation we do is using stale info -- + * both the saved cwnd and congestion feedback are stale. + * Instead we must wait until the completion of an RTT during + * which we actually receive ACKs. 
+ */ + + TCP_INSTRUMENT(tp, "enable_vegas: Enabling vegas\n"); + + /* Begin taking Vegas samples next time we send something */ + tp->v_doing_vegas_now = 1; + + tp->v_beg_snd_nxt = tp->snd_nxt; /* set the beginning of the next send window */ + + tp->v_cntRTT = 0; + tp->v_minRTT = 0x7fffffff; /* large sentinel, lowered by the min() in tcp_vegas_rtt_calc */ +} + +extern __inline__ int is_tcp_vegas(struct tcp_opt *tp) +{ + /* Is this TCP connection using Vegas (regardless of whether it is taking + * Vegas measurements at the current time)? + */ + return (int)tp->v_do_vegas; +} + +extern __inline__ void do_tcp_vegas(struct tcp_opt *tp, __u8 should_do_vegas) +{ +#ifndef CONFIG_CPU_IS_SLOW + /* Set up a new TCP connection, depending on whether it should be + * using Vegas or not. + */ + if (should_do_vegas) { + tp->v_do_vegas = 1; + tp->v_baseRTT = 0x7fffffff; + + /* For Vegas debugging printk()'s only: */ + tp->v_isn = tp->snd_nxt; + + enable_vegas(tp); + } + else { + tp->v_do_vegas = 0; + disable_vegas(tp); + } +#else + /* CPUs that don't have time to take fine-grained timestamps for + * each packet probably shouldn't be doing Vegas. 
+ */ + tp->v_do_vegas = 0; + disable_vegas(tp); +#endif + TCP_INSTRUMENT(tp, "do_tcp_vegas: do_tcp_vegas: v_do_vegas=%d, v_doing_vegas_now=%d\n", + tp->v_do_vegas, tp->v_doing_vegas_now); +} + #endif /* _TCP_H */ diff -u -r -x *.[ao] -x *CVS* -x Makefile -x *.flags -x *.depend linux-2.3.10-cvs/net/ipv4/sysctl_net_ipv4.c linux/net/ipv4/sysctl_net_ipv4.c --- linux-2.3.10-cvs/net/ipv4/sysctl_net_ipv4.c Mon Jul 12 12:29:22 1999 +++ linux/net/ipv4/sysctl_net_ipv4.c Mon Jul 12 02:17:27 1999 @@ -61,6 +61,12 @@ extern int sysctl_tcp_syn_taildrop; extern int sysctl_max_syn_backlog; +/* Vegas variables from tcp_input.c */ +extern int sysctl_tcp_vegas_cong_avoid; +extern int sysctl_tcp_vegas_alpha; +extern int sysctl_tcp_vegas_beta; +extern int sysctl_tcp_vegas_gamma; + /* From icmp.c */ extern int sysctl_icmp_destunreach_time; extern int sysctl_icmp_timeexceed_time; @@ -184,6 +190,14 @@ {NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships", &sysctl_igmp_max_memberships, sizeof(int), 0644, NULL, &proc_dointvec}, #endif + {NET_IPV4_TCP_VEGAS, "tcp_vegas_cong_avoid", + &sysctl_tcp_vegas_cong_avoid, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_TCP_VEGAS_ALPHA, "tcp_vegas_alpha", + &sysctl_tcp_vegas_alpha, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_TCP_VEGAS_BETA, "tcp_vegas_beta", + &sysctl_tcp_vegas_beta, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_IPV4_TCP_VEGAS_GAMMA, "tcp_vegas_gamma", + &sysctl_tcp_vegas_gamma, sizeof(int), 0644, NULL, &proc_dointvec}, {0} }; diff -u -r -x *.[ao] -x *CVS* -x Makefile -x *.flags -x *.depend linux-2.3.10-cvs/net/ipv4/tcp_input.c linux/net/ipv4/tcp_input.c --- linux-2.3.10-cvs/net/ipv4/tcp_input.c Mon Jul 12 12:29:23 1999 +++ linux/net/ipv4/tcp_input.c Tue Jul 20 00:05:31 1999 @@ -78,6 +78,15 @@ int sysctl_tcp_window_scaling = 1; int sysctl_tcp_sack = 1; +/* Vegas is off by default. 
*/ +int sysctl_tcp_vegas_cong_avoid = 0; +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. */ +#define V_PARAM_SHIFT 1 +int sysctl_tcp_vegas_alpha = 1<v_baseRTT) { + tp->v_baseRTT = vrtt; + TCP_INSTRUMENT(tp, "tcp_vegas_rtt_calc: new baseRTT=%u\n", tp->v_baseRTT); + } + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + tp->v_minRTT = min(tp->v_minRTT, vrtt); + tp->v_cntRTT++; + + TCP_INSTRUMENT(tp, "tcp_vegas_rtt_calc: now have %u rtt samples; latest is %u\n", tp->v_cntRTT, vrtt); +} + /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -151,10 +188,15 @@ * it up into three procedures. -- erics */ -static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) +static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt, + __u32 usrtt) { long m = mrtt; /* RTT */ + if (vegas_is_enabled(tp)) { + tcp_vegas_rtt_calc(tp, usrtt); + } + /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. @@ -468,6 +510,12 @@ tp->snd_cwnd = (tp->snd_ssthresh); tp->dup_acks = 0; + + /* We are exiting fast recovery so we can use Vegas again. */ + if (is_tcp_vegas(tp)) { + TCP_INSTRUMENT(tp, "clear_fast_retransmit: done doing fast recovery -- enable vegas\n"); + enable_vegas(tp); + } } /* NOTE: This code assumes that tp->dup_acks gets cleared when a @@ -497,6 +545,13 @@ if (tp->high_seq == 0 || after(ack, tp->high_seq)) { tp->dup_acks++; if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) { + + /* We received 3 dup_acks and are about to + * go into fast recovery, so disable vegas. 
*/ + if (is_tcp_vegas(tp)){ + TCP_INSTRUMENT(tp, "doing fast recovery -- disable vegas\n"); + disable_vegas(tp); + } tp->snd_ssthresh = tcp_recalc_ssthresh(tp); tp->snd_cwnd = (tp->snd_ssthresh + 3); tp->high_seq = tp->snd_nxt; @@ -585,7 +640,7 @@ /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) +static __inline__ void tcp_reno_cong_avoid(struct tcp_opt *tp) { if (tp->snd_cwnd <= tp->snd_ssthresh) { /* In "safe" area, increase. */ @@ -602,15 +657,286 @@ } } +/* This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. 
+ * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + */ +static void tcp_vegas_cong_avoid(struct tcp_opt *tp, u32 ack, + u32 usrtt_sample) +{ + /* The key players are v_beg_snd_una and v_beg_snd_nxt. These are so named + * because they represent the approximate values of snd_una and snd_nxt + * at the beginning of the current RTT. More precisely, they represent + * the amount of data sent during the RTT. At the end of the RTT, + * when we receive an ACK for v_beg_snd_nxt, we will calculate that + * (v_beg_snd_nxt - v_beg_snd_una) outstanding bytes of data have been + * ACKed during the course of the RTT, giving an "actual" rate of + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the advance + * of the left edge of our send window, so that the number of bytes we + * send in an RTT is often less than our cwnd will allow. So we keep + * track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, tp->v_beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + TCP_INSTRUMENT(tp, "-------------------------------------------------\n"); + TCP_INSTRUMENT(tp, "got ack=%u, for 'distinguished'=%u; cwnd=%u\n", + ack - tp->v_isn, tp->v_beg_snd_nxt - tp->v_isn, tp->snd_cwnd); + + /* + * Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. 
+ */ + old_wnd = (tp->v_beg_snd_nxt - tp->v_beg_snd_una) / tp->mss_cache; /* in whole MSS-sized segments; integer division truncates */ + old_snd_cwnd = tp->v_beg_snd_cwnd; + TCP_INSTRUMENT(tp, + "old_wnd = (v_beg_snd_nxt=%u) - (v_beg_snd_una=%u) = %u B = %u MSS; old_snd_cwnd = %u\n", + tp->v_beg_snd_nxt - tp->v_isn, + tp->v_beg_snd_una - tp->v_isn, + tp->v_beg_snd_nxt - tp->v_beg_snd_una, + old_wnd, + old_snd_cwnd); + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + tp->v_beg_snd_una = tp->v_beg_snd_nxt; + tp->v_beg_snd_nxt = tp->snd_nxt; + tp->v_beg_snd_cwnd = tp->snd_cwnd; + + TCP_INSTRUMENT(tp, "updated: (v_beg_snd_nxt=%u) - (v_beg_snd_una=%u) = %u B; v_beg_snd_cwnd=%u\n", + tp->v_beg_snd_nxt - tp->v_isn, + tp->v_beg_snd_una - tp->v_isn, + tp->v_beg_snd_nxt - tp->v_beg_snd_una, + tp->v_beg_snd_cwnd); + + /* Take into account the current RTT sample too, to + * decrease the impact of delayed acks. This double counts + * this sample since we count it for the next window as well, + * but that's not too awful, since we're taking the min, + * rather than averaging. + */ + tcp_vegas_rtt_calc(tp, usrtt_sample); + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (tp->v_cntRTT <= 2) { + /* We don't have enough RTT samples to do the Vegas calculation, + * so we'll behave like Reno. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) { + tp->snd_cwnd++; + } + } + else { + u32 rtt, target_cwnd, diff; + + /* We have enough RTT samples, so, using the Vegas algorithm, + * we determine if we should increase or decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas calculations. This + * is the min RTT seen during the last RTT. 
Taking the min + * filters out the effects of delayed ACKs, at the cost of noticing + * congestion a bit later. + */ + rtt = tp->v_minRTT; + TCP_INSTRUMENT(tp, "vegas rtt=%u, from %u samples\n", rtt, tp->v_cntRTT); + + /* Calculate the cwnd we should have, if we weren't going too fast. + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with V_PARAM_SHIFT bits to the + * right of the binary point. + */ + target_cwnd = ((old_wnd * tp->v_baseRTT) << V_PARAM_SHIFT) / rtt; /* NOTE(review): 32-bit arithmetic and a division by rtt; confirm rtt cannot be 0 and the product cannot overflow */ + TCP_INSTRUMENT(tp, "target_cwnd = %u x 2^-%u segments\n", target_cwnd, V_PARAM_SHIFT); + + /* Calculate the difference between the window we had, and + * the window we would like to have. This quantity is the "Diff" from the + * Arizona Vegas papers. + * + * Again, this is a fixed point number with V_PARAM_SHIFT bits to the + * right of the binary point. + */ + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; /* NOTE(review): diff is unsigned; this presumably relies on v_baseRTT <= rtt so the subtraction cannot wrap -- confirm */ + TCP_INSTRUMENT(tp, "diff = %u x 2^-%u segments\n", diff, V_PARAM_SHIFT); + + if (tp->snd_cwnd < tp->snd_ssthresh) { /* slow start; NOTE(review): the per-ACK test further down uses <= rather than < */ + + if (diff > sysctl_tcp_vegas_gamma) { + /* Going too fast. Time to slow down and switch to congestion avoidance. */ + + TCP_INSTRUMENT(tp, "switching to congestion avoidance\n"); + + tp->snd_ssthresh = 2; + + /* Set cwnd to match the actual rate exactly: + * cwnd = (actual rate) * baseRTT + * Then we add 1 because the integer truncation robs us of full link + * utilization. + */ + tp->snd_cwnd = min( tp->snd_cwnd, (target_cwnd >> V_PARAM_SHIFT) + 1 ); + + TCP_INSTRUMENT(tp, "leaving slow start and setting cwnd=%d packets\n", + tp->snd_cwnd); + } + else { + TCP_INSTRUMENT(tp, "keep doing slow start\n"); + } + } + else { /* congestion avoidance */ + u32 next_snd_cwnd; + + /* Figure out where we would like cwnd to be. */ + if (diff > sysctl_tcp_vegas_beta) { + /* The old window was too fast, so we slow down. 
*/ + TCP_INSTRUMENT(tp, "slowing down during congestion avoidance\n"); + next_snd_cwnd = old_snd_cwnd - 1; /* NOTE(review): u32; wraps if old_snd_cwnd is 0 -- confirm v_beg_snd_cwnd is always set before use */ + } + else if (diff < sysctl_tcp_vegas_alpha) { + /* We don't have enough extra packets in the network, so speed up. */ + TCP_INSTRUMENT(tp, "speeding up during congestion avoidance\n"); + next_snd_cwnd = old_snd_cwnd + 1; + } + else { + /* Sending just as fast as we should be. */ + TCP_INSTRUMENT(tp, "staying at the same rate during congestion avoidance\n"); + next_snd_cwnd = old_snd_cwnd; + } + + /* Adjust cwnd upward or downward, toward the desired value. */ + if (next_snd_cwnd > tp->snd_cwnd) { + tp->snd_cwnd++; + } + else if (next_snd_cwnd < tp->snd_cwnd) { + tp->snd_cwnd--; + } + } + + TCP_INSTRUMENT(tp, "adjusting cwnd to %u\n", tp->snd_cwnd); + TCP_INSTRUMENT(tp, "cwnd bw=%d bits/sec, actual=%d bits/sec\n", + tp->snd_cwnd * tp->mss_cache * ((8*1000000) / rtt), + old_wnd * ((8*1000000) / rtt) ); + + } /* end of got-acks-this-RTT code */ + + /* Wipe the slate clean for the next RTT. */ + tp->v_cntRTT = 0; + tp->v_minRTT = 0x7fffffff; + + } /* End of stuff done once per RTT. */ + + /* The following code is executed for every ack we receive, + * except for conditions checked in should_advance_cwnd() + * before the call to tcp_cong_avoid(). Mainly this means that + * we only execute this code if the ack actually acked some + * data. + */ + + /* If we are in slow start, increase our cwnd in response to this ACK. + * (If we are not in slow start then we are in congestion avoidance, + * and adjust our congestion window only once per RTT. See the code above.) + */ + if (tp->snd_cwnd <= tp->snd_ssthresh) { + tp->snd_cwnd++; + TCP_INSTRUMENT(tp, "increasing cwnd during s/s; new cwnd=%u\n", tp->snd_cwnd); + } + + /* If we haven't been able to keep up with cwnd, clamp our cwnd + * down to our actual effective sending window. 
This will usually + * happen if the receiver's window is limiting us, or the sender + * is not providing enough data to fully exploit our cwnd. The + * thing we are trying to avoid is cranking up cwnd absurdly high + * while we are limited by the receiver window or sender sending + * rate, and then sending a huge burst out into the network when + * the receiver window or sender's sending rate opens back up. + */ + if ((int)(tp->snd_cwnd * tp->mss_cache) > (int)(tp->snd_nxt - tp->snd_una) + + (int)(2 * tp->mss_cache)) { + tp->snd_cwnd = (tp->snd_nxt - tp->snd_una)/tp->mss_cache; + + TCP_INSTRUMENT(tp, "clamping cwnd to %u B; advwnd=%u B, in_flight=%u B\n", + tp->snd_cwnd * tp->mss_cache, + tp->snd_wnd, + (tp->snd_nxt - tp->snd_una)); + } + + /* Make sure that we are never so timid as to reduce our cwnd below 2 MSS. + * Going below 2 MSS would risk huge delayed ACKs from our receiver. + */ + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + + TCP_INSTRUMENT(tp, "vegas cwnd=%d, ssthresh=%d\n", + tp->snd_cwnd, tp->snd_ssthresh); +} + + +static __inline__ void tcp_cong_avoid(struct tcp_opt *tp, u32 ack, + u32 usrtt_sample) +{ + if (vegas_is_enabled(tp)) + tcp_vegas_cong_avoid(tp, ack, usrtt_sample); + else + tcp_reno_cong_avoid(tp); +} + /* Remove acknowledged frames from the retransmission queue. */ static int tcp_clean_rtx_queue(struct sock *sk, __u32 ack, - __u32 *seq, __u32 *seq_rtt) + __u32 *seq, __u32 *seq_rtt, __u32 *seq_usrtt) { struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct sk_buff *skb; __u32 now = tcp_time_stamp; + struct timeval usnow; /* only filled in, and only read, when vegas_is_enabled(tp) */ int acked = 0; + /* get microsecond-resolution time */ + if(vegas_is_enabled(tp)) + get_fast_time(&usnow); + /* If we are retransmitting, and this ACK clears up to * the retransmit head, or further, then clear our state. 
*/ @@ -651,6 +977,9 @@ tp->packets_out--; *seq = scb->seq; *seq_rtt = now - scb->when; + if(vegas_is_enabled(tp)) + *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 + + (usnow.tv_usec - skb->stamp.tv_usec); __skb_unlink(skb, skb->list); kfree_skb(skb); } @@ -709,7 +1038,7 @@ * with this code. (Superceeds RFC1323) */ static void tcp_ack_saw_tstamp(struct sock *sk, struct tcp_opt *tp, - u32 seq, u32 ack, int flag) + u32 seq, u32 ack, u32 seq_usrtt, int flag) { __u32 seq_rtt; @@ -725,9 +1054,16 @@ return; seq_rtt = tcp_time_stamp - tp->rcv_tsecr; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, seq_usrtt); if (tp->retransmits) { if (tp->packets_out == 0) { + /* We got all the retransmitted pkts acked, so + * can enable vegas now. */ + if (is_tcp_vegas(tp)) { + + TCP_INSTRUMENT(tp, "tcp_ack_saw_tstamp: enable vgs when done retr. timed out pkts\n"); + enable_vegas(tp); + } tp->retransmits = 0; tp->fackets_out = 0; tp->retrans_out = 0; @@ -772,6 +1108,7 @@ int flag = 0; u32 seq = 0; u32 seq_rtt = 0; + u32 seq_usrtt = 0; /* microsecond resolution rtt */ if(sk->zapped) return(1); /* Dead, can't ack any more so why bother */ @@ -828,21 +1165,28 @@ tcp_ack_probe(sk, ack); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt); + flag |= tcp_clean_rtx_queue(sk, ack, &seq, &seq_rtt, &seq_usrtt); /* We must do this here, before code below clears out important * state contained in tp->fackets_out and tp->retransmits. -DaveM */ if (should_advance_cwnd(tp, flag)) - tcp_cong_avoid(tp); + tcp_cong_avoid(tp, ack, seq_usrtt); /* If we have a timestamp, we always do rtt estimates. */ if (tp->saw_tstamp) { - tcp_ack_saw_tstamp(sk, tp, seq, ack, flag); + tcp_ack_saw_tstamp(sk, tp, seq, ack, seq_usrtt, flag); } else { /* If we were retransmiting don't count rtt estimate. */ if (tp->retransmits) { if (tp->packets_out == 0) { + /* We got all retransmitted pkts acked, so + * can enable vegas now. 
*/ + if (is_tcp_vegas(tp)){ + + TCP_INSTRUMENT(tp, "tcp_ack: enable vgs when done retr. timed out pkts\n"); + enable_vegas(tp); + } tp->retransmits = 0; tp->fackets_out = 0; tp->retrans_out = 0; @@ -859,7 +1203,7 @@ if (flag & FLAG_DATA_ACKED) { if(!(flag & FLAG_RETRANS_DATA_ACKED)) { tp->backoff = 0; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, seq_usrtt); tcp_set_rto(tp); tcp_bound_rto(tp); } @@ -2268,7 +2612,7 @@ /* tp->tcp_header_len and tp->mss_clamp probably changed, synchronize mss. - */ + */ tcp_sync_mss(sk, tp->pmtu_cookie); tp->rcv_mss = tp->mss_cache; diff -u -r -x *.[ao] -x *CVS* -x Makefile -x *.flags -x *.depend linux-2.3.10-cvs/net/ipv4/tcp_ipv4.c linux/net/ipv4/tcp_ipv4.c --- linux-2.3.10-cvs/net/ipv4/tcp_ipv4.c Mon Jul 12 12:29:23 1999 +++ linux/net/ipv4/tcp_ipv4.c Mon Jul 12 01:31:56 1999 @@ -66,6 +66,7 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; +extern int sysctl_tcp_vegas_cong_avoid; extern int sysctl_tcp_syncookies; extern int sysctl_ip_dynaddr; extern __u32 sysctl_wmem_max; @@ -1400,6 +1401,9 @@ newtp->snd_cwnd_cnt = 0; newtp->dup_acks = 0; newtp->delayed_acks = 0; + + do_tcp_vegas(newtp, sysctl_tcp_vegas_cong_avoid); + init_timer(&newtp->retransmit_timer); newtp->retransmit_timer.function = &tcp_retransmit_timer; newtp->retransmit_timer.data = (unsigned long) newsk; diff -u -r -x *.[ao] -x *CVS* -x Makefile -x *.flags -x *.depend linux-2.3.10-cvs/net/ipv4/tcp_output.c linux/net/ipv4/tcp_output.c --- linux-2.3.10-cvs/net/ipv4/tcp_output.c Mon Jul 12 12:29:23 1999 +++ linux/net/ipv4/tcp_output.c Mon Jul 12 02:14:08 1999 @@ -41,6 +41,7 @@ extern int sysctl_tcp_timestamps; extern int sysctl_tcp_window_scaling; extern int sysctl_tcp_sack; +extern int sysctl_tcp_vegas_cong_avoid; /* People can turn this off for buggy TCP's found in printers etc. 
*/ int sysctl_tcp_retrans_collapse = 1; @@ -111,7 +112,30 @@ */ tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + (tp->num_sacks * TCPOLEN_SACK_PERBLOCK)); + } + if (vegas_is_enabled(tp)) { + /* If the connection is idle and we are restarting, then we don't + * want to do any Vegas calculations until we get fresh RTT samples. + * So when we restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of packets, _then_ + * we can make Vegas calculations again. + */ + if (tcp_packets_in_flight(tp) == 0) { + SOCK_INSTRUMENT(sk, "zero packets in flight; reset vegas\n"); + enable_vegas(tp); + } + } + + if (vegas_is_enabled(tp)) { + /* Purely for debugging purposes */ + if (tp->v_beg_snd_nxt == TCP_SKB_CB(skb)->seq) { + if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) { + SOCK_INSTRUMENT(sk, "sending 'special'=%u\n", + tp->v_beg_snd_nxt - tp->v_isn); + } + } } + th = (struct tcphdr *) skb_push(skb, tcp_header_size); skb->h.th = th; skb_set_owner_w(skb, sk); @@ -170,6 +194,9 @@ if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) { /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (is_tcp_vegas(tp)) + get_fast_time(&skb->stamp); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tp->packets_out++; tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); @@ -348,12 +375,22 @@ if (tcp_fragment(sk, skb, mss_now)) break; } - /* Advance the send_head. This one is going out. 
*/ update_send_head(sk); TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (is_tcp_vegas(tp)) + get_fast_time(&skb->stamp); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tp->packets_out++; + if (vegas_is_enabled(tp)) { + /* Purely a debug message */ + TCP_INSTRUMENT(tp, "write_xmit: send 1 more; in_flight=%u, snd_wnd=%u, cwnd=%u, snd_una=%u, skb->seq=%u, skb->end_seq=%u\n", + tcp_packets_in_flight(tp), tp->snd_wnd, + tp->snd_cwnd, tp->snd_una - tp->v_isn, + TCP_SKB_CB(skb)->seq - tp->v_isn, + TCP_SKB_CB(skb)->end_seq - tp->v_isn); + } tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); sent_pkts = 1; } @@ -623,6 +660,9 @@ * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (is_tcp_vegas(tp)) + get_fast_time(&skb->stamp); + if(skb_cloned(skb)) skb = skb_copy(skb, GFP_ATOMIC); else @@ -746,6 +786,9 @@ !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_URG)) { update_send_head(sk); TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (is_tcp_vegas(tp)) + get_fast_time(&skb->stamp); + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tp->packets_out++; tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); @@ -801,6 +844,9 @@ TCP_SKB_CB(skb)->seq = tp->write_seq; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (is_tcp_vegas(tp)) + get_fast_time(&skb->stamp); + tcp_transmit_skb(sk, skb); } @@ -959,6 +1005,8 @@ TCP_SKB_CB(buff)->end_seq = tp->write_seq; tp->snd_nxt = TCP_SKB_CB(buff)->end_seq; + do_tcp_vegas(tp, sysctl_tcp_vegas_cong_avoid); + tp->window_clamp = dst->window; tcp_select_initial_window(sock_rspace(sk)/2,tp->mss_clamp, &tp->rcv_wnd, @@ -987,6 +1035,9 @@ /* Send it off. 
*/ __skb_queue_tail(&sk->write_queue, buff); TCP_SKB_CB(buff)->when = tcp_time_stamp; + if (is_tcp_vegas(tp)) + get_fast_time(&buff->stamp); + tp->packets_out++; tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); tcp_statistics.TcpActiveOpens++; @@ -1100,6 +1151,8 @@ } update_send_head(sk); TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (is_tcp_vegas(tp)) + get_fast_time(&skb->stamp); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tp->packets_out++; tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); @@ -1126,6 +1179,8 @@ TCP_SKB_CB(skb)->seq = tp->snd_nxt - 1; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (is_tcp_vegas(tp)) + get_fast_time(&skb->stamp); tcp_transmit_skb(sk, skb); } } diff -u -r -x *.[ao] -x *CVS* -x Makefile -x *.flags -x *.depend linux-2.3.10-cvs/net/ipv4/tcp_timer.c linux/net/ipv4/tcp_timer.c --- linux-2.3.10-cvs/net/ipv4/tcp_timer.c Mon Jul 12 12:29:23 1999 +++ linux/net/ipv4/tcp_timer.c Mon Jul 12 02:14:19 1999 @@ -472,6 +472,12 @@ tp->snd_ssthresh = tcp_recalc_ssthresh(tp); tp->snd_cwnd_cnt = 0; tp->snd_cwnd = 1; + + /* Turn off Vegas until we have recovered from this loss. */ + if (is_tcp_vegas(tp)) { + SOCK_INSTRUMENT(sk, "suffered a timeout -- disabling vegas\n"); + disable_vegas(tp); + } } tp->retransmits++;