-rw-r--r--  Documentation/networking/ip-sysctl.txt |  17
-rw-r--r--  include/linux/skbuff.h                  |   9
-rw-r--r--  include/linux/tcp.h                     |  11
-rw-r--r--  include/net/tcp.h                       |  21
-rw-r--r--  net/ipv4/Makefile                       |   1
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c              |  14
-rw-r--r--  net/ipv4/tcp.c                          |   1
-rw-r--r--  net/ipv4/tcp_input.c                    | 180
-rw-r--r--  net/ipv4/tcp_minisocks.c                |   3
-rw-r--r--  net/ipv4/tcp_output.c                   |   6
-rw-r--r--  net/ipv4/tcp_recovery.c                 | 109
11 files changed, 286 insertions, 86 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index ebe94f2cab98..85752c81c5ec 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -384,6 +384,14 @@ tcp_mem - vector of 3 INTEGERs: min, pressure, max
Defaults are calculated at boot time from amount of available
memory.
+tcp_min_rtt_wlen - INTEGER
+ The window length of the windowed min filter to track the minimum RTT.
+ A shorter window lets a flow more quickly pick up a new (higher)
+ minimum RTT when it is moved to a longer path (e.g., due to traffic
+ engineering). A longer window makes the filter more resistant to RTT
+ inflations such as transient congestion. The unit is seconds.
+ Default: 300
+
tcp_moderate_rcvbuf - BOOLEAN
If set, TCP performs receive buffer auto-tuning, attempting to
automatically size the buffer (no greater than tcp_rmem[2]) to
@@ -425,6 +433,15 @@ tcp_orphan_retries - INTEGER
you should think about lowering this value, such sockets
may consume significant resources. Cf. tcp_max_orphans.
+tcp_recovery - INTEGER
+ This value is a bitmap to enable various experimental loss recovery
+ features.
+
+ RACK: 0x1 enables the RACK loss detection for fast detection of lost
+ retransmissions and tail drops.
+
+ Default: 0x1
+
tcp_reordering - INTEGER
Initial reordering level of packets in a TCP stream.
TCP stack can then dynamically adjust flow reordering level
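
Both knobs above surface under /proc/sys/net/ipv4 through the ctl_table entries added to net/ipv4/sysctl_net_ipv4.c further down. A rough user-space sketch for setting them (the values are illustrative, not recommendations, and the program needs root):

/* Hedged sketch: set the two new sysctls from user space. */
#include <stdio.h>
#include <stdlib.h>

static void write_sysctl(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                exit(EXIT_FAILURE);
        }
        fprintf(f, "%s\n", val);
        fclose(f);
}

int main(void)
{
        /* 0x1 == TCP_RACK_LOST_RETRANS: keep RACK loss detection enabled */
        write_sysctl("/proc/sys/net/ipv4/tcp_recovery", "1");
        /* Illustrative only: track the min RTT over 60s instead of 300s */
        write_sysctl("/proc/sys/net/ipv4/tcp_min_rtt_wlen", "60");
        return 0;
}
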
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4398411236f1..24f4dfd94c51 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -463,6 +463,15 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1,
return delta_us;
}
+static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
+ const struct skb_mstamp *t0)
+{
+ s32 diff = t1->stamp_jiffies - t0->stamp_jiffies;
+
+ if (!diff)
+ diff = t1->stamp_us - t0->stamp_us;
+ return diff > 0;
+}
/**
* struct sk_buff - socket buffer
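
The new helper compares the coarse jiffies stamps first and falls back to the microsecond field only on a tie; doing the subtraction in signed 32-bit keeps the answer correct across counter wraparound. A self-contained sketch of the same technique, using a stand-in struct rather than the kernel's skb_mstamp:

/* Standalone sketch of the wrap-safe "after" comparison used by
 * skb_mstamp_after(); the struct is a stand-in, not the kernel's.
 */
#include <stdint.h>
#include <stdio.h>

struct mstamp {
        uint32_t stamp_us;      /* microsecond part */
        uint32_t stamp_jiffies; /* coarse jiffies part */
};

static int mstamp_after(const struct mstamp *t1, const struct mstamp *t0)
{
        int32_t diff = (int32_t)(t1->stamp_jiffies - t0->stamp_jiffies);

        if (!diff)
                diff = (int32_t)(t1->stamp_us - t0->stamp_us);
        return diff > 0;
}

int main(void)
{
        /* t1 sampled just after the jiffies counter wrapped past 2^32 */
        struct mstamp t0 = { .stamp_jiffies = 0xfffffffe, .stamp_us = 10 };
        struct mstamp t1 = { .stamp_jiffies = 0x00000002, .stamp_us = 10 };

        /* Signed subtraction yields a small positive delta rather than
         * a huge negative one, so t1 correctly compares as later.
         */
        printf("t1 after t0: %d\n", mstamp_after(&t1, &t0)); /* prints 1 */
        return 0;
}
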
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 86a7edaa6797..5dce9705fe84 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -194,6 +194,12 @@ struct tcp_sock {
u32 window_clamp; /* Maximal window to advertise */
u32 rcv_ssthresh; /* Current window clamp */
+ /* Information about the most recently (s)acked skb */
+ struct tcp_rack {
+ struct skb_mstamp mstamp; /* (Re)sent time of the skb */
+ u8 advanced; /* mstamp advanced since last lost marking */
+ u8 reord; /* reordering detected */
+ } rack;
u16 advmss; /* Advertised MSS */
u8 unused;
u8 nonagle : 4,/* Disable Nagle algorithm? */
@@ -217,6 +223,9 @@ struct tcp_sock {
u32 mdev_max_us; /* maximal mdev for the last rtt period */
u32 rttvar_us; /* smoothed mdev_max */
u32 rtt_seq; /* sequence number to update rttvar */
+ struct rtt_meas {
+ u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */
+ } rtt_min[3];
u32 packets_out; /* Packets which are "in flight" */
u32 retrans_out; /* Retransmitted packets out */
@@ -280,8 +289,6 @@ struct tcp_sock {
int lost_cnt_hint;
u32 retransmit_high; /* L-bits may be on up to this seqno */
- u32 lost_retrans_low; /* Sent seq after any rxmit (lowest) */
-
u32 prior_ssthresh; /* ssthresh saved at recovery start */
u32 high_seq; /* snd_nxt at onset of congestion */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index eed94fc355c1..11e320412216 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -279,6 +279,7 @@ extern int sysctl_tcp_limit_output_bytes;
extern int sysctl_tcp_challenge_ack_limit;
extern unsigned int sysctl_tcp_notsent_lowat;
extern int sysctl_tcp_min_tso_segs;
+extern int sysctl_tcp_min_rtt_wlen;
extern int sysctl_tcp_autocorking;
extern int sysctl_tcp_invalid_ratelimit;
extern int sysctl_tcp_pacing_ss_ratio;
@@ -566,6 +567,7 @@ void tcp_resume_early_retransmit(struct sock *sk);
void tcp_rearm_rto(struct sock *sk);
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
void tcp_reset(struct sock *sk);
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
/* tcp_timer.c */
void tcp_init_xmit_timers(struct sock *);
@@ -671,6 +673,12 @@ static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
return dst_metric_locked(dst, RTAX_CC_ALGO);
}
+/* Minimum RTT in usec. ~0 means not available. */
+static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
+{
+ return tp->rtt_min[0].rtt;
+}
+
/* Compute the actual receive window we are currently advertising.
* Rcv_nxt can be after the window if our peer push more data
* than the offered window.
@@ -1743,6 +1751,19 @@ int tcpv4_offload_init(void);
void tcp_v4_init(void);
void tcp_init(void);
+/* tcp_recovery.c */
+
+/* Flags to enable various loss recovery features. See below */
+extern int sysctl_tcp_recovery;
+
+/* Use TCP RACK to detect (some) tail and retransmit losses */
+#define TCP_RACK_LOST_RETRANS 0x1
+
+extern int tcp_rack_mark_lost(struct sock *sk);
+
+extern void tcp_rack_advance(struct tcp_sock *tp,
+ const struct skb_mstamp *xmit_time, u8 sacked);
+
/*
* Save and compile IPv4 options, return a pointer to it
*/
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 89aacb630a53..c29809f765dc 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,6 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
+ tcp_recovery.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 894da3a70aff..25300c5e283b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -496,6 +496,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_recovery",
+ .data = &sysctl_tcp_recovery,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "tcp_reordering",
.data = &sysctl_tcp_reordering,
.maxlen = sizeof(int),
@@ -577,6 +584,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_min_rtt_wlen",
+ .data = &sysctl_tcp_min_rtt_wlen,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "tcp_low_latency",
.data = &sysctl_tcp_low_latency,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ac1bdbb50352..0cfa7c0c1e80 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -388,6 +388,7 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ tp->rtt_min[0].rtt = ~0U;
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 944eaca69115..fdd88c3803a6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -95,6 +95,7 @@ int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
+int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
int sysctl_tcp_thin_dupack __read_mostly;
@@ -880,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
if (metric > 0)
tcp_disable_early_retrans(tp);
+ tp->rack.reord = 1;
}
/* This must be called before lost_out is incremented */
@@ -905,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
}
}
-static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
- struct sk_buff *skb)
+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
tcp_verify_retransmit_hint(tp, skb);
@@ -1047,70 +1048,6 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
return !before(start_seq, end_seq - tp->max_window);
}
-/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
- * Event "B". Later note: FACK people cheated me again 8), we have to account
- * for reordering! Ugly, but should help.
- *
- * Search retransmitted skbs from write_queue that were sent when snd_nxt was
- * less than what is now known to be received by the other end (derived from
- * highest SACK block). Also calculate the lowest snd_nxt among the remaining
- * retransmitted skbs to avoid some costly processing per ACKs.
- */
-static void tcp_mark_lost_retrans(struct sock *sk, int *flag)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- int cnt = 0;
- u32 new_low_seq = tp->snd_nxt;
- u32 received_upto = tcp_highest_sack_seq(tp);
-
- if (!tcp_is_fack(tp) || !tp->retrans_out ||
- !after(received_upto, tp->lost_retrans_low) ||
- icsk->icsk_ca_state != TCP_CA_Recovery)
- return;
-
- tcp_for_write_queue(skb, sk) {
- u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
-
- if (skb == tcp_send_head(sk))
- break;
- if (cnt == tp->retrans_out)
- break;
- if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
- continue;
-
- if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
- continue;
-
- /* TODO: We would like to get rid of tcp_is_fack(tp) only
- * constraint here (see above) but figuring out that at
- * least tp->reordering SACK blocks reside between ack_seq
- * and received_upto is not easy task to do cheaply with
- * the available datastructures.
- *
- * Whether FACK should check here for tp->reordering segs
- * in-between one could argue for either way (it would be
- * rather simple to implement as we could count fack_count
- * during the walk and do tp->fackets_out - fack_count).
- */
- if (after(received_upto, ack_seq)) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- *flag |= FLAG_LOST_RETRANS;
- tcp_skb_mark_lost_uncond_verify(tp, skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
- } else {
- if (before(ack_seq, new_low_seq))
- new_low_seq = ack_seq;
- cnt += tcp_skb_pcount(skb);
- }
- }
-
- if (tp->retrans_out)
- tp->lost_retrans_low = new_low_seq;
-}
-
static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp, int num_sacks,
u32 prior_snd_una)
@@ -1236,6 +1173,8 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
if (!(sacked & TCPCB_SACKED_ACKED)) {
+ tcp_rack_advance(tp, xmit_time, sacked);
+
if (sacked & TCPCB_SACKED_RETRANS) {
/* If the segment is not tagged as lost,
* we do not clear RETRANS, believing
@@ -1837,7 +1776,6 @@ advance_sp:
((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker))
tcp_update_reordering(sk, tp->fackets_out - state->reord, 0);
- tcp_mark_lost_retrans(sk, &state->flag);
tcp_verify_left_out(tp);
out:
@@ -2314,14 +2252,29 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
tp->snd_cwnd_stamp = tcp_time_stamp;
}
+static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
+{
+ return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+ before(tp->rx_opt.rcv_tsecr, when);
+}
+
+/* skb was spuriously retransmitted if the returned timestamp echo
+ * reply is prior to the skb transmission time
+ */
+static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
+ const struct sk_buff *skb)
+{
+ return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
+ tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
+}
+
/* Nothing was retransmitted or returned timestamp is less
* than timestamp of the first retransmission.
*/
static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
return !tp->retrans_stamp ||
- (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
- before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
+ tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
}
/* Undo procedures. */
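
The idea behind tcp_skb_spurious_retrans() can be checked in isolation: if the incoming ACK echoes a timestamp older than the skb's retransmission time, the ACK must have been triggered by the original copy. A minimal sketch with invented values, reusing the kernel's wrap-safe before() comparison:

/* Sketch of the timestamp-echo test behind tcp_skb_spurious_retrans();
 * values are invented.
 */
#include <stdint.h>
#include <stdio.h>

static int before(uint32_t seq1, uint32_t seq2)
{
        return (int32_t)(seq1 - seq2) < 0;
}

int main(void)
{
        uint32_t skb_tstamp = 5000; /* when the copy was retransmitted */
        uint32_t rcv_tsecr = 4980;  /* timestamp echoed by the ACK */

        /* The echo predates the retransmission, so the ACK answers the
         * original transmission: the retransmit was spurious, and in
         * tcp_clean_rtx_queue() such an skb must not advance RACK.
         */
        if (before(rcv_tsecr, skb_tstamp))
                printf("spurious retransmission detected\n");
        return 0;
}
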
@@ -2853,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
}
+ /* Use RACK to detect loss */
+ if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
+ tcp_rack_mark_lost(sk))
+ flag |= FLAG_LOST_RETRANS;
+
/* E. Process state. */
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
@@ -2915,8 +2873,69 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_xmit_retransmit_queue(sk);
}
+/* Kathleen Nichols' algorithm for tracking the minimum value of
+ * a data stream over some fixed time interval. (E.g., the minimum
+ * RTT over the past five minutes.) It uses constant space and constant
+ * time per update yet almost always delivers the same minimum as an
+ * implementation that has to keep all the data in the window.
+ *
+ * The algorithm keeps track of the best, 2nd best & 3rd best min
+ * values, maintaining an invariant that the measurement time of the
+ * n'th best >= n-1'th best. It also makes sure that the three values
+ * are widely separated in the time window since that bounds the worst-
+ * case error when the data is monotonically increasing over the window.
+ *
+ * Upon getting a new min, we can forget everything earlier because it
+ * has no value - the new min is <= everything else in the window by
+ * definition and it's the most recent. So we restart fresh on every new min
+ * and overwrite the 2nd & 3rd choices. The same property holds for 2nd & 3rd
+ * best.
+ */
+static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
+{
+ const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
+ struct rtt_meas *m = tcp_sk(sk)->rtt_min;
+ struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
+ u32 elapsed;
+
+ /* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
+ if (unlikely(rttm.rtt <= m[0].rtt))
+ m[0] = m[1] = m[2] = rttm;
+ else if (rttm.rtt <= m[1].rtt)
+ m[1] = m[2] = rttm;
+ else if (rttm.rtt <= m[2].rtt)
+ m[2] = rttm;
+
+ elapsed = now - m[0].ts;
+ if (unlikely(elapsed > wlen)) {
+ /* Passed entire window without a new min so make 2nd choice
+ * the new min & the 3rd choice the new 2nd, and so on.
+ */
+ m[0] = m[1];
+ m[1] = m[2];
+ m[2] = rttm;
+ if (now - m[0].ts > wlen) {
+ m[0] = m[1];
+ m[1] = rttm;
+ if (now - m[0].ts > wlen)
+ m[0] = rttm;
+ }
+ } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
+ /* Passed a quarter of the window without a new min so
+ * take 2nd choice from the 2nd quarter of the window.
+ */
+ m[2] = m[1] = rttm;
+ } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
+ /* Passed half the window without a new min so take the 3rd
+ * choice from the last half of the window.
+ */
+ m[2] = rttm;
+ }
+}
+
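
The filter is easy to exercise outside the kernel. Below is a hedged user-space transcription of tcp_update_rtt_min(); wlen, the time units, and the sample trace are stand-ins chosen only to show a minimum aging out of the window:

/* User-space sketch of the windowed min filter above. The update logic
 * mirrors tcp_update_rtt_min(); everything else is invented.
 */
#include <stdint.h>
#include <stdio.h>

struct rtt_meas { uint32_t rtt, ts; };

static struct rtt_meas m[3] = { { ~0U, 0 }, { ~0U, 0 }, { ~0U, 0 } };

static void update_rtt_min(uint32_t rtt, uint32_t now, uint32_t wlen)
{
        struct rtt_meas rttm = { rtt ? rtt : 1, now };
        uint32_t elapsed;

        if (rttm.rtt <= m[0].rtt)
                m[0] = m[1] = m[2] = rttm;
        else if (rttm.rtt <= m[1].rtt)
                m[1] = m[2] = rttm;
        else if (rttm.rtt <= m[2].rtt)
                m[2] = rttm;

        elapsed = now - m[0].ts;
        if (elapsed > wlen) {
                /* Min expired: promote the 2nd and 3rd best */
                m[0] = m[1];
                m[1] = m[2];
                m[2] = rttm;
                if (now - m[0].ts > wlen) {
                        m[0] = m[1];
                        m[1] = rttm;
                        if (now - m[0].ts > wlen)
                                m[0] = rttm;
                }
        } else if (m[1].ts == m[0].ts && elapsed > wlen / 4) {
                m[2] = m[1] = rttm;
        } else if (m[2].ts == m[1].ts && elapsed > wlen / 2) {
                m[2] = rttm;
        }
}

int main(void)
{
        /* RTT dips to 80 at t=10, then keeps inflating */
        uint32_t wlen = 100, t;

        for (t = 0; t <= 200; t += 10)
                update_rtt_min(t == 10 ? 80 : 200 + t, t, wlen);
        /* The t=10 minimum has aged out of the 100-unit window */
        printf("min rtt now %u\n", (unsigned)m[0].rtt);
        return 0;
}
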
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
- long seq_rtt_us, long sack_rtt_us)
+ long seq_rtt_us, long sack_rtt_us,
+ long ca_rtt_us)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -2925,9 +2944,6 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* Karn's algorithm forbids taking RTT if some retransmitted data
* is acked (RFC6298).
*/
- if (flag & FLAG_RETRANS_DATA_ACKED)
- seq_rtt_us = -1L;
-
if (seq_rtt_us < 0)
seq_rtt_us = sack_rtt_us;
@@ -2939,11 +2955,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
*/
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
flag & FLAG_ACKED)
- seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
-
+ seq_rtt_us = ca_rtt_us = jiffies_to_usecs(tcp_time_stamp -
+ tp->rx_opt.rcv_tsecr);
if (seq_rtt_us < 0)
return false;
+ /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is
+ * always taken together with ACK, SACK, or TS-opts. Any negative
+ * values will be skipped with the seq_rtt_us < 0 check above.
+ */
+ tcp_update_rtt_min(sk, ca_rtt_us);
tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk);
@@ -2964,7 +2985,7 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req)
rtt_us = skb_mstamp_us_delta(&now, &tcp_rsk(req)->snt_synack);
}
- tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L);
+ tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us);
}
@@ -3131,6 +3152,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (sacked & TCPCB_SACKED_ACKED)
tp->sacked_out -= acked_pcount;
+ else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
+ tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3169,7 +3192,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
flag |= FLAG_SACK_RENEGING;
skb_mstamp_get(&now);
- if (likely(first_ackt.v64)) {
+ if (likely(first_ackt.v64) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
ca_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
}
@@ -3178,7 +3201,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
ca_rtt_us = skb_mstamp_us_delta(&now, &sack->last_sackt);
}
- rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
+ rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
+ ca_rtt_us);
if (flag & FLAG_ACKED) {
tcp_rearm_rto(sk);
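
The resulting sampling cascade (sequence-based RTT first, then SACK-based, then the timestamp echo) can be condensed into a few lines; all numbers here are invented:

/* Sketch of the sampling cascade in tcp_ack_update_rtt(): Karn's rule
 * rejected the sequence-based sample, SACK gave none, so the echoed
 * timestamp supplies the RTT.
 */
#include <stdio.h>

int main(void)
{
        long seq_rtt_us = -1;      /* none: retransmitted data was acked */
        long sack_rtt_us = -1;     /* none: no newly SACKed range */
        long tsecr_rtt_us = 42000; /* derived from rcv_tsecr */
        long ca_rtt_us = -1;

        if (seq_rtt_us < 0)
                seq_rtt_us = sack_rtt_us;
        if (seq_rtt_us < 0)
                seq_rtt_us = ca_rtt_us = tsecr_rtt_us;
        if (seq_rtt_us < 0) {
                puts("no usable RTT sample on this ACK");
                return 0;
        }
        /* seq_rtt_us feeds srtt/RTO; ca_rtt_us feeds the min-RTT filter */
        printf("seq_rtt=%ldus ca_rtt=%ldus\n", seq_rtt_us, ca_rtt_us);
        return 0;
}
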
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 41828bdc5d32..1fd5d413a664 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -470,6 +470,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->srtt_us = 0;
newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
+ newtp->rtt_min[0].rtt = ~0U;
newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
@@ -547,6 +548,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
tcp_ecn_openreq_child(newtp, req);
newtp->fastopen_rsk = NULL;
newtp->syn_data_acked = 0;
+ newtp->rack.mstamp.v64 = 0;
+ newtp->rack.advanced = 0;
newtp->saved_syn = req->saved_syn;
req->saved_syn = NULL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 19adedb8c5cc..f6f7f9b4901b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2655,8 +2655,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
net_dbg_ratelimited("retrans_out leaked\n");
}
#endif
- if (!tp->retrans_out)
- tp->lost_retrans_low = tp->snd_nxt;
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb);
@@ -2664,10 +2662,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_skb_timestamp(skb);
- /* snd_nxt is stored to detect loss of retransmitted segment,
- * see tcp_input.c tcp_sacktag_write_queue().
- */
- TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
} else if (err != -EBUSY) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
new file mode 100644
index 000000000000..5353085fd0b2
--- /dev/null
+++ b/net/ipv4/tcp_recovery.c
@@ -0,0 +1,109 @@
+#include <linux/tcp.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
+
+/* Marks a packet lost if some packet sent later has been (s)acked.
+ * The underlying idea is similar to the traditional dupthresh and FACK
+ * but they look at different metrics:
+ *
+ * dupthresh: 3 OOO packets delivered (packet count)
+ * FACK: sequence delta to highest sacked sequence (sequence space)
+ * RACK: sent time delta to the latest delivered packet (time domain)
+ *
+ * The advantage of RACK is that it applies to both original and retransmitted
+ * packets and therefore is robust against tail losses. Another advantage
+ * is being more resilient to reordering by simply allowing some
+ * "settling delay", instead of tweaking the dupthresh.
+ *
+ * The current version is only used after recovery starts but can be
+ * easily extended to detect the first loss.
+ */
+int tcp_rack_mark_lost(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ u32 reo_wnd, prior_retrans = tp->retrans_out;
+
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
+ return 0;
+
+ /* Reset the advanced flag to avoid unnecessary queue scanning */
+ tp->rack.advanced = 0;
+
+ /* To be more resilient to reordering, allow a min_rtt/4 settling delay
+ * (lower-bounded to 1000us). We use min_rtt instead of the smoothed
+ * RTT because reordering is often a path property and less related
+ * to queuing or delayed ACKs.
+ *
+ * TODO: measure and adapt to the observed reordering delay, and
+ * use a timer to retransmit like the delayed early retransmit.
+ */
+ reo_wnd = 1000;
+ if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
+ reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
+
+ tcp_for_write_queue(skb, sk) {
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+
+ if (skb == tcp_send_head(sk))
+ break;
+
+ /* Skip ones already (s)acked */
+ if (!after(scb->end_seq, tp->snd_una) ||
+ scb->sacked & TCPCB_SACKED_ACKED)
+ continue;
+
+ if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
+
+ if (skb_mstamp_us_delta(&tp->rack.mstamp,
+ &skb->skb_mstamp) <= reo_wnd)
+ continue;
+
+ /* skb is lost if packet sent later is sacked */
+ tcp_skb_mark_lost_uncond_verify(tp, skb);
+ if (scb->sacked & TCPCB_SACKED_RETRANS) {
+ scb->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPLOSTRETRANSMIT);
+ }
+ } else if (!(scb->sacked & TCPCB_RETRANS)) {
+ /* Original data are sent sequentially, so stop early:
+ * the rest were all sent after tp->rack.mstamp
+ */
+ break;
+ }
+ }
+ return prior_retrans - tp->retrans_out;
+}
+
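
The loop's decision rule is worth exercising in isolation: a packet is declared lost only when something sent after it has been delivered and the send-time gap exceeds the reordering window. A user-space sketch with mock skbs follows; reo_wnd here is illustrative, while the kernel lower-bounds it at 1000us:

/* Standalone sketch of RACK's time-based rule in tcp_rack_mark_lost():
 * mock skbs carry only a send time and a sacked flag.
 */
#include <stdio.h>

struct mock_skb {
        unsigned int sent_us; /* transmission time */
        int sacked;           /* already delivered? */
};

int main(void)
{
        struct mock_skb q[] = {
                { 1000, 0 }, /* sent long before the delivered packet */
                { 1900, 0 }, /* sent just before the delivered packet */
                { 2000, 1 }, /* the most recently delivered packet */
        };
        unsigned int rack_sent = 2000; /* stands in for tp->rack.mstamp */
        unsigned int reo_wnd = 250;    /* illustrative settling delay */
        unsigned int i;

        for (i = 0; i < sizeof(q) / sizeof(q[0]); i++) {
                if (q[i].sacked)
                        continue; /* skip ones already (s)acked */
                if (rack_sent > q[i].sent_us &&
                    rack_sent - q[i].sent_us > reo_wnd)
                        printf("skb sent at %u: lost\n", q[i].sent_us);
                else
                        printf("skb sent at %u: wait, inside reo_wnd\n",
                               q[i].sent_us);
        }
        return 0;
}
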
+/* Record the most recently (re)sent time among the (s)acked packets */
+void tcp_rack_advance(struct tcp_sock *tp,
+ const struct skb_mstamp *xmit_time, u8 sacked)
+{
+ if (tp->rack.mstamp.v64 &&
+ !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
+ return;
+
+ if (sacked & TCPCB_RETRANS) {
+ struct skb_mstamp now;
+
+ /* If the sacked packet was retransmitted, it's ambiguous
+ * whether the retransmission or the original (or the prior
+ * retransmission) was sacked.
+ *
+ * If the original is lost, there is no ambiguity. Otherwise
+ * we assume the original can be delayed up to aRTT + min_rtt.
+ * The aRTT term is bounded by the fast recovery or timeout,
+ * so it's at least one RTT (i.e., retransmission is at least
+ * an RTT later).
+ */
+ skb_mstamp_get(&now);
+ if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
+ return;
+ }
+
+ tp->rack.mstamp = *xmit_time;
+ tp->rack.advanced = 1;
+}
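
To make the ambiguity guard concrete, here is a sketch with invented numbers: the (s)ack arrives well within one min_rtt of the retransmission, so it plausibly answers the original copy and must not advance the RACK clock:

/* Sketch of the retransmit-ambiguity guard in tcp_rack_advance();
 * all values are invented. min_rtt here is 40ms.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t min_rtt_us = 40000;
        uint32_t retrans_time_us = 100000; /* when the copy was resent */
        uint32_t ack_time_us = 115000;     /* when its (s)ack arrived */

        /* Only 15ms has elapsed since the retransmission, less than one
         * min_rtt, so the (s)ack may answer the original copy. Using
         * the retransmit time would wrongly advance the RACK clock and
         * could mark still-in-flight packets lost.
         */
        if (ack_time_us - retrans_time_us < min_rtt_us)
                printf("ambiguous: do not advance tp->rack.mstamp\n");
        else
                printf("advance tp->rack.mstamp\n");
        return 0;
}
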