summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2020-09-10 13:15:40 -0700
committerDavid S. Miller <davem@davemloft.net>2020-09-10 13:15:40 -0700
commitd095c46206be74a96dac2daf21506ab06c3ddfb1 (patch)
treef9db7718210545aa9708155babb2b1df39fb0fce
parent3a8c4ad161d48d9674a340f2dea916ae560c3e9d (diff)
parentac8f1710c12bb4c3626280ce03f05459ba8feef6 (diff)
Merge branch 'tcp-add-tos-reflection-feature'
Wei Wang says: ==================== tcp: add tos reflection feature This patch series adds a new tcp feature to reflect TOS value received in SYN, and send it out in SYN-ACK, and eventually set the TOS value of the established socket with this reflected TOS value. This provides a way to set the traffic class/QoS level for all traffic in the same connection to be the same as the incoming SYN. It could be useful for datacenters to provide equivalent QoS according to the incoming request. This feature is guarded by /proc/sys/net/ipv4/tcp_reflect_tos, and is by default turned off. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/linux/tcp.h1
-rw-r--r--include/net/ip.h2
-rw-r--r--include/net/netns/ipv4.h1
-rw-r--r--net/dccp/ipv4.c6
-rw-r--r--net/ipv4/ip_output.c5
-rw-r--r--net/ipv4/syncookies.c6
-rw-r--r--net/ipv4/sysctl_net_ipv4.c9
-rw-r--r--net/ipv4/tcp_input.c1
-rw-r--r--net/ipv4/tcp_ipv4.c11
-rw-r--r--net/ipv6/tcp_ipv6.c10
10 files changed, 42 insertions, 10 deletions
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 56ff2952edaf..2f87377e9af7 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -134,6 +134,7 @@ struct tcp_request_sock {
* FastOpen it's the seq#
* after data-in-SYN.
*/
+ u8 syn_tos;
};
static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
diff --git a/include/net/ip.h b/include/net/ip.h
index b09c48d862cc..0f72bf8c0cbf 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -151,7 +151,7 @@ int igmp_mc_init(void);
int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
__be32 saddr, __be32 daddr,
- struct ip_options_rcu *opt);
+ struct ip_options_rcu *opt, u8 tos);
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
struct net_device *orig_dev);
void ip_list_rcv(struct list_head *head, struct packet_type *pt,
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 9e36738c1fe1..8e4fcac4df72 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -183,6 +183,7 @@ struct netns_ipv4 {
unsigned int sysctl_tcp_fastopen_blackhole_timeout;
atomic_t tfo_active_disable_times;
unsigned long tfo_active_disable_stamp;
+ int sysctl_tcp_reflect_tos;
int sysctl_udp_wmem_min;
int sysctl_udp_rmem_min;
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index d8f3751a512b..bb3d70664dde 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -495,7 +495,8 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
- rcu_dereference(ireq->ireq_opt));
+ rcu_dereference(ireq->ireq_opt),
+ inet_sk(sk)->tos);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -537,7 +538,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
local_bh_disable();
bh_lock_sock(ctl_sk);
err = ip_build_and_send_pkt(skb, ctl_sk,
- rxiph->daddr, rxiph->saddr, NULL);
+ rxiph->daddr, rxiph->saddr, NULL,
+ inet_sk(ctl_sk)->tos);
bh_unlock_sock(ctl_sk);
if (net_xmit_eval(err) == 0) {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b931d0b02e49..5fb536ff51f0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -142,7 +142,8 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
*
*/
int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
- __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
+ __be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
+ u8 tos)
{
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = skb_rtable(skb);
@@ -155,7 +156,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = 5;
- iph->tos = inet->tos;
+ iph->tos = tos;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
iph->saddr = saddr;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index f0794f0232ba..c375c126f436 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -286,11 +286,10 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
struct sock *sk,
struct sk_buff *skb)
{
+ struct tcp_request_sock *treq;
struct request_sock *req;
#ifdef CONFIG_MPTCP
- struct tcp_request_sock *treq;
-
if (sk_is_mptcp(sk))
ops = &mptcp_subflow_request_sock_ops;
#endif
@@ -299,8 +298,9 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
if (!req)
return NULL;
-#if IS_ENABLED(CONFIG_MPTCP)
treq = tcp_rsk(req);
+ treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
+#if IS_ENABLED(CONFIG_MPTCP)
treq->is_mptcp = sk_is_mptcp(sk);
if (treq->is_mptcp) {
int err = mptcp_subflow_init_cookie_req(req, sk, skb);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 54023a46db04..3e5f4f2e705e 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1330,6 +1330,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = &comp_sack_nr_max,
},
{
+ .procname = "tcp_reflect_tos",
+ .data = &init_net.ipv4.sysctl_tcp_reflect_tos,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
.procname = "udp_rmem_min",
.data = &init_net.ipv4.sysctl_udp_rmem_min,
.maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4337841faeff..3658ad84f0c6 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6834,6 +6834,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
+ tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
tcp_openreq_init_rwin(req, sk, dst);
sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index af27cfa9d8d3..ace48b2790ff 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -972,6 +972,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi4 fl4;
int err = -1;
struct sk_buff *skb;
+ u8 tos;
/* First, grab a route. */
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
@@ -979,13 +980,17 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
+ tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
+ tcp_rsk(req)->syn_tos : inet_sk(sk)->tos;
+
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
- rcu_dereference(ireq->ireq_opt));
+ rcu_dereference(ireq->ireq_opt),
+ tos & ~INET_ECN_MASK);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -1530,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
newinet->inet_id = prandom_u32();
+ /* Set ToS of the new socket based upon the value of incoming SYN. */
+ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
+ newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
if (!dst) {
dst = inet_csk_route_child_sock(sk, newsk, req);
if (!dst)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 04efa3ee80ef..862058dce6d0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -510,6 +510,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi6 *fl6 = &fl->u.ip6;
struct sk_buff *skb;
int err = -ENOMEM;
+ u8 tclass;
/* First, grab a route. */
if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
@@ -528,9 +529,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
rcu_read_lock();
opt = ireq->ipv6_opt;
+ tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
+ tcp_rsk(req)->syn_tos : np->tclass;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass,
+ err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
+ tclass & ~INET_ECN_MASK,
sk->sk_priority);
rcu_read_unlock();
err = net_xmit_eval(err);
@@ -1310,6 +1314,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
if (np->repflow)
newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
+ /* Set ToS of the new socket based upon the value of incoming SYN. */
+ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
+ newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
/* Clone native IPv6 options from listening socket (if any)
Yes, keeping reference count would be much more clever,