diff options
author | David S. Miller <davem@davemloft.net> | 2016-05-03 15:51:17 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2016-05-03 15:51:17 -0400 |
commit | e34b1638d02bef8c3278af30ee73077c5babc082 (patch) | |
tree | f8ce3ae55f12a36bc4361d9ee515b3f301051c65 | |
parent | 2b84af94a3932b1dcb716d1898edb18b7325dbea (diff) | |
parent | 10724cc7bb7832b482df049c20fd824d928c5eaa (diff) |
Merge branch 'tipc-next'
Jon Maloy says:
====================
tipc: redesign socket-level flow control
The socket-level flow control in TIPC has long been due for a major
overhaul. This series fixes this.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | net/tipc/core.c | 8 | ||||
-rw-r--r-- | net/tipc/msg.h | 14 | ||||
-rw-r--r-- | net/tipc/node.c | 21 | ||||
-rw-r--r-- | net/tipc/node.h | 6 | ||||
-rw-r--r-- | net/tipc/socket.c | 144 | ||||
-rw-r--r-- | net/tipc/socket.h | 17 |
6 files changed, 145 insertions, 65 deletions
diff --git a/net/tipc/core.c b/net/tipc/core.c index e2bdb07a49a2..fe1b062c4f18 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -112,11 +112,9 @@ static int __init tipc_init(void) pr_info("Activated (version " TIPC_MOD_VER ")\n"); - sysctl_tipc_rmem[0] = TIPC_CONN_OVERLOAD_LIMIT >> 4 << - TIPC_LOW_IMPORTANCE; - sysctl_tipc_rmem[1] = TIPC_CONN_OVERLOAD_LIMIT >> 4 << - TIPC_CRITICAL_IMPORTANCE; - sysctl_tipc_rmem[2] = TIPC_CONN_OVERLOAD_LIMIT; + sysctl_tipc_rmem[0] = RCVBUF_MIN; + sysctl_tipc_rmem[1] = RCVBUF_DEF; + sysctl_tipc_rmem[2] = RCVBUF_MAX; err = tipc_netlink_start(); if (err) diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 58bf51541813..024da8af91f0 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -743,16 +743,26 @@ static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) msg_set_bits(m, 9, 16, 0xffff, n); } -static inline u32 msg_bcast_tag(struct tipc_msg *m) +static inline u32 msg_conn_ack(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } -static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n) +static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, n); } +static inline u32 msg_adv_win(struct tipc_msg *m) +{ + return msg_bits(m, 9, 0, 0xffff); +} + +static inline void msg_set_adv_win(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 9, 0, 0xffff, n); +} + static inline u32 msg_max_pkt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff) * 4; diff --git a/net/tipc/node.c b/net/tipc/node.c index c29915688230..29cc85319327 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1,7 +1,7 @@ /* * net/tipc/node.c: TIPC node management routines * - * Copyright (c) 2000-2006, 2012-2015, Ericsson AB + * Copyright (c) 2000-2006, 2012-2016, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. * @@ -191,6 +191,20 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) tipc_node_put(n); return mtu; } + +u16 tipc_node_get_capabilities(struct net *net, u32 addr) +{ + struct tipc_node *n; + u16 caps; + + n = tipc_node_find(net, addr); + if (unlikely(!n)) + return TIPC_NODE_CAPABILITIES; + caps = n->capabilities; + tipc_node_put(n); + return caps; +} + /* * A trivial power-of-two bitmask technique is used for speed, since this * operation is done for every incoming TIPC packet. The number of hash table @@ -304,8 +318,11 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr); - if (n) + if (n) { + /* Same node may come back with new capabilities */ + n->capabilities = capabilities; goto exit; + } n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { pr_warn("Node creation failed, no memory\n"); diff --git a/net/tipc/node.h b/net/tipc/node.h index f39d9d06e8bb..8264b3d97dc4 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -45,10 +45,11 @@ /* Optional capabilities supported by this code version */ enum { - TIPC_BCAST_SYNCH = (1 << 1) + TIPC_BCAST_SYNCH = (1 << 1), + TIPC_BLOCK_FLOWCTL = (2 << 1) }; -#define TIPC_NODE_CAPABILITIES TIPC_BCAST_SYNCH +#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | TIPC_BLOCK_FLOWCTL) #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); @@ -70,6 +71,7 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel); +u16 tipc_node_get_capabilities(struct net *net, u32 addr); int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3eeb50a27b89..12628890c219 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -96,8 +96,11 @@ struct tipc_sock { uint conn_timeout; atomic_t dupl_rcvcnt; bool link_cong; - uint sent_unacked; - uint rcv_unacked; + u16 snt_unacked; + u16 snd_win; + u16 peer_caps; + u16 rcv_unacked; + u16 rcv_win; struct sockaddr_tipc remote; struct rhash_head node; struct rcu_head rcu; @@ -227,9 +230,29 @@ static struct tipc_sock *tipc_sk(const struct sock *sk) return container_of(sk, struct tipc_sock, sk); } -static int tsk_conn_cong(struct tipc_sock *tsk) +static bool tsk_conn_cong(struct tipc_sock *tsk) { - return tsk->sent_unacked >= TIPC_FLOWCTRL_WIN; + return tsk->snt_unacked >= tsk->snd_win; +} + +/* tsk_blocks(): translate a buffer size in bytes to number of + * advertisable blocks, taking into account the ratio truesize(len)/len + * We can trust that this ratio is always < 4 for len >= FLOWCTL_BLK_SZ + */ +static u16 tsk_adv_blocks(int len) +{ + return len / FLOWCTL_BLK_SZ / 4; +} + +/* tsk_inc(): increment counter for sent or received data + * - If block based flow control is not supported by peer we + * fall back to message based ditto, incrementing the counter + */ +static u16 tsk_inc(struct tipc_sock *tsk, int msglen) +{ + if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL)) + return ((msglen / FLOWCTL_BLK_SZ) + 1); + return 1; } /** @@ -377,9 +400,12 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sk->sk_write_space = tipc_write_space; sk->sk_destruct = tipc_sock_destruct; tsk->conn_timeout = CONN_TIMEOUT_DEFAULT; - tsk->sent_unacked = 0; atomic_set(&tsk->dupl_rcvcnt, 0); + /* Start out with safe limits until we receive an advertised window */ + tsk->snd_win = tsk_adv_blocks(RCVBUF_MIN); + tsk->rcv_win = tsk->snd_win; + if (sock->state == SS_READY) { tsk_set_unreturnable(tsk, true); if (sock->type == SOCK_DGRAM) @@ -775,7 +801,7 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb) struct sock *sk = &tsk->sk; struct tipc_msg *hdr = buf_msg(skb); int mtyp = msg_type(hdr); - int conn_cong; + bool conn_cong; /* Ignore if connection cannot be validated: */ if (!tsk_peer_msg(tsk, hdr)) @@ -789,7 +815,9 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb) return; } else if (mtyp == CONN_ACK) { conn_cong = tsk_conn_cong(tsk); - tsk->sent_unacked -= msg_msgcnt(hdr); + tsk->snt_unacked -= msg_conn_ack(hdr); + if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) + tsk->snd_win = msg_adv_win(hdr); if (conn_cong) sk->sk_write_space(sk); } else if (mtyp != CONN_PROBE_REPLY) { @@ -1020,12 +1048,14 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) u32 dnode; uint mtu, send, sent = 0; struct iov_iter save; + int hlen = MIN_H_SIZE; /* Handle implied connection establishment */ if (unlikely(dest)) { rc = __tipc_sendmsg(sock, m, dsz); + hlen = msg_hdr_sz(mhdr); if (dsz && (dsz == rc)) - tsk->sent_unacked = 1; + tsk->snt_unacked = tsk_inc(tsk, dsz + hlen); return rc; } if (dsz > (uint)INT_MAX) @@ -1054,7 +1084,7 @@ next: if (likely(!tsk_conn_cong(tsk))) { rc = tipc_node_xmit(net, &pktchain, dnode, portid); if (likely(!rc)) { - tsk->sent_unacked++; + tsk->snt_unacked += tsk_inc(tsk, send + hlen); sent += send; if (sent == dsz) return dsz; @@ -1118,6 +1148,13 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, sk_reset_timer(sk, &sk->sk_timer, jiffies + tsk->probing_intv); tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); + tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); + if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) + return; + + /* Fall back to message based flow control */ + tsk->rcv_win = FLOWCTL_MSG_WIN; + tsk->snd_win = FLOWCTL_MSG_WIN; } /** @@ -1214,7 +1251,7 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, return 0; } -static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack) +static void tipc_sk_send_ack(struct tipc_sock *tsk) { struct net *net = sock_net(&tsk->sk); struct sk_buff *skb = NULL; @@ -1230,7 +1267,14 @@ static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack) if (!skb) return; msg = buf_msg(skb); - msg_set_msgcnt(msg, ack); + msg_set_conn_ack(msg, tsk->rcv_unacked); + tsk->rcv_unacked = 0; + + /* Adjust to and advertize the correct window limit */ + if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) { + tsk->rcv_win = tsk_adv_blocks(tsk->sk.sk_rcvbuf); + msg_set_adv_win(msg, tsk->rcv_win); + } tipc_node_xmit_skb(net, skb, dnode, msg_link_selector(msg)); } @@ -1288,7 +1332,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len, long timeo; unsigned int sz; u32 err; - int res; + int res, hlen; /* Catch invalid receive requests */ if (unlikely(!buf_len)) @@ -1313,6 +1357,7 @@ restart: buf = skb_peek(&sk->sk_receive_queue); msg = buf_msg(buf); sz = msg_data_sz(msg); + hlen = msg_hdr_sz(msg); err = msg_errcode(msg); /* Discard an empty non-errored message & try again */ @@ -1335,7 +1380,7 @@ restart: sz = buf_len; m->msg_flags |= MSG_TRUNC; } - res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg), m, sz); + res = skb_copy_datagram_msg(buf, hlen, m, sz); if (res) goto exit; res = sz; @@ -1347,15 +1392,15 @@ restart: res = -ECONNRESET; } - /* Consume received message (optional) */ - if (likely(!(flags & MSG_PEEK))) { - if ((sock->state != SS_READY) && - (++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { - tipc_sk_send_ack(tsk, tsk->rcv_unacked); - tsk->rcv_unacked = 0; - } - tsk_advance_rx_queue(sk); + if (unlikely(flags & MSG_PEEK)) + goto exit; + + if (likely(sock->state != SS_READY)) { + tsk->rcv_unacked += tsk_inc(tsk, hlen + sz); + if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4))) + tipc_sk_send_ack(tsk); } + tsk_advance_rx_queue(sk); exit: release_sock(sk); return res; @@ -1384,7 +1429,7 @@ static int tipc_recv_stream(struct socket *sock, struct msghdr *m, int sz_to_copy, target, needed; int sz_copied = 0; u32 err; - int res = 0; + int res = 0, hlen; /* Catch invalid receive attempts */ if (unlikely(!buf_len)) @@ -1410,6 +1455,7 @@ restart: buf = skb_peek(&sk->sk_receive_queue); msg = buf_msg(buf); sz = msg_data_sz(msg); + hlen = msg_hdr_sz(msg); err = msg_errcode(msg); /* Discard an empty non-errored message & try again */ @@ -1434,8 +1480,7 @@ restart: needed = (buf_len - sz_copied); sz_to_copy = (sz <= needed) ? sz : needed; - res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg) + offset, - m, sz_to_copy); + res = skb_copy_datagram_msg(buf, hlen + offset, m, sz_to_copy); if (res) goto exit; @@ -1457,20 +1502,18 @@ restart: res = -ECONNRESET; } - /* Consume received message (optional) */ - if (likely(!(flags & MSG_PEEK))) { - if (unlikely(++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { - tipc_sk_send_ack(tsk, tsk->rcv_unacked); - tsk->rcv_unacked = 0; - } - tsk_advance_rx_queue(sk); - } + if (unlikely(flags & MSG_PEEK)) + goto exit; + + tsk->rcv_unacked += tsk_inc(tsk, hlen + sz); + if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4))) + tipc_sk_send_ack(tsk); + tsk_advance_rx_queue(sk); /* Loop around if more data is required */ if ((sz_copied < buf_len) && /* didn't get all requested data */ (!skb_queue_empty(&sk->sk_receive_queue) || (sz_copied < target)) && /* and more is ready or required */ - (!(flags & MSG_PEEK)) && /* and aren't just peeking at data */ (!err)) /* and haven't reached a FIN */ goto restart; @@ -1602,30 +1645,33 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) /** * rcvbuf_limit - get proper overload limit of socket receive queue * @sk: socket - * @buf: message + * @skb: message * - * For all connection oriented messages, irrespective of importance, - * the default overload value (i.e. 67MB) is set as limit. + * For connection oriented messages, irrespective of importance, + * default queue limit is 2 MB. * - * For all connectionless messages, by default new queue limits are - * as belows: + * For connectionless messages, queue limits are based on message + * importance as follows: * - * TIPC_LOW_IMPORTANCE (4 MB) - * TIPC_MEDIUM_IMPORTANCE (8 MB) - * TIPC_HIGH_IMPORTANCE (16 MB) - * TIPC_CRITICAL_IMPORTANCE (32 MB) + * TIPC_LOW_IMPORTANCE (2 MB) + * TIPC_MEDIUM_IMPORTANCE (4 MB) + * TIPC_HIGH_IMPORTANCE (8 MB) + * TIPC_CRITICAL_IMPORTANCE (16 MB) * * Returns overload limit according to corresponding message importance */ -static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf) +static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) { - struct tipc_msg *msg = buf_msg(buf); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_msg *hdr = buf_msg(skb); + + if (unlikely(!msg_connected(hdr))) + return sk->sk_rcvbuf << msg_importance(hdr); - if (msg_connected(msg)) - return sysctl_tipc_rmem[2]; + if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL)) + return sk->sk_rcvbuf; - return sk->sk_rcvbuf >> TIPC_CRITICAL_IMPORTANCE << - msg_importance(msg); + return FLOWCTL_MSG_LIM; } /** @@ -1748,7 +1794,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /* Try backlog, compensating for double-counted bytes */ dcnt = &tipc_sk(sk)->dupl_rcvcnt; - if (sk->sk_backlog.len) + if (!sk->sk_backlog.len) atomic_set(dcnt, 0); lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt); if (likely(!sk_add_backlog(sk, skb, lim))) diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 4241f22069dc..06fb5944cf76 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -1,6 +1,6 @@ /* net/tipc/socket.h: Include file for TIPC socket code * - * Copyright (c) 2014-2015, Ericsson AB + * Copyright (c) 2014-2016, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,10 +38,17 @@ #include <net/sock.h> #include <net/genetlink.h> -#define TIPC_CONNACK_INTV 256 -#define TIPC_FLOWCTRL_WIN (TIPC_CONNACK_INTV * 2) -#define TIPC_CONN_OVERLOAD_LIMIT ((TIPC_FLOWCTRL_WIN * 2 + 1) * \ - SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) +/* Compatibility values for deprecated message based flow control */ +#define FLOWCTL_MSG_WIN 512 +#define FLOWCTL_MSG_LIM ((FLOWCTL_MSG_WIN * 2 + 1) * SKB_TRUESIZE(MAX_MSG_SIZE)) + +#define FLOWCTL_BLK_SZ 1024 + +/* Socket receive buffer sizes */ +#define RCVBUF_MIN (FLOWCTL_BLK_SZ * 512) +#define RCVBUF_DEF (FLOWCTL_BLK_SZ * 1024 * 2) +#define RCVBUF_MAX (FLOWCTL_BLK_SZ * 1024 * 16) + int tipc_socket_init(void); void tipc_socket_stop(void); void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq); |