author      Florian Westphal <fw@strlen.de>       2020-11-16 10:48:12 +0100
committer   Jakub Kicinski <kuba@kernel.org>      2020-11-16 10:46:07 -0800
commit      8edf08649eede6e5a3e39a3d38c63f30039a0c1e
tree        278ab97b84769bc057a3f5449aaec7d19019c665
parent      813e0a683d4cacb668622bc9a1693cb82b5f8ff8
mptcp: rework poll+nospace handling
MPTCP maintains a status bit, MPTCP_SEND_SPACE, that is set when at
least one subflow and the mptcp socket itself are writeable.
mptcp_poll returns EPOLLOUT if the bit is set.
mptcp_sendmsg makes sure MPTCP_SEND_SPACE gets cleared when the last
write has used up all subflows or the mptcp socket wmem.
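
For reference, this is what that flag means to userspace: poll() on an MPTCP
socket reports POLLOUT only while the kernel considers the socket writeable.
The sketch below is not part of the patch; the address, port and helper name
are made up for illustration, and the IPPROTO_MPTCP fallback define is only
needed with older userspace headers.

/* Minimal userspace sketch: wait until an MPTCP socket is writeable. */
#include <poll.h>
#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif

static int wait_writeable(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
	int ret = poll(&pfd, 1, timeout_ms);

	if (ret <= 0)
		return ret;	/* 0: timeout, <0: error */
	return (pfd.revents & POLLOUT) ? 1 : 0;
}

int main(void)
{
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_port = htons(8080) };
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);
	if (fd < 0 || connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("mptcp connect");
		return 1;
	}
	/* POLLOUT is reported once send buffer space is available */
	printf("writeable: %d\n", wait_writeable(fd, 1000));
	return 0;
}

With this patch, that POLLOUT report is driven by MPTCP_NOSPACE being clear
rather than MPTCP_SEND_SPACE being set; the userspace-visible behaviour is
meant to stay the same.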
This reworks nospace handling as follows:
MPTCP_SEND_SPACE is replaced with MPTCP_NOSPACE, i.e. inverted meaning.
This bit is set when the mptcp socket is not writeable.
The mptcp-level ack path will then schedule the mptcp worker to allow
it to free already-acked data (and reduce wmem usage).
This will then wake userspace processes that wait for a POLLOUT event.
sendmsg will set MPTCP_NOSPACE only when it has to wait for more
wmem (blocking I/O case).
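
The difference between the two cases is visible from userspace: a blocking
send()/sendmsg() sleeps in the kernel until enough wmem has been acked and
freed, while a non-blocking send returns -1 with EAGAIN and the application
has to wait for POLLOUT itself. A rough sketch of the non-blocking pattern
follows; the helper name is made up and error handling is minimal.

#include <errno.h>
#include <poll.h>
#include <sys/types.h>
#include <sys/socket.h>

/* Hypothetical helper: write the whole buffer on a non-blocking socket. */
static ssize_t send_all_nonblock(int fd, const char *buf, size_t len)
{
	size_t off = 0;

	while (off < len) {
		ssize_t n = send(fd, buf + off, len - off, MSG_DONTWAIT);

		if (n > 0) {
			off += n;
			continue;
		}
		if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
			/* no wmem right now: wait for POLLOUT */
			struct pollfd pfd = { .fd = fd, .events = POLLOUT };

			if (poll(&pfd, 1, -1) < 0)
				return -1;
			continue;
		}
		return -1;	/* real error */
	}
	return off;
}

The blocking case is the one where sendmsg sets MPTCP_NOSPACE and sleeps in
sk_stream_wait_memory(); a non-blocking caller relies on the poll path
described below instead.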
The poll path will set MPTCP_NOSPACE if the mptcp socket is not
writeable.
Normal tcp-level notification (SOCK_NOSPACE) is only enabled if the
subflow socket has no wmem available.
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--  net/mptcp/protocol.c | 92
-rw-r--r--  net/mptcp/protocol.h |  2
-rw-r--r--  net/mptcp/subflow.c  | 11
3 files changed, 54 insertions(+), 51 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 821daa922ca3..7fcd26011a3d 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -724,7 +724,7 @@ void mptcp_data_acked(struct sock *sk)
 {
 	mptcp_reset_timer(sk);
 
-	if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) ||
+	if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) ||
 	     mptcp_send_head(sk) ||
 	     (inet_sk_state_load(sk) != TCP_ESTABLISHED)))
 		mptcp_schedule_work(sk);
@@ -835,20 +835,6 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
 	put_page(dfrag->page);
 }
 
-static bool mptcp_is_writeable(struct mptcp_sock *msk)
-{
-	struct mptcp_subflow_context *subflow;
-
-	if (!sk_stream_is_writeable((struct sock *)msk))
-		return false;
-
-	mptcp_for_each_subflow(msk, subflow) {
-		if (sk_stream_is_writeable(subflow->tcp_sock))
-			return true;
-	}
-	return false;
-}
-
 static void mptcp_clean_una(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -901,13 +887,8 @@ static void mptcp_clean_una_wakeup(struct sock *sk)
 	mptcp_clean_una(sk);
 
 	/* Only wake up writers if a subflow is ready */
-	if (mptcp_is_writeable(msk)) {
-		set_bit(MPTCP_SEND_SPACE, &msk->flags);
-		smp_mb__after_atomic();
-
-		/* set SEND_SPACE before sk_stream_write_space clears
-		 * NOSPACE
-		 */
+	if (sk_stream_is_writeable(sk)) {
+		clear_bit(MPTCP_NOSPACE, &msk->flags);
 		sk_stream_write_space(sk);
 	}
 }
@@ -1041,17 +1022,25 @@ static void mptcp_nospace(struct mptcp_sock *msk)
 {
 	struct mptcp_subflow_context *subflow;
 
-	clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+	set_bit(MPTCP_NOSPACE, &msk->flags);
 	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */
 
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+		bool ssk_writeable = sk_stream_is_writeable(ssk);
 		struct socket *sock = READ_ONCE(ssk->sk_socket);
 
+		if (ssk_writeable || !sock)
+			continue;
+
 		/* enables ssk->write_space() callbacks */
-		if (sock)
-			set_bit(SOCK_NOSPACE, &sock->flags);
+		set_bit(SOCK_NOSPACE, &sock->flags);
 	}
+
+	/* mptcp_data_acked() could run just before we set the NOSPACE bit,
+	 * so explicitly check for snd_una value
+	 */
+	mptcp_clean_una((struct sock *)msk);
 }
 
 static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
@@ -1155,12 +1144,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
 	return NULL;
 }
 
-static void ssk_check_wmem(struct mptcp_sock *msk)
-{
-	if (unlikely(!mptcp_is_writeable(msk)))
-		mptcp_nospace(msk);
-}
-
 static void mptcp_push_release(struct sock *sk, struct sock *ssk,
 			       struct mptcp_sendmsg_info *info)
 {
@@ -1332,7 +1315,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 wait_for_memory:
 		mptcp_nospace(msk);
-		mptcp_clean_una(sk);
 		if (mptcp_timer_pending(sk))
 			mptcp_reset_timer(sk);
 		ret = sk_stream_wait_memory(sk, &timeo);
@@ -1344,7 +1326,6 @@ wait_for_memory:
 		mptcp_push_pending(sk, msg->msg_flags);
 
 out:
-	ssk_check_wmem(msk);
 	release_sock(sk);
 	return copied ? : ret;
 }
@@ -1921,7 +1902,6 @@ static int __mptcp_init_sock(struct sock *sk)
 	INIT_LIST_HEAD(&msk->conn_list);
 	INIT_LIST_HEAD(&msk->join_list);
 	INIT_LIST_HEAD(&msk->rtx_queue);
-	__set_bit(MPTCP_SEND_SPACE, &msk->flags);
 	INIT_WORK(&msk->work, mptcp_worker);
 	msk->out_of_order_queue = RB_ROOT;
 	msk->first_pending = NULL;
@@ -2619,13 +2599,6 @@ bool mptcp_finish_join(struct sock *ssk)
 	return true;
 }
 
-static bool mptcp_memory_free(const struct sock *sk, int wake)
-{
-	struct mptcp_sock *msk = mptcp_sk(sk);
-
-	return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
-}
-
 static struct proto mptcp_prot = {
 	.name		= "MPTCP",
 	.owner		= THIS_MODULE,
@@ -2646,7 +2619,6 @@ static struct proto mptcp_prot = {
 	.sockets_allocated	= &mptcp_sockets_allocated,
 	.memory_allocated	= &tcp_memory_allocated,
 	.memory_pressure	= &tcp_memory_pressure,
-	.stream_memory_free	= mptcp_memory_free,
 	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
 	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
 	.sysctl_mem		= sysctl_tcp_mem,
@@ -2820,6 +2792,39 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
 	       0;
 }
 
+static bool __mptcp_check_writeable(struct mptcp_sock *msk)
+{
+	struct sock *sk = (struct sock *)msk;
+	bool mptcp_writable;
+
+	mptcp_clean_una(sk);
+	mptcp_writable = sk_stream_is_writeable(sk);
+	if (!mptcp_writable)
+		mptcp_nospace(msk);
+
+	return mptcp_writable;
+}
+
+static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
+{
+	struct sock *sk = (struct sock *)msk;
+	__poll_t ret = 0;
+	bool slow;
+
+	if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
+		return 0;
+
+	if (sk_stream_is_writeable(sk))
+		return EPOLLOUT | EPOLLWRNORM;
+
+	slow = lock_sock_fast(sk);
+	if (__mptcp_check_writeable(msk))
+		ret = EPOLLOUT | EPOLLWRNORM;
+
+	unlock_sock_fast(sk, slow);
+	return ret;
+}
+
 static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 			   struct poll_table_struct *wait)
 {
@@ -2838,8 +2843,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 
 	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
 		mask |= mptcp_check_readable(msk);
-		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
-			mask |= EPOLLOUT | EPOLLWRNORM;
+		mask |= mptcp_check_writeable(msk);
 	}
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index fd9c666aed7f..8345011fc0ba 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -86,7 +86,7 @@
 
 /* MPTCP socket flags */
 #define MPTCP_DATA_READY	0
-#define MPTCP_SEND_SPACE	1
+#define MPTCP_NOSPACE		1
 #define MPTCP_WORK_RTX		2
 #define MPTCP_WORK_EOF		3
 #define MPTCP_FALLBACK_DONE	4
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 42581ffb0c7e..794259789194 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -997,17 +997,16 @@ static void subflow_data_ready(struct sock *sk)
 static void subflow_write_space(struct sock *sk)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+	struct socket *sock = READ_ONCE(sk->sk_socket);
 	struct sock *parent = subflow->conn;
 
 	if (!sk_stream_is_writeable(sk))
 		return;
 
-	if (sk_stream_is_writeable(parent)) {
-		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
-		smp_mb__after_atomic();
-		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
-		sk_stream_write_space(parent);
-	}
+	if (sock && sk_stream_is_writeable(parent))
+		clear_bit(SOCK_NOSPACE, &sock->flags);
+
+	sk_stream_write_space(parent);
 }
 
 static struct inet_connection_sock_af_ops *