diff options
Diffstat (limited to 'net')
-rw-r--r-- | net/core/sock.c | 78 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 9 | ||||
-rw-r--r-- | net/ipv4/tcp_memcontrol.c | 91 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 7 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 3 | ||||
-rw-r--r-- | net/socket.c | 2 | ||||
-rw-r--r-- | net/sunrpc/auth_gss/gss_rpc_upcall.c | 3 | ||||
-rw-r--r-- | net/sunrpc/clnt.c | 1 | ||||
-rw-r--r-- | net/sunrpc/rpc_pipe.c | 2 | ||||
-rw-r--r-- | net/sunrpc/svc_xprt.c | 45 | ||||
-rw-r--r-- | net/sunrpc/svcauth.c | 2 | ||||
-rw-r--r-- | net/sunrpc/svcauth_unix.c | 8 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/backchannel.c | 26 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/fmr_ops.c | 64 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/frwr_ops.c | 174 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/physical_ops.c | 13 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/rpc_rdma.c | 16 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/transport.c | 3 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c | 16 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/xprt_rdma.h | 14 | ||||
-rw-r--r-- | net/sunrpc/xprtsock.c | 63 |
22 files changed, 407 insertions, 236 deletions
diff --git a/net/core/sock.c b/net/core/sock.c index 51270238e269..6c1c8bc93412 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -195,44 +195,6 @@ bool sk_net_capable(const struct sock *sk, int cap) } EXPORT_SYMBOL(sk_net_capable); - -#ifdef CONFIG_MEMCG_KMEM -int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - struct proto *proto; - int ret = 0; - - mutex_lock(&proto_list_mutex); - list_for_each_entry(proto, &proto_list, node) { - if (proto->init_cgroup) { - ret = proto->init_cgroup(memcg, ss); - if (ret) - goto out; - } - } - - mutex_unlock(&proto_list_mutex); - return ret; -out: - list_for_each_entry_continue_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); - return ret; -} - -void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) -{ - struct proto *proto; - - mutex_lock(&proto_list_mutex); - list_for_each_entry_reverse(proto, &proto_list, node) - if (proto->destroy_cgroup) - proto->destroy_cgroup(memcg); - mutex_unlock(&proto_list_mutex); -} -#endif - /* * Each address family might have different locking rules, so we have * one slock key per address family: @@ -240,11 +202,6 @@ void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg) static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; -#if defined(CONFIG_MEMCG_KMEM) -struct static_key memcg_socket_limit_enabled; -EXPORT_SYMBOL(memcg_socket_limit_enabled); -#endif - /* * Make lock validator output more readable. (we pre-construct these * strings build-time, so that runtime initialization of socket @@ -1507,12 +1464,6 @@ void sk_free(struct sock *sk) } EXPORT_SYMBOL(sk_free); -static void sk_update_clone(const struct sock *sk, struct sock *newsk) -{ - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - sock_update_memcg(newsk); -} - /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone @@ -1607,7 +1558,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sk_set_socket(newsk, NULL); newsk->sk_wq = NULL; - sk_update_clone(sk, newsk); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + sock_update_memcg(newsk); if (newsk->sk_prot->sockets_allocated) sk_sockets_allocated_inc(newsk); @@ -2089,27 +2041,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) struct proto *prot = sk->sk_prot; int amt = sk_mem_pages(size); long allocated; - int parent_status = UNDER_LIMIT; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - allocated = sk_memory_allocated_add(sk, amt, &parent_status); + allocated = sk_memory_allocated_add(sk, amt); + + if (mem_cgroup_sockets_enabled && sk->sk_memcg && + !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) + goto suppress_allocation; /* Under limit. */ - if (parent_status == UNDER_LIMIT && - allocated <= sk_prot_mem_limits(sk, 0)) { + if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); return 1; } - /* Under pressure. (we or our parents) */ - if ((parent_status > SOFT_LIMIT) || - allocated > sk_prot_mem_limits(sk, 1)) + /* Under pressure. */ + if (allocated > sk_prot_mem_limits(sk, 1)) sk_enter_memory_pressure(sk); - /* Over hard limit (we or our parents) */ - if ((parent_status == OVER_LIMIT) || - (allocated > sk_prot_mem_limits(sk, 2))) + /* Over hard limit. */ + if (allocated > sk_prot_mem_limits(sk, 2)) goto suppress_allocation; /* guarantee minimum buffer size under pressure */ @@ -2158,6 +2110,9 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); + return 0; } EXPORT_SYMBOL(__sk_mem_schedule); @@ -2173,6 +2128,9 @@ void __sk_mem_reclaim(struct sock *sk, int amount) sk_memory_allocated_sub(sk, amount); sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); + if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7bb1b091efd1..fd17eec93525 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -422,7 +422,8 @@ void tcp_init_sock(struct sock *sk) sk->sk_rcvbuf = sysctl_tcp_rmem[1]; local_bh_disable(); - sock_update_memcg(sk); + if (mem_cgroup_sockets_enabled) + sock_update_memcg(sk); sk_sockets_allocated_inc(sk); local_bh_enable(); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 65947c1f4733..c7d1fb50f381 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1818,7 +1818,9 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_saved_syn_free(tp); sk_sockets_allocated_dec(sk); - sock_release_memcg(sk); + + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + sock_release_memcg(sk); } EXPORT_SYMBOL(tcp_v4_destroy_sock); @@ -2342,11 +2344,6 @@ struct proto tcp_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_MEMCG_KMEM - .init_cgroup = tcp_init_cgroup, - .destroy_cgroup = tcp_destroy_cgroup, - .proto_cgroup = tcp_proto_cgroup, -#endif .diag_destroy = tcp_abort, }; EXPORT_SYMBOL(tcp_prot); diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 2379c1b4efb2..18bc7f745e9c 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -8,75 +8,49 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct page_counter *counter_parent = NULL; /* * The root cgroup does not use page_counters, but rather, * rely on the data already collected by the network * subsystem */ - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - struct page_counter *counter_parent = NULL; - struct cg_proto *cg_proto, *parent_cg; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) + if (memcg == root_mem_cgroup) return 0; - cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0]; - cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1]; - cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2]; - cg_proto->memory_pressure = 0; - cg_proto->memcg = memcg; + memcg->tcp_mem.memory_pressure = 0; - parent_cg = tcp_prot.proto_cgroup(parent); - if (parent_cg) - counter_parent = &parent_cg->memory_allocated; + if (parent) + counter_parent = &parent->tcp_mem.memory_allocated; - page_counter_init(&cg_proto->memory_allocated, counter_parent); - percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); + page_counter_init(&memcg->tcp_mem.memory_allocated, counter_parent); return 0; } -EXPORT_SYMBOL(tcp_init_cgroup); void tcp_destroy_cgroup(struct mem_cgroup *memcg) { - struct cg_proto *cg_proto; - - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) + if (memcg == root_mem_cgroup) return; - percpu_counter_destroy(&cg_proto->sockets_allocated); - - if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) - static_key_slow_dec(&memcg_socket_limit_enabled); - + if (memcg->tcp_mem.active) + static_branch_dec(&memcg_sockets_enabled_key); } -EXPORT_SYMBOL(tcp_destroy_cgroup); static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) { - struct cg_proto *cg_proto; - int i; int ret; - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) + if (memcg == root_mem_cgroup) return -EINVAL; - ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages); + ret = page_counter_limit(&memcg->tcp_mem.memory_allocated, nr_pages); if (ret) return ret; - for (i = 0; i < 3; i++) - cg_proto->sysctl_mem[i] = min_t(long, nr_pages, - sysctl_tcp_mem[i]); - - if (nr_pages == PAGE_COUNTER_MAX) - clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); - else { + if (!memcg->tcp_mem.active) { /* - * The active bit needs to be written after the static_key + * The active flag needs to be written after the static_key * update. This is what guarantees that the socket activation * function is the last one to run. See sock_update_memcg() for * details, and note that we don't mark any socket as belonging @@ -90,14 +64,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages) * We never race with the readers in sock_update_memcg(), * because when this value change, the code to process it is not * patched in yet. - * - * The activated bit is used to guarantee that no two writers - * will do the update in the same memcg. Without that, we can't - * properly shutdown the static key. */ - if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) - static_key_slow_inc(&memcg_socket_limit_enabled); - set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); + static_branch_inc(&memcg_sockets_enabled_key); + memcg->tcp_mem.active = true; } return 0; @@ -141,32 +110,32 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg); u64 val; switch (cft->private) { case RES_LIMIT: - if (!cg_proto) - return PAGE_COUNTER_MAX; - val = cg_proto->memory_allocated.limit; + if (memcg == root_mem_cgroup) + val = PAGE_COUNTER_MAX; + else + val = memcg->tcp_mem.memory_allocated.limit; val *= PAGE_SIZE; break; case RES_USAGE: - if (!cg_proto) + if (memcg == root_mem_cgroup) val = atomic_long_read(&tcp_memory_allocated); else - val = page_counter_read(&cg_proto->memory_allocated); + val = page_counter_read(&memcg->tcp_mem.memory_allocated); val *= PAGE_SIZE; break; case RES_FAILCNT: - if (!cg_proto) + if (memcg == root_mem_cgroup) return 0; - val = cg_proto->memory_allocated.failcnt; + val = memcg->tcp_mem.memory_allocated.failcnt; break; case RES_MAX_USAGE: - if (!cg_proto) + if (memcg == root_mem_cgroup) return 0; - val = cg_proto->memory_allocated.watermark; + val = memcg->tcp_mem.memory_allocated.watermark; val *= PAGE_SIZE; break; default: @@ -179,19 +148,17 @@ static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg; - struct cg_proto *cg_proto; memcg = mem_cgroup_from_css(of_css(of)); - cg_proto = tcp_prot.proto_cgroup(memcg); - if (!cg_proto) + if (memcg == root_mem_cgroup) return nbytes; switch (of_cft(of)->private) { case RES_MAX_USAGE: - page_counter_reset_watermark(&cg_proto->memory_allocated); + page_counter_reset_watermark(&memcg->tcp_mem.memory_allocated); break; case RES_FAILCNT: - cg_proto->memory_allocated.failcnt = 0; + memcg->tcp_mem.memory_allocated.failcnt = 0; break; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 412a920fe0ec..fda379cd600d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2813,13 +2813,16 @@ begin_fwd: */ void sk_forced_mem_schedule(struct sock *sk, int size) { - int amt, status; + int amt; if (size <= sk->sk_forward_alloc) return; amt = sk_mem_pages(size); sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; - sk_memory_allocated_add(sk, amt, &status); + sk_memory_allocated_add(sk, amt); + + if (mem_cgroup_sockets_enabled && sk->sk_memcg) + mem_cgroup_charge_skmem(sk->sk_memcg, amt); } /* Send a FIN. The caller locks the socket for us. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index db9f1c318afc..4ad8edb46f7c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1889,9 +1889,6 @@ struct proto tcpv6_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_MEMCG_KMEM - .proto_cgroup = tcp_proto_cgroup, -#endif .clear_sk = tcp_v6_clear_sk, .diag_destroy = tcp_abort, }; diff --git a/net/socket.c b/net/socket.c index 91c2de6f5020..c044d1e8508c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -294,7 +294,7 @@ static int init_inodecache(void) 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | SLAB_ACCOUNT), init_once); if (sock_inode_cachep == NULL) return -ENOMEM; diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index 59eeed43eda2..f0c6a8c78a56 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -326,6 +326,9 @@ int gssp_accept_sec_context_upcall(struct net *net, if (data->found_creds && client_name.data != NULL) { char *c; + data->creds.cr_raw_principal = kstrndup(client_name.data, + client_name.len, GFP_KERNEL); + data->creds.cr_principal = kstrndup(client_name.data, client_name.len, GFP_KERNEL); if (data->creds.cr_principal) { diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 23608eb0ded2..b7f21044f4d8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1217,6 +1217,7 @@ static int rpc_anyaddr(int family, struct sockaddr *buf, size_t buflen) return -EINVAL; memcpy(buf, &rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); + break; default: dprintk("RPC: %s: address family not supported\n", __func__); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index d81186d34558..14f45bf0410c 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1500,7 +1500,7 @@ int register_rpc_pipefs(void) rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", sizeof(struct rpc_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (!rpc_inode_cachep) return -ENOMEM; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index a6cbb2104667..7422f28818b2 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -10,11 +10,13 @@ #include <linux/kthread.h> #include <linux/slab.h> #include <net/sock.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svc_xprt.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/xprt.h> #include <linux/module.h> +#include <linux/netdevice.h> #include <trace/events/sunrpc.h> #define RPCDBG_FACILITY RPCDBG_SVCXPRT @@ -938,6 +940,49 @@ static void svc_age_temp_xprts(unsigned long closure) mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); } +/* Close temporary transports whose xpt_local matches server_addr immediately + * instead of waiting for them to be picked up by the timer. + * + * This is meant to be called from a notifier_block that runs when an ip + * address is deleted. + */ +void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr) +{ + struct svc_xprt *xprt; + struct svc_sock *svsk; + struct socket *sock; + struct list_head *le, *next; + LIST_HEAD(to_be_closed); + struct linger no_linger = { + .l_onoff = 1, + .l_linger = 0, + }; + + spin_lock_bh(&serv->sv_lock); + list_for_each_safe(le, next, &serv->sv_tempsocks) { + xprt = list_entry(le, struct svc_xprt, xpt_list); + if (rpc_cmp_addr(server_addr, (struct sockaddr *) + &xprt->xpt_local)) { + dprintk("svc_age_temp_xprts_now: found %p\n", xprt); + list_move(le, &to_be_closed); + } + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_closed)) { + le = to_be_closed.next; + list_del_init(le); + xprt = list_entry(le, struct svc_xprt, xpt_list); + dprintk("svc_age_temp_xprts_now: closing %p\n", xprt); + svsk = container_of(xprt, struct svc_sock, sk_xprt); + sock = svsk->sk_sock; + kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&no_linger, sizeof(no_linger)); + svc_close_xprt(xprt); + } +} +EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now); + static void call_xpt_users(struct svc_xprt *xprt) { struct svc_xpt_user *u; diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 79c0f3459b5c..69841db1f533 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -55,6 +55,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) spin_unlock(&authtab_lock); rqstp->rq_auth_slack = 0; + init_svc_cred(&rqstp->rq_cred); rqstp->rq_authop = aops; return aops->accept(rqstp, authp); @@ -63,6 +64,7 @@ EXPORT_SYMBOL_GPL(svc_authenticate); int svc_set_client(struct svc_rqst *rqstp) { + rqstp->rq_client = NULL; return rqstp->rq_authop->set_client(rqstp); } EXPORT_SYMBOL_GPL(svc_set_client); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 621ca7b4a155..dfacdc95b3f5 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -728,10 +728,6 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) struct kvec *resv = &rqstp->rq_res.head[0]; struct svc_cred *cred = &rqstp->rq_cred; - cred->cr_group_info = NULL; - cred->cr_principal = NULL; - rqstp->rq_client = NULL; - if (argv->iov_len < 3*4) return SVC_GARBAGE; @@ -794,10 +790,6 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) u32 slen, i; int len = argv->iov_len; - cred->cr_group_info = NULL; - cred->cr_principal = NULL; - rqstp->rq_client = NULL; - if ((len -= 3*4) < 0) return SVC_GARBAGE; diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 2dcb44f69e53..cc1251d07297 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -15,7 +15,7 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -#define RPCRDMA_BACKCHANNEL_DEBUG +#undef RPCRDMA_BACKCHANNEL_DEBUG static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) @@ -42,8 +42,8 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, size_t size; req = rpcrdma_create_req(r_xprt); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); req->rl_backchannel = true; size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); @@ -84,9 +84,7 @@ out_fail: static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, unsigned int count) { - struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - unsigned long flags; int rc = 0; while (count--) { @@ -98,9 +96,7 @@ static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, break; } - spin_lock_irqsave(&buffers->rb_lock, flags); - list_add(&rep->rr_list, &buffers->rb_recv_bufs); - spin_unlock_irqrestore(&buffers->rb_lock, flags); + rpcrdma_recv_buffer_put(rep); } return rc; @@ -140,6 +136,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) __func__); goto out_free; } + dprintk("RPC: %s: new rqst %p\n", __func__, rqst); rqst->rq_xprt = &r_xprt->rx_xprt; INIT_LIST_HEAD(&rqst->rq_list); @@ -220,12 +217,14 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) rpclen = rqst->rq_svec[0].iov_len; +#ifdef RPCRDMA_BACKCHANNEL_DEBUG pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n", __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf)); pr_info("RPC: %s: RPC/RDMA: %*ph\n", __func__, (int)RPCRDMA_HDRLEN_MIN, headerp); pr_info("RPC: %s: RPC: %*ph\n", __func__, (int)rpclen, rqst->rq_svec[0].iov_base); +#endif req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN; @@ -269,6 +268,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) { struct rpc_xprt *xprt = rqst->rq_xprt; + dprintk("RPC: %s: freeing rqst %p (req %p)\n", + __func__, rqst, rpcr_to_rdmar(rqst)); + smp_mb__before_atomic(); WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)); clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); @@ -333,9 +335,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpc_rqst, rq_bc_pa_list); list_del(&rqst->rq_bc_pa_list); spin_unlock(&xprt->bc_pa_lock); -#ifdef RPCRDMA_BACKCHANNEL_DEBUG - pr_info("RPC: %s: using rqst %p\n", __func__, rqst); -#endif + dprintk("RPC: %s: using rqst %p\n", __func__, rqst); /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; @@ -355,10 +355,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, * direction reply. */ req = rpcr_to_rdmar(rqst); -#ifdef RPCRDMA_BACKCHANNEL_DEBUG - pr_info("RPC: %s: attaching rep %p to req %p\n", + dprintk("RPC: %s: attaching rep %p to req %p\n", __func__, rep, req); -#endif req->rl_reply = rep; /* Defeat the retransmit detection logic in send_request */ diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index f1e8dafbd507..c14f3a4bff68 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -179,6 +179,69 @@ out_maperr: return rc; } +static void +__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + struct rpcrdma_mw *mw = seg->rl_mw; + int nsegs = seg->mr_nsegs; + + seg->rl_mw = NULL; + + while (nsegs--) + rpcrdma_unmap_one(device, seg++); + + rpcrdma_put_mw(r_xprt, mw); +} + +/* Invalidate all memory regions that were registered for "req". + * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + */ +static void +fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct rpcrdma_mr_seg *seg; + unsigned int i, nchunks; + struct rpcrdma_mw *mw; + LIST_HEAD(unmap_list); + int rc; + + dprintk("RPC: %s: req %p\n", __func__, req); + + /* ORDER: Invalidate all of the req's MRs first + * + * ib_unmap_fmr() is slow, so use a single call instead + * of one call per mapped MR. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + mw = seg->rl_mw; + + list_add(&mw->r.fmr.fmr->list, &unmap_list); + + i += seg->mr_nsegs; + } + rc = ib_unmap_fmr(&unmap_list); + if (rc) + pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); + + /* ORDER: Now DMA unmap all of the req's MRs, and return + * them to the free MW list. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + __fmr_dma_unmap(r_xprt, seg); + + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + } + + req->rl_nchunks = 0; +} + /* Use the ib_unmap_fmr() verb to prevent further remote * access via RDMA READ or RDMA WRITE. */ @@ -231,6 +294,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, + .ro_unmap_sync = fmr_op_unmap_sync, .ro_unmap = fmr_op_unmap, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 88cf9e7269c2..c6836844bd0e 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -245,12 +245,14 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); } -/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */ +/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs + * to be reset. + * + * WARNING: Only wr_id and status are reliable at this point + */ static void -frwr_sendcompletion(struct ib_wc *wc) +__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_mw *r) { - struct rpcrdma_mw *r; - if (likely(wc->status == IB_WC_SUCCESS)) return; @@ -261,9 +263,23 @@ frwr_sendcompletion(struct ib_wc *wc) else pr_warn("RPC: %s: frmr %p error, status %s (%d)\n", __func__, r, ib_wc_status_msg(wc->status), wc->status); + r->r.frmr.fr_state = FRMR_IS_STALE; } +static void +frwr_sendcompletion(struct ib_wc *wc) +{ + struct rpcrdma_mw *r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + struct rpcrdma_frmr *f = &r->r.frmr; + + if (unlikely(wc->status != IB_WC_SUCCESS)) + __frwr_sendcompletion_flush(wc, r); + + if (f->fr_waiter) + complete(&f->fr_linv_done); +} + static int frwr_op_init(struct rpcrdma_xprt *r_xprt) { @@ -319,7 +335,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct rpcrdma_mw *mw; struct rpcrdma_frmr *frmr; struct ib_mr *mr; - struct ib_reg_wr reg_wr; + struct ib_reg_wr *reg_wr; struct ib_send_wr *bad_wr; int rc, i, n, dma_nents; u8 key; @@ -335,7 +351,9 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, } while (mw->r.frmr.fr_state != FRMR_IS_INVALID); frmr = &mw->r.frmr; frmr->fr_state = FRMR_IS_VALID; + frmr->fr_waiter = false; mr = frmr->fr_mr; + reg_wr = &frmr->fr_regwr; if (nsegs > ia->ri_max_frmr_depth) nsegs = ia->ri_max_frmr_depth; @@ -381,19 +399,19 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); - reg_wr.wr.next = NULL; - reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.wr_id = (uintptr_t)mw; - reg_wr.wr.num_sge = 0; - reg_wr.wr.send_flags = 0; - reg_wr.mr = mr; - reg_wr.key = mr->rkey; - reg_wr.access = writing ? - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ; + reg_wr->wr.next = NULL; + reg_wr->wr.opcode = IB_WR_REG_MR; + reg_wr->wr.wr_id = (uintptr_t)mw; + reg_wr->wr.num_sge = 0; + reg_wr->wr.send_flags = 0; + reg_wr->mr = mr; + reg_wr->key = mr->rkey; + reg_wr->access = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_post_send(ia->ri_id->qp, ®_wr.wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, ®_wr->wr, &bad_wr); if (rc) goto out_senderr; @@ -413,6 +431,116 @@ out_senderr: return rc; } +static struct ib_send_wr * +__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_mw *mw = seg->rl_mw; + struct rpcrdma_frmr *f = &mw->r.frmr; + struct ib_send_wr *invalidate_wr; + + f->fr_waiter = false; + f->fr_state = FRMR_IS_INVALID; + invalidate_wr = &f->fr_invwr; + + memset(invalidate_wr, 0, sizeof(*invalidate_wr)); + invalidate_wr->wr_id = (unsigned long)(void *)mw; + invalidate_wr->opcode = IB_WR_LOCAL_INV; + invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey; + + return invalidate_wr; +} + +static void +__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int rc) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + struct rpcrdma_mw *mw = seg->rl_mw; + struct rpcrdma_frmr *f = &mw->r.frmr; + + seg->rl_mw = NULL; + + ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir); + + if (!rc) + rpcrdma_put_mw(r_xprt, mw); + else + __frwr_queue_recovery(mw); +} + +/* Invalidate all memory regions that were registered for "req". + * + * Sleeps until it is safe for the host CPU to access the + * previously mapped memory regions. + */ +static void +frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg; + unsigned int i, nchunks; + struct rpcrdma_frmr *f; + int rc; + + dprintk("RPC: %s: req %p\n", __func__, req); + + /* ORDER: Invalidate all of the req's MRs first + * + * Chain the LOCAL_INV Work Requests and post them with + * a single ib_post_send() call. + */ + invalidate_wrs = pos = prev = NULL; + seg = NULL; + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + pos = __frwr_prepare_linv_wr(seg); + + if (!invalidate_wrs) + invalidate_wrs = pos; + else + prev->next = pos; + prev = pos; + + i += seg->mr_nsegs; + } + f = &seg->rl_mw->r.frmr; + + /* Strong send queue ordering guarantees that when the + * last WR in the chain completes, all WRs in the chain + * are complete. + */ + f->fr_invwr.send_flags = IB_SEND_SIGNALED; + f->fr_waiter = true; + init_completion(&f->fr_linv_done); + INIT_CQCOUNT(&r_xprt->rx_ep); + + /* Transport disconnect drains the receive CQ before it + * replaces the QP. The RPC reply handler won't call us + * unless ri_id->qp is a valid pointer. + */ + rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr); + if (rc) + pr_warn("%s: ib_post_send failed %i\n", __func__, rc); + + wait_for_completion(&f->fr_linv_done); + + /* ORDER: Now DMA unmap all of the req's MRs, and return + * them to the free MW list. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + + __frwr_dma_unmap(r_xprt, seg, rc); + + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + } + + req->rl_nchunks = 0; +} + /* Post a LOCAL_INV Work Request to prevent further remote access * via RDMA READ or RDMA WRITE. */ @@ -423,23 +551,24 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_mw *mw = seg1->rl_mw; struct rpcrdma_frmr *frmr = &mw->r.frmr; - struct ib_send_wr invalidate_wr, *bad_wr; + struct ib_send_wr *invalidate_wr, *bad_wr; int rc, nsegs = seg->mr_nsegs; dprintk("RPC: %s: FRMR %p\n", __func__, mw); seg1->rl_mw = NULL; frmr->fr_state = FRMR_IS_INVALID; + invalidate_wr = &mw->r.frmr.fr_invwr; - memset(&invalidate_wr, 0, sizeof(invalidate_wr)); - invalidate_wr.wr_id = (unsigned long)(void *)mw; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey; + memset(invalidate_wr, 0, sizeof(*invalidate_wr)); + invalidate_wr->wr_id = (uintptr_t)mw; + invalidate_wr->opcode = IB_WR_LOCAL_INV; + invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); read_lock(&ia->ri_qplock); - rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr); read_unlock(&ia->ri_qplock); if (rc) goto out_err; @@ -471,6 +600,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, + .ro_unmap_sync = frwr_op_unmap_sync, .ro_unmap = frwr_op_unmap, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 617b76f22154..dbb302ecf590 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -83,6 +83,18 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) return 1; } +/* DMA unmap all memory regions that were mapped for "req". + */ +static void +physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +{ + struct ib_device *device = r_xprt->rx_ia.ri_device; + unsigned int i; + + for (i = 0; req->rl_nchunks; --req->rl_nchunks) + rpcrdma_unmap_one(device, &req->rl_segments[i++]); +} + static void physical_op_destroy(struct rpcrdma_buffer *buf) { @@ -90,6 +102,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_map = physical_op_map, + .ro_unmap_sync = physical_op_unmap_sync, .ro_unmap = physical_op_unmap, .ro_open = physical_op_open, .ro_maxpages = physical_op_maxpages, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c10d9699441c..0f28f2d743ed 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -804,6 +804,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (req->rl_reply) goto out_duplicate; + /* Sanity checking has passed. We are now committed + * to complete this transaction. + */ + list_del_init(&rqst->rq_list); + spin_unlock_bh(&xprt->transport_lock); dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" " RPC request 0x%p xid 0x%08x\n", __func__, rep, req, rqst, @@ -888,12 +893,23 @@ badheader: break; } + /* Invalidate and flush the data payloads before waking the + * waiting application. This guarantees the memory region is + * properly fenced from the server before the application + * accesses the data. It also ensures proper send flow + * control: waking the next RPC waits until this RPC has + * relinquished all its Send Queue entries. + */ + if (req->rl_nchunks) + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); + credits = be32_to_cpu(headerp->rm_credit); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_buf.rb_max_requests) credits = r_xprt->rx_buf.rb_max_requests; + spin_lock_bh(&xprt->transport_lock); cwnd = xprt->cwnd; xprt->cwnd = credits << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8c545f7d7525..740bddcf3488 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -576,6 +576,9 @@ xprt_rdma_free(void *buffer) rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]); req = rb->rg_owner; + if (req->rl_backchannel) + return; + r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index eadd1655145a..732c71ce5dca 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -616,10 +616,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* set trigger for requesting send completion */ ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1; - if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS) - ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS; - else if (ep->rep_cqinit <= 2) - ep->rep_cqinit = 0; + if (ep->rep_cqinit <= 2) + ep->rep_cqinit = 0; /* always signal? */ INIT_CQCOUNT(ep); init_waitqueue_head(&ep->rep_connect_wait); INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); @@ -852,10 +850,11 @@ retry: if (extras) { rc = rpcrdma_ep_post_extra_recv(r_xprt, extras); - if (rc) + if (rc) { pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n", __func__, rc); rc = 0; + } } } @@ -1337,15 +1336,14 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_rep *rep; - unsigned long flags; int rc; while (count--) { - spin_lock_irqsave(&buffers->rb_lock, flags); + spin_lock(&buffers->rb_lock); if (list_empty(&buffers->rb_recv_bufs)) goto out_reqbuf; rep = rpcrdma_buffer_get_rep_locked(buffers); - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_unlock(&buffers->rb_lock); rc = rpcrdma_ep_post_recv(ia, ep, rep); if (rc) @@ -1355,7 +1353,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) return 0; out_reqbuf: - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_unlock(&buffers->rb_lock); pr_warn("%s: no extra receive buffers\n", __func__); return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index ac7f8d4f632a..728101ddc44b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -88,12 +88,6 @@ struct rpcrdma_ep { struct delayed_work rep_connect_worker; }; -/* - * Force a signaled SEND Work Request every so often, - * in case the provider needs to do some housekeeping. - */ -#define RPCRDMA_MAX_UNSIGNALED_SENDS (32) - #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) @@ -207,6 +201,12 @@ struct rpcrdma_frmr { enum rpcrdma_frmr_state fr_state; struct work_struct fr_work; struct rpcrdma_xprt *fr_xprt; + bool fr_waiter; + struct completion fr_linv_done;; + union { + struct ib_reg_wr fr_regwr; + struct ib_send_wr fr_invwr; + }; }; struct rpcrdma_fmr { @@ -364,6 +364,8 @@ struct rpcrdma_xprt; struct rpcrdma_memreg_ops { int (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); + void (*ro_unmap_sync)(struct rpcrdma_xprt *, + struct rpcrdma_req *); int (*ro_unmap)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *); int (*ro_open)(struct rpcrdma_ia *, diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2ffaf6a79499..fde2138b81e7 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -398,7 +398,6 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, if (unlikely(!sock)) return -ENOTSOCK; - clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags); if (base != 0) { addr = NULL; addrlen = 0; @@ -442,7 +441,6 @@ static void xs_nospace_callback(struct rpc_task *task) struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); transport->inet->sk_write_pending--; - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); } /** @@ -467,20 +465,11 @@ static int xs_nospace(struct rpc_task *task) /* Don't race with disconnect */ if (xprt_connected(xprt)) { - if (test_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags)) { - /* - * Notify TCP that we're limited by the application - * window size - */ - set_bit(SOCK_NOSPACE, &transport->sock->flags); - sk->sk_write_pending++; - /* ...and wait for more buffer space */ - xprt_wait_for_buffer_space(task, xs_nospace_callback); - } - } else { - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); + /* wait for more buffer space */ + sk->sk_write_pending++; + xprt_wait_for_buffer_space(task, xs_nospace_callback); + } else ret = -ENOTCONN; - } spin_unlock_bh(&xprt->transport_lock); @@ -616,9 +605,6 @@ process_status: case -EAGAIN: status = xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ENETUNREACH: case -ENOBUFS: case -EPIPE: @@ -626,7 +612,10 @@ process_status: case -EPERM: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } return status; @@ -706,16 +695,16 @@ static int xs_tcp_send_request(struct rpc_task *task) case -EAGAIN: status = xs_nospace(task); break; - default: - dprintk("RPC: sendmsg returned unrecognized error %d\n", - -status); case -ECONNRESET: case -ECONNREFUSED: case -ENOTCONN: case -EADDRINUSE: case -ENOBUFS: case -EPIPE: - clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags); + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); } return status; @@ -1609,19 +1598,23 @@ static void xs_tcp_state_change(struct sock *sk) static void xs_write_space(struct sock *sk) { - struct socket *sock; + struct socket_wq *wq; struct rpc_xprt *xprt; - if (unlikely(!(sock = sk->sk_socket))) + if (!sk->sk_socket) return; - clear_bit(SOCK_NOSPACE, &sock->flags); + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); if (unlikely(!(xprt = xprt_from_sock(sk)))) return; - if (test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags) == 0) - return; + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) + goto out; xprt_write_space(xprt); +out: + rcu_read_unlock(); } /** @@ -1907,18 +1900,6 @@ static inline void xs_reclassify_socket(int family, struct socket *sock) } } #else -static inline void xs_reclassify_socketu(struct socket *sock) -{ -} - -static inline void xs_reclassify_socket4(struct socket *sock) -{ -} - -static inline void xs_reclassify_socket6(struct socket *sock) -{ -} - static inline void xs_reclassify_socket(int family, struct socket *sock) { } @@ -2008,7 +1989,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) "transport socket (%d).\n", -status); goto out; } - xs_reclassify_socketu(sock); + xs_reclassify_socket(AF_LOCAL, sock); dprintk("RPC: worker connecting xprt %p via AF_LOCAL to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); |