summaryrefslogtreecommitdiff
path: root/net/sunrpc/xprtrdma/verbs.c
diff options
context:
space:
mode:
authorChuck Lever <chuck.lever@oracle.com>2018-05-04 15:35:20 -0400
committerAnna Schumaker <Anna.Schumaker@Netapp.com>2018-05-07 09:20:04 -0400
commit7c8d9e7c8863905951d4eaa7a8d277150f3a37f7 (patch)
tree16017a18ac2150272b7b5dd4be6fe1cd5d230aeb /net/sunrpc/xprtrdma/verbs.c
parent0e0b854cfb3302b1907e9d3a927469b95710238f (diff)
xprtrdma: Move Receive posting to Receive handler
Receive completion and Reply handling are done by a BOUND workqueue, meaning they run on only one CPU. Posting receives is currently done in the send_request path, which on large systems is typically done on a different CPU than the one handling Receive completions. This results in movement of Receive-related cachelines between the sending and receiving CPUs. More importantly, it means that currently Receives are posted while the transport's write lock is held, which is unnecessary and costly. Finally, allocation of Receive buffers is performed on-demand in the Receive completion handler. This helps guarantee that they are allocated on the same NUMA node as the CPU that handles Receive completions. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Diffstat (limited to 'net/sunrpc/xprtrdma/verbs.c')
-rw-r--r--net/sunrpc/xprtrdma/verbs.c176
1 files changed, 99 insertions, 77 deletions
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 4b18926de912..2ed2ee4f2c6c 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -74,6 +74,7 @@
*/
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf);
+static int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp);
static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb);
struct workqueue_struct *rpcrdma_receive_wq __read_mostly;
@@ -726,7 +727,6 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
rx_ia);
- unsigned int extras;
int rc;
retry:
@@ -770,9 +770,8 @@ retry:
}
dprintk("RPC: %s: connected\n", __func__);
- extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
- if (extras)
- rpcrdma_ep_post_extra_recv(r_xprt, extras);
+
+ rpcrdma_post_recvs(r_xprt, true);
out:
if (rc)
@@ -1082,14 +1081,8 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
return req;
}
-/**
- * rpcrdma_create_rep - Allocate an rpcrdma_rep object
- * @r_xprt: controlling transport
- *
- * Returns 0 on success or a negative errno on failure.
- */
-int
-rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
+static int
+rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp)
{
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
@@ -1117,6 +1110,7 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
rep->rr_recv_wr.num_sge = 1;
+ rep->rr_temp = temp;
spin_lock(&buf->rb_lock);
list_add(&rep->rr_list, &buf->rb_recv_bufs);
@@ -1168,12 +1162,8 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
list_add(&req->rl_list, &buf->rb_send_bufs);
}
+ buf->rb_posted_receives = 0;
INIT_LIST_HEAD(&buf->rb_recv_bufs);
- for (i = 0; i <= buf->rb_max_requests; i++) {
- rc = rpcrdma_create_rep(r_xprt);
- if (rc)
- goto out;
- }
rc = rpcrdma_sendctxs_create(r_xprt);
if (rc)
@@ -1263,7 +1253,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
rep = rpcrdma_buffer_get_rep_locked(buf);
rpcrdma_destroy_rep(rep);
}
- buf->rb_send_count = 0;
spin_lock(&buf->rb_reqslock);
while (!list_empty(&buf->rb_allreqs)) {
@@ -1278,7 +1267,6 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
spin_lock(&buf->rb_reqslock);
}
spin_unlock(&buf->rb_reqslock);
- buf->rb_recv_count = 0;
rpcrdma_mrs_destroy(buf);
}
@@ -1351,27 +1339,11 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr)
__rpcrdma_mr_put(&r_xprt->rx_buf, mr);
}
-static struct rpcrdma_rep *
-rpcrdma_buffer_get_rep(struct rpcrdma_buffer *buffers)
-{
- /* If an RPC previously completed without a reply (say, a
- * credential problem or a soft timeout occurs) then hold off
- * on supplying more Receive buffers until the number of new
- * pending RPCs catches up to the number of posted Receives.
- */
- if (unlikely(buffers->rb_send_count < buffers->rb_recv_count))
- return NULL;
-
- if (unlikely(list_empty(&buffers->rb_recv_bufs)))
- return NULL;
- buffers->rb_recv_count++;
- return rpcrdma_buffer_get_rep_locked(buffers);
-}
-
-/*
- * Get a set of request/reply buffers.
+/**
+ * rpcrdma_buffer_get - Get a request buffer
+ * @buffers: Buffer pool from which to obtain a buffer
*
- * Reply buffer (if available) is attached to send buffer upon return.
+ * Returns a fresh rpcrdma_req, or NULL if none are available.
*/
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
@@ -1379,23 +1351,21 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
struct rpcrdma_req *req;
spin_lock(&buffers->rb_lock);
- if (list_empty(&buffers->rb_send_bufs))
- goto out_reqbuf;
- buffers->rb_send_count++;
+ if (unlikely(list_empty(&buffers->rb_send_bufs)))
+ goto out_noreqs;
req = rpcrdma_buffer_get_req_locked(buffers);
- req->rl_reply = rpcrdma_buffer_get_rep(buffers);
spin_unlock(&buffers->rb_lock);
-
return req;
-out_reqbuf:
+out_noreqs:
spin_unlock(&buffers->rb_lock);
return NULL;
}
-/*
- * Put request/reply buffers back into pool.
- * Pre-decrement counter/array index.
+/**
+ * rpcrdma_buffer_put - Put request/reply buffers back into pool
+ * @req: object to return
+ *
*/
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
@@ -1406,27 +1376,16 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
req->rl_reply = NULL;
spin_lock(&buffers->rb_lock);
- buffers->rb_send_count--;
- list_add_tail(&req->rl_list, &buffers->rb_send_bufs);
+ list_add(&req->rl_list, &buffers->rb_send_bufs);
if (rep) {
- buffers->rb_recv_count--;
- list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
+ if (!rep->rr_temp) {
+ list_add(&rep->rr_list, &buffers->rb_recv_bufs);
+ rep = NULL;
+ }
}
spin_unlock(&buffers->rb_lock);
-}
-
-/*
- * Recover reply buffers from pool.
- * This happens when recovering from disconnect.
- */
-void
-rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
-{
- struct rpcrdma_buffer *buffers = req->rl_buffer;
-
- spin_lock(&buffers->rb_lock);
- req->rl_reply = rpcrdma_buffer_get_rep(buffers);
- spin_unlock(&buffers->rb_lock);
+ if (rep)
+ rpcrdma_destroy_rep(rep);
}
/*
@@ -1438,10 +1397,13 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
- spin_lock(&buffers->rb_lock);
- buffers->rb_recv_count--;
- list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
- spin_unlock(&buffers->rb_lock);
+ if (!rep->rr_temp) {
+ spin_lock(&buffers->rb_lock);
+ list_add(&rep->rr_list, &buffers->rb_recv_bufs);
+ spin_unlock(&buffers->rb_lock);
+ } else {
+ rpcrdma_destroy_rep(rep);
+ }
}
/**
@@ -1537,13 +1499,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr;
int rc;
- if (req->rl_reply) {
- rc = rpcrdma_ep_post_recv(ia, req->rl_reply);
- if (rc)
- return rc;
- req->rl_reply = NULL;
- }
-
if (!ep->rep_send_count ||
test_bit(RPCRDMA_REQ_F_TX_RESOURCES, &req->rl_flags)) {
send_wr->send_flags |= IB_SEND_SIGNALED;
@@ -1618,3 +1573,70 @@ out_rc:
rpcrdma_recv_buffer_put(rep);
return rc;
}
+
+/**
+ * rpcrdma_post_recvs - Maybe post some Receive buffers
+ * @r_xprt: controlling transport
+ * @temp: when true, allocate temp rpcrdma_rep objects
+ *
+ */
+void
+rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct ib_recv_wr *wr, *bad_wr;
+ int needed, count, rc;
+
+ needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
+ if (buf->rb_posted_receives > needed)
+ return;
+ needed -= buf->rb_posted_receives;
+
+ count = 0;
+ wr = NULL;
+ while (needed) {
+ struct rpcrdma_regbuf *rb;
+ struct rpcrdma_rep *rep;
+
+ spin_lock(&buf->rb_lock);
+ rep = list_first_entry_or_null(&buf->rb_recv_bufs,
+ struct rpcrdma_rep, rr_list);
+ if (likely(rep))
+ list_del(&rep->rr_list);
+ spin_unlock(&buf->rb_lock);
+ if (!rep) {
+ if (rpcrdma_create_rep(r_xprt, temp))
+ break;
+ continue;
+ }
+
+ rb = rep->rr_rdmabuf;
+ if (!rpcrdma_regbuf_is_mapped(rb)) {
+ if (!__rpcrdma_dma_map_regbuf(&r_xprt->rx_ia, rb)) {
+ rpcrdma_recv_buffer_put(rep);
+ break;
+ }
+ }
+
+ trace_xprtrdma_post_recv(rep->rr_recv_wr.wr_cqe);
+ rep->rr_recv_wr.next = wr;
+ wr = &rep->rr_recv_wr;
+ ++count;
+ --needed;
+ }
+ if (!count)
+ return;
+
+ rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, &bad_wr);
+ if (rc) {
+ for (wr = bad_wr; wr; wr = wr->next) {
+ struct rpcrdma_rep *rep;
+
+ rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr);
+ rpcrdma_recv_buffer_put(rep);
+ --count;
+ }
+ }
+ buf->rb_posted_receives += count;
+ trace_xprtrdma_post_recvs(r_xprt, count, rc);
+}