From 616d37a070bb33ea387d0e93343acd8336a30886 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 18 Jun 2019 22:12:05 -0700 Subject: rds: fix reordering with composite message notification RDS composite message(rdma + control) user notification needs to be triggered once the full message is delivered and such a fix was added as part of commit 941f8d55f6d61 ("RDS: RDMA: Fix the composite message user notification"). But rds_send_remove_from_sock is missing data part notify check and hence at times the user don't get notification which isn't desirable. One way is to fix the rds_send_remove_from_sock to check of that case but considering the ordering complexity with completion handler and rdma + control messages are always dispatched back to back in same send context, just delaying the signaled completion on rmda work request also gets the desired behaviour. i.e Notifying application only after RDMA + control message send completes. So patch updates the earlier fix with this approach. The delay signaling completions of rdma op till the control message send completes fix was done by Venkat Venkatsubra in downstream kernel. Reviewed-and-tested-by: Zhu Yanjun Reviewed-by: Gerd Rausch Signed-off-by: Santosh Shilimkar --- net/rds/ib_send.c | 29 +++++++++++++---------------- net/rds/rdma.c | 10 ---------- net/rds/rds.h | 1 - net/rds/send.c | 4 +--- 4 files changed, 14 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 18f2341202f8..dfe6237dafe2 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -69,6 +69,16 @@ static void rds_ib_send_complete(struct rds_message *rm, complete(rm, notify_status); } +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, + struct rm_data_op *op, + int wc_status) +{ + if (op->op_nents) + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + DMA_TO_DEVICE); +} + static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, struct rm_rdma_op *op, int wc_status) @@ -129,21 +139,6 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, rds_ib_stats_inc(s_ib_atomic_fadd); } -static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, - struct rm_data_op *op, - int wc_status) -{ - struct rds_message *rm = container_of(op, struct rds_message, data); - - if (op->op_nents) - ib_dma_unmap_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, - DMA_TO_DEVICE); - - if (rm->rdma.op_active && rm->data.op_notify) - rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status); -} - /* * Unmap the resources associated with a struct send_work. * @@ -902,7 +897,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) send->s_queued = jiffies; send->s_op = NULL; - nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); + if (!op->op_notify) + nr_sig += rds_ib_set_wr_signal_state(ic, send, + op->op_notify); send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; send->s_rdma_wr.remote_addr = remote_addr; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index b340ed4fc43a..916f5ec373d8 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -641,16 +641,6 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, } op->op_notifier->n_user_token = args->user_token; op->op_notifier->n_status = RDS_RDMA_SUCCESS; - - /* Enable rmda notification on data operation for composite - * rds messages and make sure notification is enabled only - * for the data operation which follows it so that application - * gets notified only after full message gets delivered. - */ - if (rm->data.op_sg) { - rm->rdma.op_notify = 0; - rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); - } } /* The cookie contains the R_Key of the remote memory region, and diff --git a/net/rds/rds.h b/net/rds/rds.h index 0d8f67cadd74..f0066d168499 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -476,7 +476,6 @@ struct rds_message { } rdma; struct rm_data_op { unsigned int op_active:1; - unsigned int op_notify:1; unsigned int op_nents; unsigned int op_count; unsigned int op_dmasg; diff --git a/net/rds/send.c b/net/rds/send.c index 166dd578c1cc..031b1e97a466 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -491,14 +491,12 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) struct rm_rdma_op *ro; struct rds_notifier *notifier; unsigned long flags; - unsigned int notify = 0; spin_lock_irqsave(&rm->m_rs_lock, flags); - notify = rm->rdma.op_notify | rm->data.op_notify; ro = &rm->rdma; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && - ro->op_active && notify && ro->op_notifier) { + ro->op_active && ro->op_notify && ro->op_notifier) { notifier = ro->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); -- cgit v1.2.3 From a55207884708bf1d5b8c87a3c504502de77a5416 Mon Sep 17 00:00:00 2001 From: Gerd Rausch Date: Fri, 28 Jun 2019 17:31:19 -0700 Subject: Revert "RDS: IB: split the mr registration and invalidation path" This reverts commit 56012459310a1dbcc55c2dbf5500a9f7571402cb. RDS kept spinning inside function "rds_ib_post_reg_frmr", waiting for "i_fastreg_wrs" to become incremented: while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { atomic_inc(&ibmr->ic->i_fastreg_wrs); cpu_relax(); } Looking at the original commit: commit 56012459310a ("RDS: IB: split the mr registration and invalidation path") In there, the "rds_ib_mr_cqe_handler" was changed in the following way: void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) if (frmr->fr_inv) { frmr->fr_state = FRMR_IS_FREE; frmr->fr_inv = false; atomic_inc(&ic->i_fastreg_wrs); } else { atomic_inc(&ic->i_fastunreg_wrs); } It looks like it's got it exactly backwards: Function "rds_ib_post_reg_frmr" keeps track of the outstanding requests via "i_fastreg_wrs". Function "rds_ib_post_inv" keeps track of the outstanding requests via "i_fastunreg_wrs" (post original commit). It also sets: frmr->fr_inv = true; However the completion handler "rds_ib_mr_cqe_handler" adjusts "i_fastreg_wrs" when "fr_inv" had been true, and adjusts "i_fastunreg_wrs" otherwise. The original commit was done in the name of performance: to remove the performance bottleneck No performance benefit could be observed with a fixed-up version of the original commit measured between two Oracle X7 servers, both equipped with Mellanox Connect-X5 HCAs. The prudent course of action is to revert this commit. Signed-off-by: Gerd Rausch Signed-off-by: Santosh Shilimkar --- net/rds/ib.h | 4 +--- net/rds/ib_cm.c | 9 ++------- net/rds/ib_frmr.c | 11 +++++------ 3 files changed, 8 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index 67a715b076ca..66c03c7665b2 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -15,8 +15,7 @@ #define RDS_IB_DEFAULT_RECV_WR 1024 #define RDS_IB_DEFAULT_SEND_WR 256 -#define RDS_IB_DEFAULT_FR_WR 256 -#define RDS_IB_DEFAULT_FR_INV_WR 256 +#define RDS_IB_DEFAULT_FR_WR 512 #define RDS_IB_DEFAULT_RETRY_COUNT 1 @@ -157,7 +156,6 @@ struct rds_ib_connection { /* To control the number of wrs from fastreg */ atomic_t i_fastreg_wrs; - atomic_t i_fastunreg_wrs; /* interrupt handling */ struct tasklet_struct i_send_tasklet; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 66c6eb56072b..8891822eba4f 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -460,10 +460,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) * completion queue and send queue. This extra space is used for FRMR * registration and invalidation work requests */ - fr_queue_space = rds_ibdev->use_fastreg ? - (RDS_IB_DEFAULT_FR_WR + 1) + - (RDS_IB_DEFAULT_FR_INV_WR + 1) - : 0; + fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0); /* add the conn now so that connection establishment has the dev */ rds_ib_add_conn(rds_ibdev, conn); @@ -530,7 +527,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn) attr.send_cq = ic->i_send_cq; attr.recv_cq = ic->i_recv_cq; atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR); - atomic_set(&ic->i_fastunreg_wrs, RDS_IB_DEFAULT_FR_INV_WR); /* * XXX this can fail if max_*_wr is too large? Are we supposed @@ -1009,8 +1005,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) wait_event(rds_ib_ring_empty_wait, rds_ib_ring_empty(&ic->i_recv_ring) && (atomic_read(&ic->i_signaled_sends) == 0) && - (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR) && - (atomic_read(&ic->i_fastunreg_wrs) == RDS_IB_DEFAULT_FR_INV_WR)); + (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR)); tasklet_kill(&ic->i_send_tasklet); tasklet_kill(&ic->i_recv_tasklet); diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 688dcd68d4ea..32ae26ed58a0 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -239,8 +239,8 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr) if (frmr->fr_state != FRMR_IS_INUSE) goto out; - while (atomic_dec_return(&ibmr->ic->i_fastunreg_wrs) <= 0) { - atomic_inc(&ibmr->ic->i_fastunreg_wrs); + while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) { + atomic_inc(&ibmr->ic->i_fastreg_wrs); cpu_relax(); } @@ -257,7 +257,7 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr) if (unlikely(ret)) { frmr->fr_state = FRMR_IS_STALE; frmr->fr_inv = false; - atomic_inc(&ibmr->ic->i_fastunreg_wrs); + atomic_inc(&ibmr->ic->i_fastreg_wrs); pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret); goto out; } @@ -285,10 +285,9 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) if (frmr->fr_inv) { frmr->fr_state = FRMR_IS_FREE; frmr->fr_inv = false; - atomic_inc(&ic->i_fastreg_wrs); - } else { - atomic_inc(&ic->i_fastunreg_wrs); } + + atomic_inc(&ic->i_fastreg_wrs); } void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, -- cgit v1.2.3 From 8c6166cfc9cd48e93d9176561e50b63cef4330d5 Mon Sep 17 00:00:00 2001 From: Gerd Rausch Date: Thu, 27 Jun 2019 09:21:44 -0700 Subject: rds: Accept peer connection reject messages due to incompatible version Prior to commit d021fabf525ff ("rds: rdma: add consumer reject") function "rds_rdma_cm_event_handler_cmn" would always honor a rejected connection attempt by issuing a "rds_conn_drop". The commit mentioned above added a "break", eliminating the "fallthrough" case and made the "rds_conn_drop" rather conditional: Now it only happens if a "consumer defined" reject (i.e. "rdma_reject") carries an integer-value of "1" inside "private_data": if (!conn) break; err = (int *)rdma_consumer_reject_data(cm_id, event, &len); if (!err || (err && ((*err) == RDS_RDMA_REJ_INCOMPAT))) { pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n", &conn->c_laddr, &conn->c_faddr); conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION; rds_conn_drop(conn); } rdsdebug("Connection rejected: %s\n", rdma_reject_msg(cm_id, event->status)); break; /* FALLTHROUGH */ A number of issues are worth mentioning here: #1) Previous versions of the RDS code simply rejected a connection by calling "rdma_reject(cm_id, NULL, 0);" So the value of the payload in "private_data" will not be "1", but "0". #2) Now the code has become dependent on host byte order and sizing. If one peer is big-endian, the other is little-endian, or there's a difference in sizeof(int) (e.g. ILP64 vs LP64), the *err check does not work as intended. #3) There is no check for "len" to see if the data behind *err is even valid. Luckily, it appears that the "rdma_reject(cm_id, NULL, 0)" will always carry 148 bytes of zeroized payload. But that should probably not be relied upon here. #4) With the added "break;", we might as well drop the misleading "/* FALLTHROUGH */" comment. This commit does _not_ address issue #2, as the sender would have to agree on a byte order as well. Here is the sequence of messages in this observed error-scenario: Host-A is pre-QoS changes (excluding the commit mentioned above) Host-B is post-QoS changes (including the commit mentioned above) #1 Host-B issues a connection request via function "rds_conn_path_transition" connection state transitions to "RDS_CONN_CONNECTING" #2 Host-A rejects the incompatible connection request (from #1) It does so by calling "rdma_reject(cm_id, NULL, 0);" #3 Host-B receives an "RDMA_CM_EVENT_REJECTED" event (from #2) But since the code is changed in the way described above, it won't drop the connection here, simply because "*err == 0". #4 Host-A issues a connection request #5 Host-B receives an "RDMA_CM_EVENT_CONNECT_REQUEST" event and ends up calling "rds_ib_cm_handle_connect". But since the state is already in "RDS_CONN_CONNECTING" (as of #1) it will end up issuing a "rdma_reject" without dropping the connection: if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { /* Wait and see - our connect may still be succeeding */ rds_ib_stats_inc(s_ib_connect_raced); } goto out; #6 Host-A receives an "RDMA_CM_EVENT_REJECTED" event (from #5), drops the connection and tries again (goto #4) until it gives up. Tested-by: Zhu Yanjun Signed-off-by: Gerd Rausch Signed-off-by: Santosh Shilimkar --- net/rds/rdma_transport.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 46bce8389066..9db455d02255 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -112,7 +112,9 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, if (!conn) break; err = (int *)rdma_consumer_reject_data(cm_id, event, &len); - if (!err || (err && ((*err) == RDS_RDMA_REJ_INCOMPAT))) { + if (!err || + (err && len >= sizeof(*err) && + ((*err) <= RDS_RDMA_REJ_INCOMPAT))) { pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n", &conn->c_laddr, &conn->c_faddr); conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION; @@ -122,7 +124,6 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, rdsdebug("Connection rejected: %s\n", rdma_reject_msg(cm_id, event->status)); break; - /* FALLTHROUGH */ case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: -- cgit v1.2.3 From fc640d4cbe268479ba448542e2b17045c908b2b3 Mon Sep 17 00:00:00 2001 From: Gerd Rausch Date: Fri, 28 Jun 2019 17:41:16 -0700 Subject: rds: Return proper "tos" value to user-space The proper "tos" value needs to be returned to user-space (sockopt RDS_INFO_CONNECTIONS). Fixes: 3eb450367d08 ("rds: add type of service(tos) infrastructure") Signed-off-by: Gerd Rausch Reviewed-by: Zhu Yanjun Signed-off-by: Santosh Shilimkar --- net/rds/connection.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 7ea134f9a825..ed7f2133acc2 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -736,6 +736,7 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) cinfo->next_rx_seq = cp->cp_next_rx_seq; cinfo->laddr = conn->c_laddr.s6_addr32[3]; cinfo->faddr = conn->c_faddr.s6_addr32[3]; + cinfo->tos = conn->c_tos; strncpy(cinfo->transport, conn->c_trans->t_name, sizeof(cinfo->transport)); cinfo->flags = 0; -- cgit v1.2.3 From dc205a8d34228809dedab94a85a866cbb255248f Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Fri, 28 Jun 2019 15:38:58 -0700 Subject: rds: avoid version downgrade to legitimate newer peer connections Connections with legitimate tos values can get into usual connection race. It can result in consumer reject. We don't want tos value or protocol version to be demoted for such connections otherwise piers would end up different tos values which can results in no connection. Example a peer initiated connection with say tos 8 while usual connection racing can get downgraded to tos 0 which is not desirable. Patch fixes above issue introduced by commit commit d021fabf525f ("rds: rdma: add consumer reject") Reported-by: Yanjun Zhu Tested-by: Yanjun Zhu Signed-off-by: Santosh Shilimkar --- net/rds/rdma_transport.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 9db455d02255..ff74c4bbb9fc 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -117,8 +117,10 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, ((*err) <= RDS_RDMA_REJ_INCOMPAT))) { pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n", &conn->c_laddr, &conn->c_faddr); - conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION; - conn->c_tos = 0; + + if (!conn->c_tos) + conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION; + rds_conn_drop(conn); } rdsdebug("Connection rejected: %s\n", -- cgit v1.2.3