From c95b708d5fa65b4e51f088ee077d127fd5a57b70 Mon Sep 17 00:00:00 2001 From: Nick Bowler Date: Sat, 28 Mar 2020 01:09:09 -0400 Subject: nvme: fix compat address handling in several ioctls On a 32-bit kernel, the upper bits of userspace addresses passed via various ioctls are silently ignored by the nvme driver. However on a 64-bit kernel running a compat task, these upper bits are not ignored and are in fact required to be zero for the ioctls to work. Unfortunately, this difference matters. 32-bit smartctl submits the NVME_IOCTL_ADMIN_CMD ioctl with garbage in these upper bits because it seems the pointer value it puts into the nvme_passthru_cmd structure is sign extended. This works fine on 32-bit kernels but fails on a 64-bit one because (at least on my setup) the addresses smartctl uses are consistently above 2G. For example: # smartctl -x /dev/nvme0n1 smartctl 7.1 2019-12-30 r5022 [x86_64-linux-5.5.11] (local build) Copyright (C) 2002-19, Bruce Allen, Christian Franke, www.smartmontools.org Read NVMe Identify Controller failed: NVME_IOCTL_ADMIN_CMD: Bad address Since changing 32-bit kernels to actually check all of the submitted address bits now would break existing userspace, this patch fixes the compat problem by explicitly zeroing the upper bits in the compat case. This enables 32-bit smartctl to work on a 64-bit kernel. Signed-off-by: Nick Bowler Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4f907e3beda1..2db8563aeb2d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -1252,6 +1253,18 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl) queue_work(nvme_wq, &ctrl->async_event_work); } +/* + * Convert integer values from ioctl structures to user pointers, silently + * ignoring the upper bits in the compat case to match behaviour of 32-bit + * kernels. + */ +static void __user *nvme_to_user_ptr(uintptr_t ptrval) +{ + if (in_compat_syscall()) + ptrval = (compat_uptr_t)ptrval; + return (void __user *)ptrval; +} + static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_user_io io; @@ -1275,7 +1288,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) length = (io.nblocks + 1) << ns->lba_shift; meta_len = (io.nblocks + 1) * ns->ms; - metadata = (void __user *)(uintptr_t)io.metadata; + metadata = nvme_to_user_ptr(io.metadata); if (ns->ext) { length += meta_len; @@ -1298,7 +1311,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.appmask = cpu_to_le16(io.appmask); return nvme_submit_user_cmd(ns->queue, &c, - (void __user *)(uintptr_t)io.addr, length, + nvme_to_user_ptr(io.addr), length, metadata, meta_len, lower_32_bits(io.slba), NULL, 0); } @@ -1418,9 +1431,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? 
ns->queue : ctrl->admin_q, &c, - (void __user *)(uintptr_t)cmd.addr, cmd.data_len, - (void __user *)(uintptr_t)cmd.metadata, - cmd.metadata_len, 0, &result, timeout); + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, + 0, &result, timeout); nvme_passthru_end(ctrl, effects); if (status >= 0) { @@ -1465,8 +1478,8 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - (void __user *)(uintptr_t)cmd.addr, cmd.data_len, - (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len, + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, 0, &cmd.result, timeout); nvme_passthru_end(ctrl, effects); -- cgit v1.2.3 From 38803fcffb5baf40cd403c1bd980f22308aefee8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 18 Mar 2020 14:41:12 -0700 Subject: nvme-fcloop: fix deallocation of working context There's been a longstanding bug of LS completions which freed ls ops, particularly the disconnect LS, while executing on a work context that is in the memory being free. Not a good thing to do. Rework LS handling to make callbacks in the rport context rather than the ls_request context. Signed-off-by: James Smart Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 76 ++++++++++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 24 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1c50af6219f3..9861fcea39f6 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -198,10 +198,13 @@ struct fcloop_lport_priv { }; struct fcloop_rport { - struct nvme_fc_remote_port *remoteport; - struct nvmet_fc_target_port *targetport; - struct fcloop_nport *nport; - struct fcloop_lport *lport; + struct nvme_fc_remote_port *remoteport; + struct nvmet_fc_target_port *targetport; + struct fcloop_nport *nport; + struct fcloop_lport *lport; + spinlock_t lock; + struct list_head ls_list; + struct work_struct ls_work; }; struct fcloop_tport { @@ -224,11 +227,10 @@ struct fcloop_nport { }; struct fcloop_lsreq { - struct fcloop_tport *tport; struct nvmefc_ls_req *lsreq; - struct work_struct work; struct nvmefc_tgt_ls_req tgt_ls_req; int status; + struct list_head ls_list; /* fcloop_rport->ls_list */ }; struct fcloop_rscn { @@ -292,21 +294,32 @@ fcloop_delete_queue(struct nvme_fc_local_port *localport, { } - -/* - * Transmit of LS RSP done (e.g. buffers all set). call back up - * initiator "done" flows. - */ static void -fcloop_tgt_lsrqst_done_work(struct work_struct *work) +fcloop_rport_lsrqst_work(struct work_struct *work) { - struct fcloop_lsreq *tls_req = - container_of(work, struct fcloop_lsreq, work); - struct fcloop_tport *tport = tls_req->tport; - struct nvmefc_ls_req *lsreq = tls_req->lsreq; + struct fcloop_rport *rport = + container_of(work, struct fcloop_rport, ls_work); + struct fcloop_lsreq *tls_req; - if (!tport || tport->remoteport) - lsreq->done(lsreq, tls_req->status); + spin_lock(&rport->lock); + for (;;) { + tls_req = list_first_entry_or_null(&rport->ls_list, + struct fcloop_lsreq, ls_list); + if (!tls_req) + break; + + list_del(&tls_req->ls_list); + spin_unlock(&rport->lock); + + tls_req->lsreq->done(tls_req->lsreq, tls_req->status); + /* + * callee may free memory containing tls_req. + * do not reference lsreq after this. 
+ */ + + spin_lock(&rport->lock); + } + spin_unlock(&rport->lock); } static int @@ -319,17 +332,18 @@ fcloop_ls_req(struct nvme_fc_local_port *localport, int ret = 0; tls_req->lsreq = lsreq; - INIT_WORK(&tls_req->work, fcloop_tgt_lsrqst_done_work); + INIT_LIST_HEAD(&tls_req->ls_list); if (!rport->targetport) { tls_req->status = -ECONNREFUSED; - tls_req->tport = NULL; - schedule_work(&tls_req->work); + spin_lock(&rport->lock); + list_add_tail(&rport->ls_list, &tls_req->ls_list); + spin_unlock(&rport->lock); + schedule_work(&rport->ls_work); return ret; } tls_req->status = 0; - tls_req->tport = rport->targetport->private; ret = nvmet_fc_rcv_ls_req(rport->targetport, &tls_req->tgt_ls_req, lsreq->rqstaddr, lsreq->rqstlen); @@ -337,18 +351,28 @@ fcloop_ls_req(struct nvme_fc_local_port *localport, } static int -fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport, +fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *targetport, struct nvmefc_tgt_ls_req *tgt_lsreq) { struct fcloop_lsreq *tls_req = tgt_ls_req_to_lsreq(tgt_lsreq); struct nvmefc_ls_req *lsreq = tls_req->lsreq; + struct fcloop_tport *tport = targetport->private; + struct nvme_fc_remote_port *remoteport = tport->remoteport; + struct fcloop_rport *rport; memcpy(lsreq->rspaddr, tgt_lsreq->rspbuf, ((lsreq->rsplen < tgt_lsreq->rsplen) ? lsreq->rsplen : tgt_lsreq->rsplen)); + tgt_lsreq->done(tgt_lsreq); - schedule_work(&tls_req->work); + if (remoteport) { + rport = remoteport->private; + spin_lock(&rport->lock); + list_add_tail(&rport->ls_list, &tls_req->ls_list); + spin_unlock(&rport->lock); + schedule_work(&rport->ls_work); + } return 0; } @@ -834,6 +858,7 @@ fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport) { struct fcloop_rport *rport = remoteport->private; + flush_work(&rport->ls_work); fcloop_nport_put(rport->nport); } @@ -1136,6 +1161,9 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr, rport->nport = nport; rport->lport = nport->lport; nport->rport = rport; + spin_lock_init(&rport->lock); + INIT_WORK(&rport->ls_work, fcloop_rport_lsrqst_work); + INIT_LIST_HEAD(&rport->ls_list); return count; } -- cgit v1.2.3 From a62315b83664dc9a7a6c6170ba2d174bb9fbed3c Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 31 Mar 2020 15:46:33 +0300 Subject: nvme-rdma: Replace comma with a semicolon Use a semicolon at the end of an assignment expression. 
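The stray comma happened to be harmless here, because the comma operator still evaluates both assignments, but the pattern is fragile and easy to misread. A minimal standalone C sketch (not driver code) of how the same slip silently changes control flow once a conditional is involved:

#include <stdio.h>

int main(void)
{
	int length = 0, lkey = 0;

	if (0)
		length = 64,	/* comma operator: the next line is still part of this statement */
		lkey = 5;	/* skipped along with the assignment above */
	printf("length=%d lkey=%d\n", length, lkey);	/* prints 0 0 */

	if (0)
		length = 64;	/* semicolon: the statement ends here */
	lkey = 5;		/* always runs */
	printf("length=%d lkey=%d\n", length, lkey);	/* prints 0 5 */
	return 0;
}

In the hunk below both assignments sit outside any conditional, so behaviour was unchanged; the fix is purely about not leaving that trap in place.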
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 86603d9b0cef..f704e7227d05 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1350,7 +1350,7 @@ static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, int ret; sge->addr = qe->dma; - sge->length = sizeof(struct nvme_command), + sge->length = sizeof(struct nvme_command); sge->lkey = queue->device->pd->local_dma_lkey; wr.next = NULL; -- cgit v1.2.3 From d038dd815fc56cd77ae8a51bb6d1d11e3aab9609 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 18 Mar 2020 14:40:43 -0700 Subject: nvmet-fc: fix typo in comment Fix typo in comment: about should be abort Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Chiatanya Kulkarni Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke --- drivers/nvme/target/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index a0db6371b43e..a8ceb7721640 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -684,7 +684,7 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) disconnect = atomic_xchg(&queue->connected, 0); spin_lock_irqsave(&queue->qlock, flags); - /* about outstanding io's */ + /* abort outstanding io's */ for (i = 0; i < queue->sqsize; fod++, i++) { if (fod->active) { spin_lock(&fod->flock); -- cgit v1.2.3 From 25e5cb780e62bde432b401f312bb847edc78b432 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 23 Mar 2020 15:06:30 -0700 Subject: nvme-tcp: fix possible crash in write_zeroes processing We cannot look at blk_rq_payload_bytes without first checking that the request has a mappable physical segments first (e.g. blk_rq_nr_phys_segments(rq) != 0) and only then to take the request payload bytes. This caused us to send a wrong sgl to the target or even dereference a non-existing buffer in case we actually got to the data send sequence (if it was in-capsule). 
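A minimal sketch of the rule the patch applies, using the two block-layer helpers that already appear in the hunks below (the wrapper name itself is hypothetical, for illustration only):

/* Only trust the payload byte count when the request actually has
 * mappable data segments; a Write Zeroes request advertises a length
 * in the command but carries no data to map, so treat it as zero. */
static unsigned int nvme_tcp_data_len(struct request *rq)
{
	if (!blk_rq_nr_phys_segments(rq))
		return 0;
	return blk_rq_payload_bytes(rq);
}
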
Reported-by: Tony Asleson Suggested-by: Chaitanya Kulkarni Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 0ef14f0fad86..aa754ae3ca08 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -174,16 +174,14 @@ static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req) static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req) { struct request *rq; - unsigned int bytes; if (unlikely(nvme_tcp_async_req(req))) return false; /* async events don't have a request */ rq = blk_mq_rq_from_pdu(req); - bytes = blk_rq_payload_bytes(rq); - return rq_data_dir(rq) == WRITE && bytes && - bytes <= nvme_tcp_inline_data_size(req->queue); + return rq_data_dir(rq) == WRITE && req->data_len && + req->data_len <= nvme_tcp_inline_data_size(req->queue); } static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req) @@ -2164,7 +2162,9 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, c->common.flags |= NVME_CMD_SGL_METABUF; - if (rq_data_dir(rq) == WRITE && req->data_len && + if (!blk_rq_nr_phys_segments(rq)) + nvme_tcp_set_sg_null(c); + else if (rq_data_dir(rq) == WRITE && req->data_len <= nvme_tcp_inline_data_size(queue)) nvme_tcp_set_sg_inline(queue, c, req->data_len); else @@ -2191,7 +2191,8 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, req->data_sent = 0; req->pdu_len = 0; req->pdu_sent = 0; - req->data_len = blk_rq_payload_bytes(rq); + req->data_len = blk_rq_nr_phys_segments(rq) ? + blk_rq_payload_bytes(rq) : 0; req->curr_bio = rq->bio; if (rq_data_dir(rq) == WRITE && -- cgit v1.2.3 From f86e5bf817a57c7e6538dafee2fc65a525bb9935 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 23 Mar 2020 16:43:52 -0700 Subject: nvme-tcp: don't poll a non-live queue In error recovery we might be removing the queue so check we can actually poll before we do. Reported-by: Mark Wunderlich Tested-by: Mark Wunderlich Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index aa754ae3ca08..eb31c689d2cf 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2299,6 +2299,9 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) struct nvme_tcp_queue *queue = hctx->driver_data; struct sock *sk = queue->sock->sk; + if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) + return 0; + if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) sk_busy_loop(sk, true); nvme_tcp_try_recv(queue); -- cgit v1.2.3 From 39d06079a50fe2a651091b38e311e605de0788cb Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 31 Mar 2020 22:44:23 -0700 Subject: nvme-tcp: fix possible crash in recv error flow If the target misbehaves and sends us unexpected payload we need to make sure to fail the controller and stop processing the input stream. We clear the rd_enabled flag and stop the io_work, but we may still requeue it if we still have pending sends and then in the next invocation we will process the input stream as the check is only in the .data_ready upcall. To fix this we need to make sure not to self-requeue io_work upon a recv flow error. 
This fixes the crash: nvme nvme2: receive failed: -22 BUG: unable to handle page fault for address: ffffbeb5816c3b48 nvme_ns_head_make_request: 29 callbacks suppressed block nvme0n5: no usable path - requeuing I/O block nvme0n5: no usable path - requeuing I/O block nvme0n7: no usable path - requeuing I/O block nvme0n7: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O block nvme0n7: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O block nvme0n3: no usable path - requeuing I/O #PF: supervisor read access inkernel mode #PF: error_code(0x0000) - not-present page PGD 1039157067 P4D 1039157067 PUD 103915a067 PMD 102719f067 PTE 0 Oops: 0000 [#1] SMP PTI CPU: 8 PID: 411 Comm: kworker/8:1H Not tainted 5.3.0-40-generic #32~18.04.1-Ubuntu Hardware name: Supermicro Super Server/X10SRi-F, BIOS 2.0 12/17/2015 Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] RIP: 0010:nvme_tcp_recv_skb+0x2ae/0xb50 [nvme_tcp] RSP: 0018:ffffbeb5806cfd10 EFLAGS: 00010246 RAX: ffffbeb5816c3b48 RBX: 00000000000003d0 RCX: 0000000000000008 RDX: 00000000000003d0 RSI: 0000000000000001 RDI: ffff9a3040684b40 RBP: ffffbeb5806cfd90 R08: 0000000000000000 R09: ffffffff946e6900 R10: ffffbeb5806cfce0 R11: 0000000000000001 R12: 0000000000000000 R13: ffff9a2ff86501c0 R14: 00000000000003d0 R15: ffff9a30b85f2798 FS: 0000000000000000(0000) GS:ffff9a30bf800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffbeb5816c3b48 CR3: 000000088400a006 CR4: 00000000003626e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcp_read_sock+0x8c/0x290 ? __release_sock+0x9d/0xe0 ? nvme_tcp_write_space+0xb0/0xb0 [nvme_tcp] nvme_tcp_io_work+0x4b4/0x830 [nvme_tcp] ? finish_task_switch+0x163/0x270 process_one_work+0x1fd/0x3f0 worker_thread+0x34/0x410 kthread+0x121/0x140 ? process_one_work+0x3f0/0x3f0 ? kthread_park+0xb0/0xb0 ret_from_fork+0x35/0x40 Reported-by: Roy Shterman Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index eb31c689d2cf..c15a92163c1f 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1073,7 +1073,7 @@ static void nvme_tcp_io_work(struct work_struct *w) if (result > 0) pending = true; else if (unlikely(result < 0)) - break; + return; if (!pending) return; -- cgit v1.2.3 From d866dbf6178713e37d2fec2870af00b345684e1a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jul 2019 10:37:22 -0700 Subject: blkcg: rename blkcg->cgwb_refcnt to ->online_pin and always use it blkcg->cgwb_refcnt is used to delay blkcg offlining so that blkgs don't get offlined while there are active cgwbs on them. However, it ends up making offlining unordered sometimes causing parents to be offlined before children. To fix it, we want child blkcgs to pin the parents' online states turning the refcnt into a more generic online pinning mechanism. 
In prepartion, * blkcg->cgwb_refcnt -> blkcg->online_pin * blkcg_cgwb_get/put() -> blkcg_pin/unpin_online() * Take them out of CONFIG_CGROUP_WRITEBACK Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 6 +++--- include/linux/blk-cgroup.h | 39 +++++++++++++-------------------------- mm/backing-dev.c | 6 +++--- 3 files changed, 19 insertions(+), 32 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c15a26096038..2acef6a64954 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -883,8 +883,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) /* this prevents anyone from attaching or migrating to this blkcg */ wb_blkcg_offline(blkcg); - /* put the base cgwb reference allowing step 2 to be triggered */ - blkcg_cgwb_put(blkcg); + /* put the base online pin allowing step 2 to be triggered */ + blkcg_unpin_online(blkcg); } /** @@ -983,11 +983,11 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) } spin_lock_init(&blkcg->lock); + refcount_set(&blkcg->online_pin, 1); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); - refcount_set(&blkcg->cgwb_refcnt, 1); #endif list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index e4a6949fd171..7fb7caa55a3d 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -46,6 +46,7 @@ struct blkcg_gq; struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; + refcount_t online_pin; struct radix_tree_root blkg_tree; struct blkcg_gq __rcu *blkg_hint; @@ -56,7 +57,6 @@ struct blkcg { struct list_head all_blkcgs_node; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; - refcount_t cgwb_refcnt; #endif }; @@ -412,47 +412,34 @@ static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) extern void blkcg_destroy_blkgs(struct blkcg *blkcg); -#ifdef CONFIG_CGROUP_WRITEBACK - /** - * blkcg_cgwb_get - get a reference for blkcg->cgwb_list + * blkcg_pin_online - pin online state * @blkcg: blkcg of interest * - * This is used to track the number of active wb's related to a blkcg. + * While pinned, a blkcg is kept online. This is primarily used to + * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline + * while an associated cgwb is still active. */ -static inline void blkcg_cgwb_get(struct blkcg *blkcg) +static inline void blkcg_pin_online(struct blkcg *blkcg) { - refcount_inc(&blkcg->cgwb_refcnt); + refcount_inc(&blkcg->online_pin); } /** - * blkcg_cgwb_put - put a reference for @blkcg->cgwb_list + * blkcg_unpin_online - unpin online state * @blkcg: blkcg of interest * - * This is used to track the number of active wb's related to a blkcg. - * When this count goes to zero, all active wb has finished so the + * This is primarily used to impedance-match blkg and cgwb lifetimes so + * that blkg doesn't go offline while an associated cgwb is still active. + * When this count goes to zero, all active cgwbs have finished so the * blkcg can continue destruction by calling blkcg_destroy_blkgs(). - * This work may occur in cgwb_release_workfn() on the cgwb_release - * workqueue. 
*/ -static inline void blkcg_cgwb_put(struct blkcg *blkcg) +static inline void blkcg_unpin_online(struct blkcg *blkcg) { - if (refcount_dec_and_test(&blkcg->cgwb_refcnt)) + if (refcount_dec_and_test(&blkcg->online_pin)) blkcg_destroy_blkgs(blkcg); } -#else - -static inline void blkcg_cgwb_get(struct blkcg *blkcg) { } - -static inline void blkcg_cgwb_put(struct blkcg *blkcg) -{ - /* wb isn't being accounted, so trigger destruction right away */ - blkcg_destroy_blkgs(blkcg); -} - -#endif - /** * blkg_path - format cgroup path of blkg * @blkg: blkg of interest diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 62f05f605fb5..c81b4f3a7268 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -491,8 +491,8 @@ static void cgwb_release_workfn(struct work_struct *work) css_put(wb->blkcg_css); mutex_unlock(&wb->bdi->cgwb_release_mutex); - /* triggers blkg destruction if cgwb_refcnt becomes zero */ - blkcg_cgwb_put(blkcg); + /* triggers blkg destruction if no online users left */ + blkcg_unpin_online(blkcg); fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); @@ -592,7 +592,7 @@ static int cgwb_create(struct backing_dev_info *bdi, list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); - blkcg_cgwb_get(blkcg); + blkcg_pin_online(blkcg); css_get(memcg_css); css_get(blkcg_css); } -- cgit v1.2.3 From 4308a434e5e08c78676aa66bc626ef78cbef0883 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jul 2019 10:37:55 -0700 Subject: blkcg: don't offline parent blkcg first blkcg->cgwb_refcnt is used to delay blkcg offlining so that blkgs don't get offlined while there are active cgwbs on them. However, it ends up making offlining unordered sometimes causing parents to be offlined before children. Let's fix this by making child blkcgs pin the parents' online states. Note that pin/unpin names are chosen over get/put intentionally because css uses get/put online for something different. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 16 ++++++++++++++++ include/linux/blk-cgroup.h | 6 +++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2acef6a64954..c5dc833212e1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1006,6 +1006,21 @@ unlock: return ret; } +static int blkcg_css_online(struct cgroup_subsys_state *css) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg *parent = blkcg_parent(blkcg); + + /* + * blkcg_pin_online() is used to delay blkcg offline so that blkgs + * don't go offline while cgwbs are still active on them. Pin the + * parent so that offline always happens towards the root. 
+ */ + if (parent) + blkcg_pin_online(parent); + return 0; +} + /** * blkcg_init_queue - initialize blkcg part of request queue * @q: request_queue to initialize @@ -1199,6 +1214,7 @@ static void blkcg_exit(struct task_struct *tsk) struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, + .css_online = blkcg_css_online, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 7fb7caa55a3d..35f8ffe92b70 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -436,8 +436,12 @@ static inline void blkcg_pin_online(struct blkcg *blkcg) */ static inline void blkcg_unpin_online(struct blkcg *blkcg) { - if (refcount_dec_and_test(&blkcg->online_pin)) + do { + if (!refcount_dec_and_test(&blkcg->online_pin)) + break; blkcg_destroy_blkgs(blkcg); + blkcg = blkcg_parent(blkcg); + } while (blkcg); } /** -- cgit v1.2.3 From 74e4d20e2f43cf09a35543d960ac8f7a1ffcbbb5 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 1 Apr 2020 22:44:43 -0700 Subject: nvme: inherit stable pages constraint in the mpath stack device If the backing device require stable pages, we need to set it on the stack mpath device as well. This applies to rdma/fc transports when doing data integrity and tcp transport calculating digests. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2db8563aeb2d..91c1bd659947 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1897,6 +1897,13 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->head->disk) { nvme_update_disk_info(ns->head->disk, ns, id); blk_queue_stack_limits(ns->head->disk->queue, ns->queue); + if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) { + struct backing_dev_info *info = + ns->head->disk->queue->backing_dev_info; + + info->capabilities |= BDI_CAP_STABLE_WRITES; + } + revalidate_disk(ns->head->disk); } #endif -- cgit v1.2.3 From f0e656e4f253120eb871a53ffab7664530c1d9f4 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 1 Apr 2020 16:16:27 -0700 Subject: nvmet: fix NULL dereference when removing a referral When item release is called, the parent is already null. We need the parent to pass to nvmet_referral_disable so hook it up to ->disconnect_notify. 
Reported-by: Tony Asleson Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 7aa10788b7c8..58cabd7b6fc5 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1098,12 +1098,19 @@ static struct configfs_attribute *nvmet_referral_attrs[] = { NULL, }; -static void nvmet_referral_release(struct config_item *item) +static void nvmet_referral_notify(struct config_group *group, + struct config_item *item) { struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent); struct nvmet_port *port = to_nvmet_port(item); nvmet_referral_disable(parent, port); +} + +static void nvmet_referral_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + kfree(port); } @@ -1134,6 +1141,7 @@ static struct config_group *nvmet_referral_make( static struct configfs_group_operations nvmet_referral_group_ops = { .make_group = nvmet_referral_make, + .disconnect_notify = nvmet_referral_notify, }; static const struct config_item_type nvmet_referrals_type = { -- cgit v1.2.3 From 8cd55087dc45b2e1a73ed2a197cbf405f32deb08 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 3 Apr 2020 16:43:03 +0200 Subject: loop: Report EOPNOTSUPP properly Properly plumb out EOPNOTSUPP from loop driver operations, which may get returned when for instance a discard operation is attempted but not supported by the underlying block device. Before this change, everything was reported in the log as an I/O error, which is scary and not helpful in debugging. Signed-off-by: Evan Green Reviewed-by: Gwendal Grignou Reviewed-by: Bart Van Assche Signed-off-by: Andrzej Pietrasiewicz Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/loop.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a42c49e04954..04cbe951862d 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -463,7 +463,7 @@ static void lo_complete_rq(struct request *rq) if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) || req_op(rq) != REQ_OP_READ) { if (cmd->ret < 0) - ret = BLK_STS_IOERR; + ret = errno_to_blk_status(cmd->ret); goto end_io; } @@ -1955,7 +1955,10 @@ static void loop_handle_cmd(struct loop_cmd *cmd) failed: /* complete non-aio request */ if (!cmd->use_aio || ret) { - cmd->ret = ret ? -EIO : 0; + if (ret == -EOPNOTSUPP) + cmd->ret = ret; + else + cmd->ret = ret ? -EIO : 0; blk_mq_complete_request(rq); } } -- cgit v1.2.3 From c52abf563049e787c1341cdf15c7dbe1bfbc951b Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 3 Apr 2020 16:43:04 +0200 Subject: loop: Better discard support for block devices If the backing device for a loop device is itself a block device, then mirror the "write zeroes" capabilities of the underlying block device into the loop device. Copy this capability into both max_write_zeroes_sectors and max_discard_sectors of the loop device. The reason for this is that REQ_OP_DISCARD on a loop device translates into blkdev_issue_zeroout(), rather than blkdev_issue_discard(). This presents a consistent interface for loop devices (that discarded data is zeroed), regardless of the backing device type of the loop device. There should be no behavior change for loop devices backed by regular files. 
This change fixes blktest block/003, and removes an extraneous error print in block/013 when testing on a loop device backed by a block device that does not support discard. Signed-off-by: Evan Green Reviewed-by: Gwendal Grignou Reviewed-by: Chaitanya Kulkarni [used updated version of Evan's comment in loop_config_discard()] [moved backingq to local scope, removed redundant braces] Signed-off-by: Andrzej Pietrasiewicz Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/loop.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 04cbe951862d..da693e6a834e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -429,11 +429,12 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, * information. */ struct file *file = lo->lo_backing_file; + struct request_queue *q = lo->lo_queue; int ret; mode |= FALLOC_FL_KEEP_SIZE; - if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) { + if (!blk_queue_discard(q)) { ret = -EOPNOTSUPP; goto out; } @@ -867,28 +868,47 @@ static void loop_config_discard(struct loop_device *lo) struct inode *inode = file->f_mapping->host; struct request_queue *q = lo->lo_queue; + /* + * If the backing device is a block device, mirror its zeroing + * capability. Set the discard sectors to the block device's zeroing + * capabilities because loop discards result in blkdev_issue_zeroout(), + * not blkdev_issue_discard(). This maintains consistent behavior with + * file-backed loop devices: discarded regions read back as zero. + */ + if (S_ISBLK(inode->i_mode) && !lo->lo_encrypt_key_size) { + struct request_queue *backingq; + + backingq = bdev_get_queue(inode->i_bdev); + blk_queue_max_discard_sectors(q, + backingq->limits.max_write_zeroes_sectors); + + blk_queue_max_write_zeroes_sectors(q, + backingq->limits.max_write_zeroes_sectors); + /* * We use punch hole to reclaim the free space used by the * image a.k.a. discard. However we do not support discard if * encryption is enabled, because it may give an attacker * useful information. */ - if ((!file->f_op->fallocate) || - lo->lo_encrypt_key_size) { + } else if (!file->f_op->fallocate || lo->lo_encrypt_key_size) { q->limits.discard_granularity = 0; q->limits.discard_alignment = 0; blk_queue_max_discard_sectors(q, 0); blk_queue_max_write_zeroes_sectors(q, 0); - blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); - return; - } - q->limits.discard_granularity = inode->i_sb->s_blocksize; - q->limits.discard_alignment = 0; + } else { + q->limits.discard_granularity = inode->i_sb->s_blocksize; + q->limits.discard_alignment = 0; - blk_queue_max_discard_sectors(q, UINT_MAX >> 9); - blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9); - blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); + blk_queue_max_discard_sectors(q, UINT_MAX >> 9); + blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9); + } + + if (q->limits.max_write_zeroes_sectors) + blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); + else + blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); } static void loop_unprepare_queue(struct loop_device *lo) -- cgit v1.2.3 From a032e4f6d60d0aca4f6570d2ad33105a2b9ba385 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 2 Apr 2020 08:48:53 -0700 Subject: nvmet-rdma: fix bonding failover possible NULL deref RDMA_CM_EVENT_ADDR_CHANGE event occur in the case of bonding failover on normal as well as on listening cm_ids. 
Hence this event will immediately trigger a NULL dereference trying to disconnect a queue for a cm_id that actually belongs to the port. To fix this we provide a different handler for the listener cm_ids that will defer a work to disable+(re)enable the port which essentially destroys and setups another listener cm_id Reported-by: Alex Lyakas Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Tested-by: Alex Lyakas Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 175 ++++++++++++++++++++++++++++++--------------- 1 file changed, 119 insertions(+), 56 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 9e1b8c61f54e..f78201421978 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -105,6 +105,13 @@ struct nvmet_rdma_queue { struct list_head queue_list; }; +struct nvmet_rdma_port { + struct nvmet_port *nport; + struct sockaddr_storage addr; + struct rdma_cm_id *cm_id; + struct delayed_work repair_work; +}; + struct nvmet_rdma_device { struct ib_device *device; struct ib_pd *pd; @@ -917,7 +924,8 @@ static void nvmet_rdma_free_dev(struct kref *ref) static struct nvmet_rdma_device * nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) { - struct nvmet_port *port = cm_id->context; + struct nvmet_rdma_port *port = cm_id->context; + struct nvmet_port *nport = port->nport; struct nvmet_rdma_device *ndev; int inline_page_count; int inline_sge_count; @@ -934,17 +942,17 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) if (!ndev) goto out_err; - inline_page_count = num_pages(port->inline_data_size); + inline_page_count = num_pages(nport->inline_data_size); inline_sge_count = max(cm_id->device->attrs.max_sge_rd, cm_id->device->attrs.max_recv_sge) - 1; if (inline_page_count > inline_sge_count) { pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n", - port->inline_data_size, cm_id->device->name, + nport->inline_data_size, cm_id->device->name, inline_sge_count * PAGE_SIZE); - port->inline_data_size = inline_sge_count * PAGE_SIZE; + nport->inline_data_size = inline_sge_count * PAGE_SIZE; inline_page_count = inline_sge_count; } - ndev->inline_data_size = port->inline_data_size; + ndev->inline_data_size = nport->inline_data_size; ndev->inline_page_count = inline_page_count; ndev->device = cm_id->device; kref_init(&ndev->ref); @@ -1272,6 +1280,7 @@ static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id, static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { + struct nvmet_rdma_port *port = cm_id->context; struct nvmet_rdma_device *ndev; struct nvmet_rdma_queue *queue; int ret = -EINVAL; @@ -1287,7 +1296,7 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, ret = -ENOMEM; goto put_device; } - queue->port = cm_id->context; + queue->port = port->nport; if (queue->host_qid == 0) { /* Let inflight controller teardown complete */ @@ -1412,7 +1421,7 @@ static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, struct nvmet_rdma_queue *queue) { - struct nvmet_port *port; + struct nvmet_rdma_port *port; if (queue) { /* @@ -1431,7 +1440,7 @@ static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id, * cm_id destroy. use atomic xchg to make sure * we don't compete with remove_port. 
*/ - if (xchg(&port->priv, NULL) != cm_id) + if (xchg(&port->cm_id, NULL) != cm_id) return 0; /* @@ -1462,6 +1471,13 @@ static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, nvmet_rdma_queue_established(queue); break; case RDMA_CM_EVENT_ADDR_CHANGE: + if (!queue) { + struct nvmet_rdma_port *port = cm_id->context; + + schedule_delayed_work(&port->repair_work, 0); + break; + } + /* FALLTHROUGH */ case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_TIMEWAIT_EXIT: nvmet_rdma_queue_disconnect(queue); @@ -1504,42 +1520,19 @@ restart: mutex_unlock(&nvmet_rdma_queue_mutex); } -static int nvmet_rdma_add_port(struct nvmet_port *port) +static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port) { - struct rdma_cm_id *cm_id; - struct sockaddr_storage addr = { }; - __kernel_sa_family_t af; - int ret; - - switch (port->disc_addr.adrfam) { - case NVMF_ADDR_FAMILY_IP4: - af = AF_INET; - break; - case NVMF_ADDR_FAMILY_IP6: - af = AF_INET6; - break; - default: - pr_err("address family %d not supported\n", - port->disc_addr.adrfam); - return -EINVAL; - } + struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL); - if (port->inline_data_size < 0) { - port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; - } else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { - pr_warn("inline_data_size %u is too large, reducing to %u\n", - port->inline_data_size, - NVMET_RDMA_MAX_INLINE_DATA_SIZE); - port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; - } + if (cm_id) + rdma_destroy_id(cm_id); +} - ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr, - port->disc_addr.trsvcid, &addr); - if (ret) { - pr_err("malformed ip/port passed: %s:%s\n", - port->disc_addr.traddr, port->disc_addr.trsvcid); - return ret; - } +static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port) +{ + struct sockaddr *addr = (struct sockaddr *)&port->addr; + struct rdma_cm_id *cm_id; + int ret; cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port, RDMA_PS_TCP, IB_QPT_RC); @@ -1558,23 +1551,19 @@ static int nvmet_rdma_add_port(struct nvmet_port *port) goto out_destroy_id; } - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr); + ret = rdma_bind_addr(cm_id, addr); if (ret) { - pr_err("binding CM ID to %pISpcs failed (%d)\n", - (struct sockaddr *)&addr, ret); + pr_err("binding CM ID to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; } ret = rdma_listen(cm_id, 128); if (ret) { - pr_err("listening to %pISpcs failed (%d)\n", - (struct sockaddr *)&addr, ret); + pr_err("listening to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; } - pr_info("enabling port %d (%pISpcs)\n", - le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr); - port->priv = cm_id; + port->cm_id = cm_id; return 0; out_destroy_id: @@ -1582,18 +1571,92 @@ out_destroy_id: return ret; } -static void nvmet_rdma_remove_port(struct nvmet_port *port) +static void nvmet_rdma_repair_port_work(struct work_struct *w) { - struct rdma_cm_id *cm_id = xchg(&port->priv, NULL); + struct nvmet_rdma_port *port = container_of(to_delayed_work(w), + struct nvmet_rdma_port, repair_work); + int ret; - if (cm_id) - rdma_destroy_id(cm_id); + nvmet_rdma_disable_port(port); + ret = nvmet_rdma_enable_port(port); + if (ret) + schedule_delayed_work(&port->repair_work, 5 * HZ); +} + +static int nvmet_rdma_add_port(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port; + __kernel_sa_family_t af; + int ret; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + nport->priv = port; + port->nport = 
nport; + INIT_DELAYED_WORK(&port->repair_work, nvmet_rdma_repair_port_work); + + switch (nport->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + af = AF_INET; + break; + case NVMF_ADDR_FAMILY_IP6: + af = AF_INET6; + break; + default: + pr_err("address family %d not supported\n", + nport->disc_addr.adrfam); + ret = -EINVAL; + goto out_free_port; + } + + if (nport->inline_data_size < 0) { + nport->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE; + } else if (nport->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) { + pr_warn("inline_data_size %u is too large, reducing to %u\n", + nport->inline_data_size, + NVMET_RDMA_MAX_INLINE_DATA_SIZE); + nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; + } + + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, + nport->disc_addr.trsvcid, &port->addr); + if (ret) { + pr_err("malformed ip/port passed: %s:%s\n", + nport->disc_addr.traddr, nport->disc_addr.trsvcid); + goto out_free_port; + } + + ret = nvmet_rdma_enable_port(port); + if (ret) + goto out_free_port; + + pr_info("enabling port %d (%pISpcs)\n", + le16_to_cpu(nport->disc_addr.portid), + (struct sockaddr *)&port->addr); + + return 0; + +out_free_port: + kfree(port); + return ret; +} + +static void nvmet_rdma_remove_port(struct nvmet_port *nport) +{ + struct nvmet_rdma_port *port = nport->priv; + + cancel_delayed_work_sync(&port->repair_work); + nvmet_rdma_disable_port(port); + kfree(port); } static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, - struct nvmet_port *port, char *traddr) + struct nvmet_port *nport, char *traddr) { - struct rdma_cm_id *cm_id = port->priv; + struct nvmet_rdma_port *port = nport->priv; + struct rdma_cm_id *cm_id = port->cm_id; if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) { struct nvmet_rdma_rsp *rsp = @@ -1603,7 +1666,7 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req, sprintf(traddr, "%pISc", addr); } else { - memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); + memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE); } } -- cgit v1.2.3 From 657f1975e9d9c880fa13030e88ba6cc84964f1db Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 2 Apr 2020 09:34:54 -0700 Subject: nvme: fix deadlock caused by ANA update wrong locking The deadlock combines 4 flows in parallel: - ns scanning (triggered from reconnect) - request timeout - ANA update (triggered from reconnect) - I/O coming into the mpath device (1) ns scanning triggers disk revalidation -> update disk info -> freeze queue -> but blocked, due to (2) (2) timeout handler reference the g_usage_counter - > but blocks in the transport .timeout() handler, due to (3) (3) the transport timeout handler (indirectly) calls nvme_stop_queue() -> which takes the (down_read) namespaces_rwsem - > but blocks, due to (4) (4) ANA update takes the (down_write) namespaces_rwsem -> calls nvme_mpath_set_live() -> which synchronize the ns_head srcu (see commit 504db087aacc) -> but blocks, due to (5) (5) I/O came into nvme_mpath_make_request -> took srcu_read_lock -> direct_make_request > blk_queue_enter -> but blocked, due to (1) ==> the request queue is under freeze -> deadlock. The fix is making ANA update take a read lock as the namespaces list is not manipulated, it is just the ns and ns->head that are being updated (which is protected with the ns->head lock). 
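A minimal userspace sketch of the locking rule the fix applies, with a POSIX rwlock standing in for the kernel's rw_semaphore (all names here are hypothetical): walks that only update per-entry state take the lock shared, and only code that adds or removes list entries takes it exclusive.

#include <pthread.h>
#include <stdio.h>

struct ns {
	int nsid;
	int ana_state;
	struct ns *next;
};

static pthread_rwlock_t namespaces_rwsem = PTHREAD_RWLOCK_INITIALIZER;
static struct ns *namespaces;

/* ANA-update style walk: list membership does not change, so a shared
 * (read) lock suffices and does not exclude other readers.  In the real
 * driver the per-namespace state change is serialized separately by the
 * ns->head lock; that detail is omitted here. */
static void update_ana_state(int nsid, int new_state)
{
	struct ns *ns;

	pthread_rwlock_rdlock(&namespaces_rwsem);
	for (ns = namespaces; ns; ns = ns->next)
		if (ns->nsid == nsid)
			ns->ana_state = new_state;
	pthread_rwlock_unlock(&namespaces_rwsem);
}

/* Namespace scanning adds entries, i.e. changes membership, so it is the
 * path that genuinely needs the exclusive (write) lock. */
static void add_namespace(struct ns *ns)
{
	pthread_rwlock_wrlock(&namespaces_rwsem);
	ns->next = namespaces;
	namespaces = ns;
	pthread_rwlock_unlock(&namespaces_rwsem);
}

int main(void)
{
	struct ns a = { .nsid = 1, .ana_state = 0, .next = NULL };

	add_namespace(&a);
	update_ana_state(1, 2);
	printf("nsid %d ana_state %d\n", a.nsid, a.ana_state);
	return 0;
}

This mirrors the hunk below: nvme_update_ana_state() only modifies per-namespace state, so down_read() on namespaces_rwsem is sufficient.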
Fixes: 0d0b660f214dc ("nvme: add ANA support") Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 61bf87592570..54603bd3e02d 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -510,7 +510,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, if (!nr_nsids) return 0; - down_write(&ctrl->namespaces_rwsem); + down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) { unsigned nsid = le32_to_cpu(desc->nsids[n]); @@ -521,7 +521,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, if (++n == nr_nsids) break; } - up_write(&ctrl->namespaces_rwsem); + up_read(&ctrl->namespaces_rwsem); return 0; } -- cgit v1.2.3 From 8c5c660529209a0e324c1c1a35ce3f83d67a2aa5 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 3 Apr 2020 07:33:20 -0700 Subject: nvme-fc: Revert "add module to ops template to allow module references" The original patch was to resolve the lldd being able to be unloaded while being used to talk to the boot device of the system. However, the end result of the original patch is that any driver unload while a nvme controller is live via the lldd is now being prohibited. Given the module reference, the module teardown routine can't be called, thus there's no way, other than manual actions to terminate the controllers. Fixes: 863fbae929c7 ("nvme_fc: add module to ops template to allow module references") Cc: # v5.4+ Signed-off-by: James Smart Reviewed-by: Himanshu Madhani Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 14 ++------------ drivers/nvme/target/fcloop.c | 1 - drivers/scsi/lpfc/lpfc_nvme.c | 2 -- drivers/scsi/qla2xxx/qla_nvme.c | 1 - include/linux/nvme-fc-driver.h | 4 ---- 5 files changed, 2 insertions(+), 20 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index a8bf2fb1287b..7dfc4a2ecf1e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -342,8 +342,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, !template->ls_req || !template->fcp_io || !template->ls_abort || !template->fcp_abort || !template->max_hw_queues || !template->max_sgl_segments || - !template->max_dif_sgl_segments || !template->dma_boundary || - !template->module) { + !template->max_dif_sgl_segments || !template->dma_boundary) { ret = -EINVAL; goto out_reghost_failed; } @@ -2016,7 +2015,6 @@ nvme_fc_ctrl_free(struct kref *ref) { struct nvme_fc_ctrl *ctrl = container_of(ref, struct nvme_fc_ctrl, ref); - struct nvme_fc_lport *lport = ctrl->lport; unsigned long flags; if (ctrl->ctrl.tagset) { @@ -2043,7 +2041,6 @@ nvme_fc_ctrl_free(struct kref *ref) if (ctrl->ctrl.opts) nvmf_free_options(ctrl->ctrl.opts); kfree(ctrl); - module_put(lport->ops->module); } static void @@ -3074,15 +3071,10 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto out_fail; } - if (!try_module_get(lport->ops->module)) { - ret = -EUNATCH; - goto out_free_ctrl; - } - idx = ida_simple_get(&nvme_fc_ctrl_cnt, 0, 0, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; - goto out_mod_put; + goto out_free_ctrl; } ctrl->ctrl.opts = opts; @@ -3232,8 +3224,6 @@ out_free_queues: out_free_ida: put_device(ctrl->dev); ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum); -out_mod_put: - module_put(lport->ops->module); out_free_ctrl: kfree(ctrl); out_fail: diff --git 
a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 9861fcea39f6..f69ce66e2d44 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -875,7 +875,6 @@ fcloop_targetport_delete(struct nvmet_fc_target_port *targetport) #define FCLOOP_DMABOUND_4G 0xFFFFFFFF static struct nvme_fc_port_template fctemplate = { - .module = THIS_MODULE, .localport_delete = fcloop_localport_delete, .remoteport_delete = fcloop_remoteport_delete, .create_queue = fcloop_create_queue, diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c index f6c8963c915d..db4a04a207ec 100644 --- a/drivers/scsi/lpfc/lpfc_nvme.c +++ b/drivers/scsi/lpfc/lpfc_nvme.c @@ -1985,8 +1985,6 @@ out_unlock: /* Declare and initialization an instance of the FC NVME template. */ static struct nvme_fc_port_template lpfc_nvme_template = { - .module = THIS_MODULE, - /* initiator-based functions */ .localport_delete = lpfc_nvme_localport_delete, .remoteport_delete = lpfc_nvme_remoteport_delete, diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index bfcd02fdf2b8..941aa53363f5 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -610,7 +610,6 @@ static void qla_nvme_remoteport_delete(struct nvme_fc_remote_port *rport) } static struct nvme_fc_port_template qla_nvme_fc_transport = { - .module = THIS_MODULE, .localport_delete = qla_nvme_localport_delete, .remoteport_delete = qla_nvme_remoteport_delete, .create_queue = qla_nvme_alloc_queue, diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 6d0d70f3219c..10f81629b9ce 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -270,8 +270,6 @@ struct nvme_fc_remote_port { * * Host/Initiator Transport Entrypoints/Parameters: * - * @module: The LLDD module using the interface - * * @localport_delete: The LLDD initiates deletion of a localport via * nvme_fc_deregister_localport(). However, the teardown is * asynchronous. This routine is called upon the completion of the @@ -385,8 +383,6 @@ struct nvme_fc_remote_port { * Value is Mandatory. Allowed to be zero. */ struct nvme_fc_port_template { - struct module *module; - /* initiator-based functions */ void (*localport_delete)(struct nvme_fc_local_port *); void (*remoteport_delete)(struct nvme_fc_remote_port *); -- cgit v1.2.3 From 536167d47ade11a5540b4b1a7125308b1641da7a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Apr 2020 03:13:48 +0900 Subject: blk-mq: don't commit_rqs() if none were queued Unburden the drivers from checking if a call to commit_rqs() is valid by not calling it when there are no requests to commit. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index f6291ceedee4..8e56884fd2e9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1289,7 +1289,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * the driver there was more coming, but that turned out to * be a lie. 
*/ - if (q->mq_ops->commit_rqs) + if (q->mq_ops->commit_rqs && queued) q->mq_ops->commit_rqs(hctx); spin_lock(&hctx->lock); @@ -1911,6 +1911,8 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { + int queued = 0; + while (!list_empty(list)) { blk_status_t ret; struct request *rq = list_first_entry(list, struct request, @@ -1926,7 +1928,8 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, break; } blk_mq_end_request(rq, ret); - } + } else + queued++; } /* @@ -1934,7 +1937,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, * the driver there was more coming, but that turned out to * be a lie. */ - if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs) + if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs && queued) hctx->queue->mq_ops->commit_rqs(hctx); } -- cgit v1.2.3 From 21f9024355e58772ec5d7fc3534aa5e29d72a8b6 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 7 Apr 2020 11:02:28 +0000 Subject: nvmet-rdma: fix double free of rdma queue In case rdma accept fails at nvmet_rdma_queue_connect(), release work is scheduled. Later on, a new RDMA CM event may arrive since we didn't destroy the cm-id and call nvmet_rdma_queue_connect_fail(), which schedule another release work. This will cause calling nvmet_rdma_free_queue twice. To fix this we implicitly destroy the cm_id with non-zero ret code, which guarantees that new rdma_cm events will not arrive afterwards. Also add a qp pointer to nvmet_rdma_queue structure, so we can use it when the cm_id pointer is NULL or was destroyed. Signed-off-by: Israel Rukshin Suggested-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index f78201421978..ab867f32fb0d 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -78,6 +78,7 @@ enum nvmet_rdma_queue_state { struct nvmet_rdma_queue { struct rdma_cm_id *cm_id; + struct ib_qp *qp; struct nvmet_port *port; struct ib_cq *cq; atomic_t sq_wr_avail; @@ -474,7 +475,7 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, if (ndev->srq) ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL); else - ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL); + ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL); if (unlikely(ret)) pr_err("post_recv cmd failed\n"); @@ -513,7 +514,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); if (rsp->n_rdma) { - rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + rdma_rw_ctx_destroy(&rsp->rw, queue->qp, queue->cm_id->port_num, rsp->req.sg, rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); } @@ -597,7 +598,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) WARN_ON(rsp->n_rdma <= 0); atomic_add(rsp->n_rdma, &queue->sq_wr_avail); - rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + rdma_rw_ctx_destroy(&rsp->rw, queue->qp, queue->cm_id->port_num, rsp->req.sg, rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); rsp->n_rdma = 0; @@ -752,7 +753,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) } if (nvmet_rdma_need_data_in(rsp)) { - if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp, + if (rdma_rw_ctx_post(&rsp->rw, queue->qp, queue->cm_id->port_num, &rsp->read_cqe, 
NULL)) nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); } else { @@ -1038,6 +1039,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) pr_err("failed to create_qp ret= %d\n", ret); goto err_destroy_cq; } + queue->qp = queue->cm_id->qp; atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr); @@ -1066,11 +1068,10 @@ err_destroy_cq: static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) { - struct ib_qp *qp = queue->cm_id->qp; - - ib_drain_qp(qp); - rdma_destroy_id(queue->cm_id); - ib_destroy_qp(qp); + ib_drain_qp(queue->qp); + if (queue->cm_id) + rdma_destroy_id(queue->cm_id); + ib_destroy_qp(queue->qp); ib_free_cq(queue->cq); } @@ -1305,9 +1306,12 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); if (ret) { - schedule_work(&queue->release_work); - /* Destroying rdma_cm id is not needed here */ - return 0; + /* + * Don't destroy the cm_id in free path, as we implicitly + * destroy the cm_id here with non-zero ret code. + */ + queue->cm_id = NULL; + goto free_queue; } mutex_lock(&nvmet_rdma_queue_mutex); @@ -1316,6 +1320,8 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, return 0; +free_queue: + nvmet_rdma_free_queue(queue); put_device: kref_put(&ndev->ref, nvmet_rdma_free_dev); -- cgit v1.2.3 From d3ef5536274faf89e626276b833be122a16bdb81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 4 Apr 2020 08:51:20 +0200 Subject: block: fix busy device checking in blk_drop_partitions bd_super is only set by get_tree_bdev and mount_bdev, and thus not by other openers like btrfs or the XFS realtime and log devices, as well as block devices directly opened from user space. Check bd_openers instead. Fixes: 77032ca66f86 ("Return EBUSY from BLKRRPART for mounted whole-dev fs") Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index b79c4513629b..1a0a829d8416 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -496,7 +496,7 @@ int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev) if (!disk_part_scan_enabled(disk)) return 0; - if (bdev->bd_part_count || bdev->bd_super) + if (bdev->bd_part_count || bdev->bd_openers) return -EBUSY; res = invalidate_partition(disk, 0); if (res) -- cgit v1.2.3 From cb6b771b05c3026a85ed4817c1b87c5e6f41d136 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Apr 2020 14:31:47 +0200 Subject: block: fix busy device checking in blk_drop_partitions again The previous fix had an off by one in the bd_openers checking, counting the callers blkdev_get. Fixes: d3ef5536274f ("block: fix busy device checking in blk_drop_partitions") Reported-by: Qian Cai Signed-off-by: Christoph Hellwig Tested-by: Qian Cai Signed-off-by: Jens Axboe --- block/partitions/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 1a0a829d8416..bc1ded1331b1 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -496,7 +496,7 @@ int blk_drop_partitions(struct gendisk *disk, struct block_device *bdev) if (!disk_part_scan_enabled(disk)) return 0; - if (bdev->bd_part_count || bdev->bd_openers) + if (bdev->bd_part_count || bdev->bd_openers > 1) return -EBUSY; res = invalidate_partition(disk, 0); if (res) -- cgit v1.2.3