author     Linus Torvalds <torvalds@linux-foundation.org>  2019-10-18 22:29:36 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-10-18 22:29:36 -0400
commit     d418d070057c45fd6f21567278f95452bfe690d1 (patch)
tree       24a02ac68321cc119fbcba426e5914dc50c78dce
parent     dfdcff3215ae4ed7975b0991243d1dd8e1250bec (diff)
parent     b55f0097ae1da2520108bc426275c1ec5f857b78 (diff)
Merge tag 'for-linus-2019-10-18' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe:
- NVMe pull request from Keith that addresses deadlocks, double resets,
  memory leaks, and other regressions.
- Fixup elv_support_iosched() for bio based devices (Damien)
- Fixup for the ahci PCS quirk (Dan)
- Socket O_NONBLOCK handling fix for io_uring (me)
- Timeout sequence io_uring fixes (yangerkun)
- MD warning fix for parameter default_layout (Song)
- blkcg activation fixes (Tejun)
- blk-rq-qos node deletion fix (Tejun)
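
The last item above, the rq_qos_del() fix, replaces a prev-pointer walk of a singly linked list with a walk over a pointer-to-pointer, which removes the special case for the first node (the case the old code got wrong). The following is a minimal userspace sketch of that idiom, not the kernel code itself; struct node and list_remove() are made-up names standing in for struct rq_qos and rq_qos_del():

```c
#include <assert.h>
#include <stddef.h>

/* Hypothetical node type standing in for struct rq_qos. */
struct node {
        int id;
        struct node *next;
};

/*
 * Remove @victim from the singly linked list rooted at *head.
 *
 * Walking a pointer-to-pointer means the head is not a special case:
 * *cur is either the head pointer itself or some node's ->next field,
 * so "*cur = victim->next" unlinks the node no matter where it sits.
 * A prev-pointer walk has to treat the first node separately, which is
 * exactly the branch the original rq_qos_del() got wrong.
 */
static void list_remove(struct node **head, struct node *victim)
{
        struct node **cur;

        for (cur = head; *cur; cur = &(*cur)->next) {
                if (*cur == victim) {
                        *cur = victim->next;
                        break;
                }
        }
}

int main(void)
{
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        struct node *head = &a;

        list_remove(&head, &a);         /* deleting the first node also works */
        assert(head == &b);
        list_remove(&head, &c);
        assert(head == &b && b.next == NULL);
        return 0;
}
```

With the indirect pointer, unlinking the head and unlinking an interior node are the same assignment, so there is no branch left to get wrong.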
* tag 'for-linus-2019-10-18' of git://git.kernel.dk/linux-block:
nvme-pci: Set the prp2 correctly when using more than 4k page
io_uring: fix logic error in io_timeout
io_uring: fix up O_NONBLOCK handling for sockets
md/raid0: fix warning message for parameter default_layout
libata/ahci: Fix PCS quirk application
blk-rq-qos: fix first node deletion of rq_qos_del()
blkcg: Fix multiple bugs in blkcg_activate_policy()
io_uring: consider the overflow of sequence for timeout req
nvme-tcp: fix possible leakage during error flow
nvmet-loop: fix possible leakage during error flow
block: Fix elv_support_iosched()
nvme-tcp: Initialize sk->sk_ll_usec only with NET_RX_BUSY_POLL
nvme: Wait for reset state when required
nvme: Prevent resets during paused controller state
nvme: Restart request timers in resetting state
nvme: Remove ADMIN_ONLY state
nvme-pci: Free tagset if no IO queues
nvme: retain split access workaround for capability reads
nvme: fix possible deadlock when nvme_update_formats fails
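
Two of the entries above, "io_uring: fix logic error in io_timeout" and "io_uring: consider the overflow of sequence for timeout req", are about ordering 32-bit sequence counters that can wrap around; the patch itself widens the values to long long before comparing them. As a rough illustration of the underlying problem rather than the patch's exact code, here is a self-contained sketch using the common unsigned-wraparound idiom of comparing distances from a reference point:

```c
#include <assert.h>
#include <stdint.h>

/*
 * Order two 32-bit sequence numbers relative to a reference point
 * (e.g. the current submission-queue head).  Unsigned subtraction wraps
 * modulo 2^32, so "distance from base" stays meaningful even after the
 * counter itself has overflowed, as long as both values lie within
 * 2^31 of the base.
 */
static int seq_before(uint32_t base, uint32_t a, uint32_t b)
{
        return (uint32_t)(a - base) < (uint32_t)(b - base);
}

int main(void)
{
        uint32_t base = 0xfffffff0u;    /* counter is about to wrap */

        /* 0xfffffffe was issued before the post-wrap value 0x00000001. */
        assert(seq_before(base, 0xfffffffeu, 0x00000001u));
        assert(!seq_before(base, 0x00000001u, 0xfffffffeu));

        /* A naive compare sorts the newer, post-wrap value first: wrong. */
        assert(0x00000001u < 0xfffffffeu);
        return 0;
}
```

Both the widening approach in the patch and the distance idiom above work only while the values of interest stay within 2^31 of each other; a plain a < b comparison breaks as soon as the counter wraps.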
-rw-r--r--  block/blk-cgroup.c           | 69
-rw-r--r--  block/blk-rq-qos.h           | 13
-rw-r--r--  block/elevator.c             |  3
-rw-r--r--  drivers/ata/ahci.c           |  4
-rw-r--r--  drivers/md/raid0.c           |  2
-rw-r--r--  drivers/nvme/host/core.c     | 94
-rw-r--r--  drivers/nvme/host/fabrics.h  |  3
-rw-r--r--  drivers/nvme/host/nvme.h     |  5
-rw-r--r--  drivers/nvme/host/pci.c      | 83
-rw-r--r--  drivers/nvme/host/rdma.c     |  8
-rw-r--r--  drivers/nvme/host/tcp.c      | 11
-rw-r--r--  drivers/nvme/target/loop.c   |  4
-rw-r--r--  fs/io_uring.c                | 84
13 files changed, 266 insertions, 117 deletions
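
In the drivers/nvme/host/pci.c hunk below for nvme_setup_prp_simple(), the fix behind "nvme-pci: Set the prp2 correctly when using more than 4k page" masks bv_offset with (page_size - 1) before computing how much data the first PRP entry covers, because with system pages larger than the 4k NVMe page size the raw offset can exceed one device page. The following is a small standalone sketch of that arithmetic; first_prp_len() here is a hypothetical helper mirroring the calculation, not the kernel function:

```c
#include <assert.h>
#include <stdint.h>

/*
 * How many bytes fit in the first PRP entry when a buffer starts at
 * @offset within its mapping and the device page size is @dev_page
 * (a power of two, e.g. 4096).  The offset must first be reduced modulo
 * the device page size; with multi-page segments the raw offset can be
 * larger than dev_page, and "dev_page - offset" would underflow.
 */
static uint32_t first_prp_len(uint32_t offset, uint32_t dev_page)
{
        uint32_t in_page = offset & (dev_page - 1);     /* offset % dev_page */

        return dev_page - in_page;
}

int main(void)
{
        /* Offset within the first device page: masked and unmasked agree. */
        assert(first_prp_len(512, 4096) == 3584);

        /* Offset beyond one device page (e.g. 16K system pages): 5120 & 4095 = 1024. */
        assert(first_prp_len(5120, 4096) == 3072);

        /* The unmasked subtraction would have wrapped to nearly 4 GiB here. */
        assert((uint32_t)(4096 - 5120) > 4096);
        return 0;
}
```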
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b6f20be0fc78..5d21027b1faf 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1362,7 +1362,7 @@ int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { struct blkg_policy_data *pd_prealloc = NULL; - struct blkcg_gq *blkg; + struct blkcg_gq *blkg, *pinned_blkg = NULL; int ret; if (blkcg_policy_enabled(q, pol)) @@ -1370,49 +1370,82 @@ int blkcg_activate_policy(struct request_queue *q, if (queue_is_mq(q)) blk_mq_freeze_queue(q); -pd_prealloc: - if (!pd_prealloc) { - pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, &blkcg_root); - if (!pd_prealloc) { - ret = -ENOMEM; - goto out_bypass_end; - } - } - +retry: spin_lock_irq(&q->queue_lock); - /* blkg_list is pushed at the head, reverse walk to init parents first */ + /* blkg_list is pushed at the head, reverse walk to allocate parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; if (blkg->pd[pol->plid]) continue; - pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, &blkcg_root); - if (!pd) - swap(pd, pd_prealloc); + /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ + if (blkg == pinned_blkg) { + pd = pd_prealloc; + pd_prealloc = NULL; + } else { + pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, + blkg->blkcg); + } + if (!pd) { + /* + * GFP_NOWAIT failed. Free the existing one and + * prealloc for @blkg w/ GFP_KERNEL. + */ + if (pinned_blkg) + blkg_put(pinned_blkg); + blkg_get(blkg); + pinned_blkg = blkg; + spin_unlock_irq(&q->queue_lock); - goto pd_prealloc; + + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, + blkg->blkcg); + if (pd_prealloc) + goto retry; + else + goto enomem; } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; - if (pol->pd_init_fn) - pol->pd_init_fn(pd); } + /* all allocated, init in the same order */ + if (pol->pd_init_fn) + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) + pol->pd_init_fn(blkg->pd[pol->plid]); + __set_bit(pol->plid, q->blkcg_pols); ret = 0; spin_unlock_irq(&q->queue_lock); -out_bypass_end: +out: if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); + if (pinned_blkg) + blkg_put(pinned_blkg); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; + +enomem: + /* alloc failed, nothing's initialized yet, free everything */ + spin_lock_irq(&q->queue_lock); + list_for_each_entry(blkg, &q->blkg_list, q_node) { + if (blkg->pd[pol->plid]) { + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } + } + spin_unlock_irq(&q->queue_lock); + ret = -ENOMEM; + goto out; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index e8cb68f6958a..2bc43e94f4c4 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -108,16 +108,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) { - struct rq_qos *cur, *prev = NULL; - for (cur = q->rq_qos; cur; cur = cur->next) { - if (cur == rqos) { - if (prev) - prev->next = rqos->next; - else - q->rq_qos = cur; + struct rq_qos **cur; + + for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { + if (*cur == rqos) { + *cur = rqos->next; break; } - prev = cur; } blk_mq_debugfs_unregister_rqos(rqos); diff --git a/block/elevator.c b/block/elevator.c index 5437059c9261..076ba7308e65 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -616,7 +616,8 @@ out: static inline bool elv_support_iosched(struct 
request_queue *q) { - if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)) + if (!q->mq_ops || + (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))) return false; return true; } diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index dd92faf197d5..05c2b32dcc4d 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1600,7 +1600,9 @@ static void ahci_intel_pcs_quirk(struct pci_dev *pdev, struct ahci_host_priv *hp */ if (!id || id->vendor != PCI_VENDOR_ID_INTEL) return; - if (((enum board_ids) id->driver_data) < board_ahci_pcs7) + + /* Skip applying the quirk on Denverton and beyond */ + if (((enum board_ids) id->driver_data) >= board_ahci_pcs7) return; /* diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f61693e59684..1e772287b1c8 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -154,7 +154,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } else { pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", mdname(mddev)); - pr_err("md/raid0: please set raid.default_layout to 1 or 2\n"); + pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n"); err = -ENOTSUPP; goto abort; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fd7dea36c3b6..fa7ba09dca77 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -116,10 +116,26 @@ static void nvme_queue_scan(struct nvme_ctrl *ctrl) /* * Only new queue scan work when admin and IO queues are both alive */ - if (ctrl->state == NVME_CTRL_LIVE) + if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset) queue_work(nvme_wq, &ctrl->scan_work); } +/* + * Use this function to proceed with scheduling reset_work for a controller + * that had previously been set to the resetting state. This is intended for + * code paths that can't be interrupted by other reset attempts. A hot removal + * may prevent this from succeeding. 
+ */ +int nvme_try_sched_reset(struct nvme_ctrl *ctrl) +{ + if (ctrl->state != NVME_CTRL_RESETTING) + return -EBUSY; + if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) + return -EBUSY; + return 0; +} +EXPORT_SYMBOL_GPL(nvme_try_sched_reset); + int nvme_reset_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) @@ -137,8 +153,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) ret = nvme_reset_ctrl(ctrl); if (!ret) { flush_work(&ctrl->reset_work); - if (ctrl->state != NVME_CTRL_LIVE && - ctrl->state != NVME_CTRL_ADMIN_ONLY) + if (ctrl->state != NVME_CTRL_LIVE) ret = -ENETRESET; } @@ -315,15 +330,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, old_state = ctrl->state; switch (new_state) { - case NVME_CTRL_ADMIN_ONLY: - switch (old_state) { - case NVME_CTRL_CONNECTING: - changed = true; - /* FALLTHRU */ - default: - break; - } - break; case NVME_CTRL_LIVE: switch (old_state) { case NVME_CTRL_NEW: @@ -339,7 +345,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_NEW: case NVME_CTRL_LIVE: - case NVME_CTRL_ADMIN_ONLY: changed = true; /* FALLTHRU */ default: @@ -359,7 +364,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_DELETING: switch (old_state) { case NVME_CTRL_LIVE: - case NVME_CTRL_ADMIN_ONLY: case NVME_CTRL_RESETTING: case NVME_CTRL_CONNECTING: changed = true; @@ -381,8 +385,10 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, break; } - if (changed) + if (changed) { ctrl->state = new_state; + wake_up_all(&ctrl->state_wq); + } spin_unlock_irqrestore(&ctrl->lock, flags); if (changed && ctrl->state == NVME_CTRL_LIVE) @@ -391,6 +397,39 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, } EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); +/* + * Returns true for sink states that can't ever transition back to live. + */ +static bool nvme_state_terminal(struct nvme_ctrl *ctrl) +{ + switch (ctrl->state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + case NVME_CTRL_RESETTING: + case NVME_CTRL_CONNECTING: + return false; + case NVME_CTRL_DELETING: + case NVME_CTRL_DEAD: + return true; + default: + WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state); + return true; + } +} + +/* + * Waits for the controller state to be resetting, or returns false if it is + * not possible to ever transition to that state. 
+ */ +bool nvme_wait_reset(struct nvme_ctrl *ctrl) +{ + wait_event(ctrl->state_wq, + nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) || + nvme_state_terminal(ctrl)); + return ctrl->state == NVME_CTRL_RESETTING; +} +EXPORT_SYMBOL_GPL(nvme_wait_reset); + static void nvme_free_ns_head(struct kref *ref) { struct nvme_ns_head *head = @@ -1306,8 +1345,6 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl) if (ns->disk && nvme_revalidate_disk(ns->disk)) nvme_set_queue_dying(ns); up_read(&ctrl->namespaces_rwsem); - - nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); } static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) @@ -1323,6 +1360,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) nvme_unfreeze(ctrl); nvme_mpath_unfreeze(ctrl->subsys); mutex_unlock(&ctrl->subsys->lock); + nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); mutex_unlock(&ctrl->scan_lock); } if (effects & NVME_CMD_EFFECTS_CCC) @@ -2874,7 +2912,6 @@ static int nvme_dev_open(struct inode *inode, struct file *file) switch (ctrl->state) { case NVME_CTRL_LIVE: - case NVME_CTRL_ADMIN_ONLY: break; default: return -EWOULDBLOCK; @@ -3168,7 +3205,6 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, static const char *const state_name[] = { [NVME_CTRL_NEW] = "new", [NVME_CTRL_LIVE] = "live", - [NVME_CTRL_ADMIN_ONLY] = "only-admin", [NVME_CTRL_RESETTING] = "resetting", [NVME_CTRL_CONNECTING] = "connecting", [NVME_CTRL_DELETING] = "deleting", @@ -3679,11 +3715,10 @@ static void nvme_scan_work(struct work_struct *work) struct nvme_id_ctrl *id; unsigned nn; - if (ctrl->state != NVME_CTRL_LIVE) + /* No tagset on a live ctrl means IO queues could not created */ + if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset) return; - WARN_ON_ONCE(!ctrl->tagset); - if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) { dev_info(ctrl->device, "rescanning namespaces.\n"); nvme_clear_changed_ns_log(ctrl); @@ -3844,13 +3879,13 @@ static void nvme_fw_act_work(struct work_struct *work) if (time_after(jiffies, fw_act_timeout)) { dev_warn(ctrl->device, "Fw activation timeout, reset controller\n"); - nvme_reset_ctrl(ctrl); - break; + nvme_try_sched_reset(ctrl); + return; } msleep(100); } - if (ctrl->state != NVME_CTRL_LIVE) + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) return; nvme_start_queues(ctrl); @@ -3870,7 +3905,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) nvme_queue_scan(ctrl); break; case NVME_AER_NOTICE_FW_ACT_STARTING: - queue_work(nvme_wq, &ctrl->fw_act_work); + /* + * We are (ab)using the RESETTING state to prevent subsequent + * recovery actions from interfering with the controller's + * firmware activation. 
+ */ + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) + queue_work(nvme_wq, &ctrl->fw_act_work); break; #ifdef CONFIG_NVME_MULTIPATH case NVME_AER_NOTICE_ANA: @@ -3993,6 +4034,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); + init_waitqueue_head(&ctrl->state_wq); INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 93f08d77c896..a0ec40ab62ee 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -182,8 +182,7 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, bool queue_live) { - if (likely(ctrl->state == NVME_CTRL_LIVE || - ctrl->state == NVME_CTRL_ADMIN_ONLY)) + if (likely(ctrl->state == NVME_CTRL_LIVE)) return true; return __nvmf_check_ready(ctrl, rq, queue_live); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 38a83ef5bcd3..22e8401352c2 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -15,6 +15,7 @@ #include <linux/sed-opal.h> #include <linux/fault-inject.h> #include <linux/rcupdate.h> +#include <linux/wait.h> #include <trace/events/block.h> @@ -161,7 +162,6 @@ static inline u16 nvme_req_qid(struct request *req) enum nvme_ctrl_state { NVME_CTRL_NEW, NVME_CTRL_LIVE, - NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */ NVME_CTRL_RESETTING, NVME_CTRL_CONNECTING, NVME_CTRL_DELETING, @@ -199,6 +199,7 @@ struct nvme_ctrl { struct cdev cdev; struct work_struct reset_work; struct work_struct delete_work; + wait_queue_head_t state_wq; struct nvme_subsystem *subsys; struct list_head subsys_entry; @@ -449,6 +450,7 @@ void nvme_complete_rq(struct request *req); bool nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); +bool nvme_wait_reset(struct nvme_ctrl *ctrl); int nvme_disable_ctrl(struct nvme_ctrl *ctrl); int nvme_enable_ctrl(struct nvme_ctrl *ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); @@ -499,6 +501,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); +int nvme_try_sched_reset(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bb88681f4dc3..869f462e6b6e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -773,7 +773,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, struct bio_vec *bv) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - unsigned int first_prp_len = dev->ctrl.page_size - bv->bv_offset; + unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1); + unsigned int first_prp_len = dev->ctrl.page_size - offset; iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); if (dma_mapping_error(dev->dev, iod->first_dma)) @@ -2263,10 +2264,7 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode) return true; } -/* - * return error value only when tagset allocation failed - */ -static int nvme_dev_add(struct nvme_dev *dev) +static void 
nvme_dev_add(struct nvme_dev *dev) { int ret; @@ -2296,7 +2294,7 @@ static int nvme_dev_add(struct nvme_dev *dev) if (ret) { dev_warn(dev->ctrl.device, "IO queues tagset allocation failed %d\n", ret); - return ret; + return; } dev->ctrl.tagset = &dev->tagset; } else { @@ -2307,7 +2305,6 @@ static int nvme_dev_add(struct nvme_dev *dev) } nvme_dbbuf_set(dev); - return 0; } static int nvme_pci_enable(struct nvme_dev *dev) @@ -2467,6 +2464,14 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) mutex_unlock(&dev->shutdown_lock); } +static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) +{ + if (!nvme_wait_reset(&dev->ctrl)) + return -EBUSY; + nvme_dev_disable(dev, shutdown); + return 0; +} + static int nvme_setup_prp_pools(struct nvme_dev *dev) { dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, @@ -2490,14 +2495,20 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) dma_pool_destroy(dev->prp_small_pool); } +static void nvme_free_tagset(struct nvme_dev *dev) +{ + if (dev->tagset.tags) + blk_mq_free_tag_set(&dev->tagset); + dev->ctrl.tagset = NULL; +} + static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); nvme_dbbuf_dma_free(dev); put_device(dev->dev); - if (dev->tagset.tags) - blk_mq_free_tag_set(&dev->tagset); + nvme_free_tagset(dev); if (dev->ctrl.admin_q) blk_put_queue(dev->ctrl.admin_q); kfree(dev->queues); @@ -2508,6 +2519,11 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) static void nvme_remove_dead_ctrl(struct nvme_dev *dev) { + /* + * Set state to deleting now to avoid blocking nvme_wait_reset(), which + * may be holding this pci_dev's device lock. + */ + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); nvme_get_ctrl(&dev->ctrl); nvme_dev_disable(dev, false); nvme_kill_queues(&dev->ctrl); @@ -2521,7 +2537,6 @@ static void nvme_reset_work(struct work_struct *work) container_of(work, struct nvme_dev, ctrl.reset_work); bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); int result; - enum nvme_ctrl_state new_state = NVME_CTRL_LIVE; if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) { result = -ENODEV; @@ -2615,13 +2630,11 @@ static void nvme_reset_work(struct work_struct *work) dev_warn(dev->ctrl.device, "IO queues not created\n"); nvme_kill_queues(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); - new_state = NVME_CTRL_ADMIN_ONLY; + nvme_free_tagset(dev); } else { nvme_start_queues(&dev->ctrl); nvme_wait_freeze(&dev->ctrl); - /* hit this only when allocate tagset fails */ - if (nvme_dev_add(dev)) - new_state = NVME_CTRL_ADMIN_ONLY; + nvme_dev_add(dev); nvme_unfreeze(&dev->ctrl); } @@ -2629,9 +2642,9 @@ static void nvme_reset_work(struct work_struct *work) * If only admin queue live, keep it to do further investigation or * recovery. 
*/ - if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) { + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { dev_warn(dev->ctrl.device, - "failed to mark controller state %d\n", new_state); + "failed to mark controller live state\n"); result = -ENODEV; goto out; } @@ -2672,7 +2685,7 @@ static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) { - *val = readq(to_nvme_dev(ctrl)->bar + off); + *val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off); return 0; } @@ -2836,19 +2849,28 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void nvme_reset_prepare(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_dev_disable(dev, false); + + /* + * We don't need to check the return value from waiting for the reset + * state as pci_dev device lock is held, making it impossible to race + * with ->remove(). + */ + nvme_disable_prepare_reset(dev, false); + nvme_sync_queues(&dev->ctrl); } static void nvme_reset_done(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_reset_ctrl_sync(&dev->ctrl); + + if (!nvme_try_sched_reset(&dev->ctrl)) + flush_work(&dev->ctrl.reset_work); } static void nvme_shutdown(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_dev_disable(dev, true); + nvme_disable_prepare_reset(dev, true); } /* @@ -2901,7 +2923,7 @@ static int nvme_resume(struct device *dev) if (ndev->last_ps == U32_MAX || nvme_set_power_state(ctrl, ndev->last_ps) != 0) - nvme_reset_ctrl(ctrl); + return nvme_try_sched_reset(&ndev->ctrl); return 0; } @@ -2929,17 +2951,14 @@ static int nvme_suspend(struct device *dev) */ if (pm_suspend_via_firmware() || !ctrl->npss || !pcie_aspm_enabled(pdev) || - (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) { - nvme_dev_disable(ndev, true); - return 0; - } + (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) + return nvme_disable_prepare_reset(ndev, true); nvme_start_freeze(ctrl); nvme_wait_freeze(ctrl); nvme_sync_queues(ctrl); - if (ctrl->state != NVME_CTRL_LIVE && - ctrl->state != NVME_CTRL_ADMIN_ONLY) + if (ctrl->state != NVME_CTRL_LIVE) goto unfreeze; ret = nvme_get_power_state(ctrl, &ndev->last_ps); @@ -2965,9 +2984,8 @@ static int nvme_suspend(struct device *dev) * Clearing npss forces a controller reset on resume. The * correct value will be resdicovered then. 
*/ - nvme_dev_disable(ndev, true); + ret = nvme_disable_prepare_reset(ndev, true); ctrl->npss = 0; - ret = 0; } unfreeze: nvme_unfreeze(ctrl); @@ -2977,9 +2995,7 @@ unfreeze: static int nvme_simple_suspend(struct device *dev) { struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); - - nvme_dev_disable(ndev, true); - return 0; + return nvme_disable_prepare_reset(ndev, true); } static int nvme_simple_resume(struct device *dev) @@ -2987,8 +3003,7 @@ static int nvme_simple_resume(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - nvme_reset_ctrl(&ndev->ctrl); - return 0; + return nvme_try_sched_reset(&ndev->ctrl); } static const struct dev_pm_ops nvme_dev_pm_ops = { diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 4d280160dd3f..f19a28b4e997 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1701,6 +1701,14 @@ nvme_rdma_timeout(struct request *rq, bool reserved) dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n", rq->tag, nvme_rdma_queue_idx(queue)); + /* + * Restart the timer if a controller reset is already scheduled. Any + * timed out commands would be handled before entering the connecting + * state. + */ + if (ctrl->ctrl.state == NVME_CTRL_RESETTING) + return BLK_EH_RESET_TIMER; + if (ctrl->ctrl.state != NVME_CTRL_LIVE) { /* * Teardown immediately if controller times out while starting diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 385a5212c10f..770dbcbc999e 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1386,7 +1386,9 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; queue->sock->sk->sk_state_change = nvme_tcp_state_change; queue->sock->sk->sk_write_space = nvme_tcp_write_space; +#ifdef CONFIG_NET_RX_BUSY_POLL queue->sock->sk->sk_ll_usec = 1; +#endif write_unlock_bh(&queue->sock->sk->sk_callback_lock); return 0; @@ -2044,6 +2046,14 @@ nvme_tcp_timeout(struct request *rq, bool reserved) struct nvme_tcp_ctrl *ctrl = req->queue->ctrl; struct nvme_tcp_cmd_pdu *pdu = req->pdu; + /* + * Restart the timer if a controller reset is already scheduled. Any + * timed out commands would be handled before entering the connecting + * state. 
+ */ + if (ctrl->ctrl.state == NVME_CTRL_RESETTING) + return BLK_EH_RESET_TIMER; + dev_warn(ctrl->ctrl.device, "queue %d: timeout request %#x type %d\n", nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type); @@ -2126,6 +2136,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, ret = nvme_tcp_map_data(queue, rq); if (unlikely(ret)) { + nvme_cleanup_cmd(rq); dev_err(queue->ctrl->ctrl.device, "Failed to map data (%d)\n", ret); return ret; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 748a39fca771..11f5aea97d1b 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -157,8 +157,10 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, iod->sg_table.sgl = iod->first_sgl; if (sg_alloc_table_chained(&iod->sg_table, blk_rq_nr_phys_segments(req), - iod->sg_table.sgl, SG_CHUNK_SIZE)) + iod->sg_table.sgl, SG_CHUNK_SIZE)) { + nvme_cleanup_cmd(req); return BLK_STS_RESOURCE; + } iod->req.sg = iod->sg_table.sgl; iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); diff --git a/fs/io_uring.c b/fs/io_uring.c index 76fdbe84aff5..67dbe0201e0d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -322,6 +322,8 @@ struct io_kiocb { #define REQ_F_FAIL_LINK 256 /* fail rest of links */ #define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ #define REQ_F_TIMEOUT 1024 /* timeout request */ +#define REQ_F_ISREG 2048 /* regular file */ +#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ u64 user_data; u32 result; u32 sequence; @@ -914,26 +916,26 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, return ret; } -static void kiocb_end_write(struct kiocb *kiocb) +static void kiocb_end_write(struct io_kiocb *req) { - if (kiocb->ki_flags & IOCB_WRITE) { - struct inode *inode = file_inode(kiocb->ki_filp); + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (req->flags & REQ_F_ISREG) { + struct inode *inode = file_inode(req->file); - /* - * Tell lockdep we inherited freeze protection from submission - * thread. 
- */ - if (S_ISREG(inode->i_mode)) - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - file_end_write(kiocb->ki_filp); + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); } + file_end_write(req->file); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); - kiocb_end_write(kiocb); + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; @@ -945,7 +947,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); - kiocb_end_write(kiocb); + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; @@ -1059,8 +1062,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, if (!req->file) return -EBADF; - if (force_nonblock && !io_file_supports_async(req->file)) - force_nonblock = false; + if (S_ISREG(file_inode(req->file)->i_mode)) + req->flags |= REQ_F_ISREG; + + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(req->file)) { + req->flags |= REQ_F_MUST_PUNT; + return -EAGAIN; + } kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); @@ -1081,7 +1093,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, return ret; /* don't allow async punt if RWF_NOWAIT was requested */ - if (kiocb->ki_flags & IOCB_NOWAIT) + if ((kiocb->ki_flags & IOCB_NOWAIT) || + (req->file->f_flags & O_NONBLOCK)) req->flags |= REQ_F_NOWAIT; if (force_nonblock) @@ -1382,7 +1395,9 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, * need async punt anyway, so it's more efficient to do it * here. */ - if (force_nonblock && ret2 > 0 && ret2 < read_size) + if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && + (req->flags & REQ_F_ISREG) && + ret2 > 0 && ret2 < read_size) ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { @@ -1447,7 +1462,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, * released so that it doesn't complain about the held lock when * we return to userspace. */ - if (S_ISREG(file_inode(file)->i_mode)) { + if (req->flags & REQ_F_ISREG) { __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); __sb_writers_release(file_inode(file)->i_sb, @@ -1884,7 +1899,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - unsigned count, req_dist, tail_index; + unsigned count; struct io_ring_ctx *ctx = req->ctx; struct list_head *entry; struct timespec64 ts; @@ -1907,21 +1922,36 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) count = 1; req->sequence = ctx->cached_sq_head + count - 1; + /* reuse it to store the count */ + req->submit.sequence = count; req->flags |= REQ_F_TIMEOUT; /* * Insertion sort, ensuring the first entry in the list is always * the one we need first. 
*/ - tail_index = ctx->cached_cq_tail - ctx->rings->sq_dropped; - req_dist = req->sequence - tail_index; spin_lock_irq(&ctx->completion_lock); list_for_each_prev(entry, &ctx->timeout_list) { struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); - unsigned dist; + unsigned nxt_sq_head; + long long tmp, tmp_nxt; - dist = nxt->sequence - tail_index; - if (req_dist >= dist) + /* + * Since cached_sq_head + count - 1 can overflow, use type long + * long to store it. + */ + tmp = (long long)ctx->cached_sq_head + count - 1; + nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1; + tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1; + + /* + * cached_sq_head may overflow, and it will never overflow twice + * once there is some timeout req still be valid. + */ + if (ctx->cached_sq_head < nxt_sq_head) + tmp += UINT_MAX; + + if (tmp >= tmp_nxt) break; } list_add(&req->list, entry); @@ -2267,7 +2297,13 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, int ret; ret = __io_submit_sqe(ctx, req, s, force_nonblock); - if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + + /* + * We async punt it if the file wasn't marked NOWAIT, or if the file + * doesn't support non-blocking read/write attempts + */ + if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || + (req->flags & REQ_F_MUST_PUNT))) { struct io_uring_sqe *sqe_copy; sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); |
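
The io_uring hunks above introduce REQ_F_ISREG and REQ_F_MUST_PUNT and look at O_NONBLOCK on the underlying file. The reason the file type matters is that regular files ignore O_NONBLOCK (a read simply performs the I/O), while pipes and sockets honour it and return EAGAIN, so the decision to punt a request to a worker thread cannot be based on the flag alone. A small userspace demonstration of that difference, assuming /etc/hostname exists (the path is only illustrative):

```c
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        char buf[16];
        int pipefd[2];
        ssize_t n;

        /* A pipe honours O_NONBLOCK: reading an empty pipe fails with EAGAIN. */
        if (pipe(pipefd) != 0)
                return 1;
        fcntl(pipefd[0], F_SETFL, O_NONBLOCK);
        n = read(pipefd[0], buf, sizeof(buf));
        assert(n == -1 && errno == EAGAIN);

        /*
         * A regular file ignores O_NONBLOCK: read() never returns EAGAIN,
         * it just performs the (possibly slow) I/O.  That is why the
         * io_uring fixes key the "must punt to a worker" decision on the
         * file type (REQ_F_ISREG / io_file_supports_async) rather than on
         * the O_NONBLOCK flag alone.
         */
        int fd = open("/etc/hostname", O_RDONLY | O_NONBLOCK);
        if (fd >= 0) {
                n = read(fd, buf, sizeof(buf));
                assert(n >= 0);         /* no EAGAIN from a regular file */
                close(fd);
        }

        close(pipefd[0]);
        close(pipefd[1]);
        return 0;
}
```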