diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-05-17 16:03:32 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-05-17 16:03:32 -0700 |
commit | 24b9f0cf00c8e8df29a4ddfec8c139ad62753113 (patch) | |
tree | 95eb986ead9bd6734c1901b4971a940619141fe1 /drivers/nvme/host/pci.c | |
parent | a4d1dbed0e27030b3c3ca2d1d5c33a1b45bc53d2 (diff) | |
parent | 116f7d4a21fe450efc652c4850eb27cda36c9db0 (diff) |
Merge branch 'for-4.7/drivers' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
"On top of the core pull request, this is the drivers pull request for
this merge window. This contains:
- Switch drivers to the new write back cache API, and kill off the
  flush flags. From me.
- Kill the discard support for the STEC pci-e flash driver. It's
  trivially broken, and apparently unmaintained, so it's safer to
  just remove it. From Jeff Moyer.
- A set of lightnvm updates from the usual suspects (Matias/Javier,
  and Simon), and fixes from Arnd, Jeff Mahoney, Sagi, and Wenwei
  Tao.
- A set of updates for NVMe:
  - Turn the controller state management into a proper state
    machine. From Christoph.
  - Shuffling of code in preparation for NVMe-over-fabrics, also
    from Christoph.
  - Cleanup of the command prep part from Ming Lin.
  - Rewrite of the discard support from Ming Lin.
  - Deadlock fix for namespace removal from Ming Lin.
  - Use the now exported blk-mq tag helper for IO termination.
    From Sagi.
  - Various little fixes from Christoph, Guilherme, Keith, Ming
    Lin, Wang Sheng-Hui.
- Convert mtip32xx to use the now exported blk-mq tag iter function,
  from Keith"
* 'for-4.7/drivers' of git://git.kernel.dk/linux-block: (74 commits)
lightnvm: reserved space calculation incorrect
lightnvm: rename nr_pages to nr_ppas on nvm_rq
lightnvm: add is_cached entry to struct ppa_addr
lightnvm: expose gennvm_mark_blk to targets
lightnvm: remove mgt targets on mgt removal
lightnvm: pass dma address to hardware rather than pointer
lightnvm: do not assume sequential lun alloc.
nvme/lightnvm: Log using the ctrl named device
lightnvm: rename dma helper functions
lightnvm: enable metadata to be sent to device
lightnvm: do not free unused metadata on rrpc
lightnvm: fix out of bound ppa lun id on bb tbl
lightnvm: refactor set_bb_tbl for accepting ppa list
lightnvm: move responsibility for bad blk mgmt to target
lightnvm: make nvm_set_rqd_ppalist() aware of vblks
lightnvm: remove struct factory_blks
lightnvm: refactor device ops->get_bb_tbl()
lightnvm: introduce nvm_for_each_lun_ppa() macro
lightnvm: refactor dev->online_target to global nvm_targets
lightnvm: rename nvm_targets to nvm_tgt_type
...
Diffstat (limited to 'drivers/nvme/host/pci.c')
-rw-r--r-- | drivers/nvme/host/pci.c | 249 |
1 files changed, 87 insertions, 162 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4fd733ff72b1..0f093f14d348 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -54,8 +54,7 @@ * We handle AEN commands ourselves and don't even let the * block layer know about them. */ -#define NVME_NR_AEN_COMMANDS 1 -#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) +#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS) static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -92,9 +91,7 @@ struct nvme_dev { struct msix_entry *entry; void __iomem *bar; struct work_struct reset_work; - struct work_struct scan_work; struct work_struct remove_work; - struct work_struct async_work; struct timer_list watchdog_timer; struct mutex shutdown_lock; bool subsystem; @@ -102,11 +99,6 @@ struct nvme_dev { dma_addr_t cmb_dma_addr; u64 cmb_size; u32 cmbsz; - unsigned long flags; - -#define NVME_CTRL_RESETTING 0 -#define NVME_CTRL_REMOVING 1 - struct nvme_ctrl ctrl; struct completion ioq_wait; }; @@ -271,40 +263,6 @@ static int nvme_init_request(void *data, struct request *req, return 0; } -static void nvme_queue_scan(struct nvme_dev *dev) -{ - /* - * Do not queue new scan work when a controller is reset during - * removal. 
- */ - if (test_bit(NVME_CTRL_REMOVING, &dev->flags)) - return; - queue_work(nvme_workq, &dev->scan_work); -} - -static void nvme_complete_async_event(struct nvme_dev *dev, - struct nvme_completion *cqe) -{ - u16 status = le16_to_cpu(cqe->status) >> 1; - u32 result = le32_to_cpu(cqe->result); - - if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) { - ++dev->ctrl.event_limit; - queue_work(nvme_workq, &dev->async_work); - } - - if (status != NVME_SC_SUCCESS) - return; - - switch (result & 0xff07) { - case NVME_AER_NOTICE_NS_CHANGED: - dev_info(dev->ctrl.device, "rescanning\n"); - nvme_queue_scan(dev); - default: - dev_warn(dev->ctrl.device, "async event result %08x\n", result); - } -} - /** * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use @@ -334,16 +292,11 @@ static __le64 **iod_list(struct request *req) return (__le64 **)(iod->sg + req->nr_phys_segments); } -static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) +static int nvme_init_iod(struct request *rq, unsigned size, + struct nvme_dev *dev) { struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); int nseg = rq->nr_phys_segments; - unsigned size; - - if (rq->cmd_flags & REQ_DISCARD) - size = sizeof(struct nvme_dsm_range); - else - size = blk_rq_bytes(rq); if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); @@ -368,6 +321,8 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req) __le64 **list = iod_list(req); dma_addr_t prp_dma = iod->first_dma; + nvme_cleanup_cmd(req); + if (iod->npages == 0) dma_pool_free(dev->prp_small_pool, list[0], prp_dma); for (i = 0; i < iod->npages; i++) { @@ -529,7 +484,7 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req, } static int nvme_map_data(struct nvme_dev *dev, struct request *req, - struct nvme_command *cmnd) + unsigned size, struct nvme_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 
struct request_queue *q = req->q; @@ -546,7 +501,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir)) goto out; - if (!nvme_setup_prps(dev, req, blk_rq_bytes(req))) + if (!nvme_setup_prps(dev, req, size)) goto out_unmap; ret = BLK_MQ_RQ_QUEUE_ERROR; @@ -596,37 +551,6 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) } /* - * We reuse the small pool to allocate the 16-byte range here as it is not - * worth having a special pool for these or additional cases to handle freeing - * the iod. - */ -static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct request *req, struct nvme_command *cmnd) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_dsm_range *range; - - range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, - &iod->first_dma); - if (!range) - return BLK_MQ_RQ_QUEUE_BUSY; - iod_list(req)[0] = (__le64 *)range; - iod->npages = 0; - - range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); - cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); - cmnd->dsm.nr = 0; - cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - return BLK_MQ_RQ_QUEUE_OK; -} - -/* * NOTE: ns is NULL when called on the admin queue. 
*/ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, @@ -637,6 +561,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_command cmnd; + unsigned map_len; int ret = BLK_MQ_RQ_QUEUE_OK; /* @@ -652,23 +577,17 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, } } - ret = nvme_init_iod(req, dev); + map_len = nvme_map_len(req); + ret = nvme_init_iod(req, map_len, dev); if (ret) return ret; - if (req->cmd_flags & REQ_DISCARD) { - ret = nvme_setup_discard(nvmeq, ns, req, &cmnd); - } else { - if (req->cmd_type == REQ_TYPE_DRV_PRIV) - memcpy(&cmnd, req->cmd, sizeof(cmnd)); - else if (req->cmd_flags & REQ_FLUSH) - nvme_setup_flush(ns, &cmnd); - else - nvme_setup_rw(ns, req, &cmnd); + ret = nvme_setup_cmd(ns, req, &cmnd); + if (ret) + goto out; - if (req->nr_phys_segments) - ret = nvme_map_data(dev, req, &cmnd); - } + if (req->nr_phys_segments) + ret = nvme_map_data(dev, req, map_len, &cmnd); if (ret) goto out; @@ -764,7 +683,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) */ if (unlikely(nvmeq->qid == 0 && cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) { - nvme_complete_async_event(nvmeq->dev, &cqe); + nvme_complete_async_event(&nvmeq->dev->ctrl, &cqe); continue; } @@ -833,21 +752,18 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return 0; } -static void nvme_async_event_work(struct work_struct *work) +static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) { - struct nvme_dev *dev = container_of(work, struct nvme_dev, async_work); + struct nvme_dev *dev = to_nvme_dev(ctrl); struct nvme_queue *nvmeq = dev->queues[0]; struct nvme_command c; memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; + c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx; spin_lock_irq(&nvmeq->q_lock); - while (dev->ctrl.event_limit > 0) { - c.common.command_id = NVME_AQ_BLKMQ_DEPTH + - --dev->ctrl.event_limit; - 
__nvme_submit_cmd(nvmeq, &c); - } + __nvme_submit_cmd(nvmeq, &c); spin_unlock_irq(&nvmeq->q_lock); } @@ -939,7 +855,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * cancellation error. All outstanding requests are completed on * shutdown, so we return BLK_EH_HANDLED. */ - if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) { + if (dev->ctrl.state == NVME_CTRL_RESETTING) { dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); @@ -1003,16 +919,15 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_RESET_TIMER; } -static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) +static void nvme_cancel_io(struct request *req, void *data, bool reserved) { - struct nvme_queue *nvmeq = data; int status; if (!blk_mq_request_started(req)) return; - dev_dbg_ratelimited(nvmeq->dev->ctrl.device, - "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid); + dev_dbg_ratelimited(((struct nvme_dev *) data)->ctrl.device, + "Cancelling I/O %d", req->tag); status = NVME_SC_ABORT_REQ; if (blk_queue_dying(req->q)) @@ -1069,14 +984,6 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) return 0; } -static void nvme_clear_queue(struct nvme_queue *nvmeq) -{ - spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->tags && *nvmeq->tags) - blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq); - spin_unlock_irq(&nvmeq->q_lock); -} - static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { struct nvme_queue *nvmeq = dev->queues[0]; @@ -1350,22 +1257,44 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } +static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) +{ + + /* If true, indicates loss of adapter communication, possibly by a + * NVMe Subsystem reset. + */ + bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO); + + /* If there is a reset ongoing, we shouldn't reset again. 
*/ + if (work_busy(&dev->reset_work)) + return false; + + /* We shouldn't reset unless the controller is on fatal error state + * _or_ if we lost the communication with it. + */ + if (!(csts & NVME_CSTS_CFS) && !nssro) + return false; + + /* If PCI error recovery process is happening, we cannot reset or + * the recovery mechanism will surely fail. + */ + if (pci_channel_offline(to_pci_dev(dev->dev))) + return false; + + return true; +} + static void nvme_watchdog_timer(unsigned long data) { struct nvme_dev *dev = (struct nvme_dev *)data; u32 csts = readl(dev->bar + NVME_REG_CSTS); - /* - * Skip controllers currently under reset. - */ - if (!work_pending(&dev->reset_work) && !work_busy(&dev->reset_work) && - ((csts & NVME_CSTS_CFS) || - (dev->subsystem && (csts & NVME_CSTS_NSSRO)))) { - if (queue_work(nvme_workq, &dev->reset_work)) { + /* Skip controllers under certain specific conditions. */ + if (nvme_should_reset(dev, csts)) { + if (queue_work(nvme_workq, &dev->reset_work)) dev_warn(dev->dev, "Failed status: 0x%x, reset controller.\n", csts); - } return; } @@ -1551,8 +1480,9 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return result; } -static void nvme_set_irq_hints(struct nvme_dev *dev) +static void nvme_pci_post_scan(struct nvme_ctrl *ctrl) { + struct nvme_dev *dev = to_nvme_dev(ctrl); struct nvme_queue *nvmeq; int i; @@ -1567,16 +1497,6 @@ static void nvme_set_irq_hints(struct nvme_dev *dev) } } -static void nvme_dev_scan(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); - - if (!dev->tagset.tags) - return; - nvme_scan_namespaces(&dev->ctrl); - nvme_set_irq_hints(dev); -} - static void nvme_del_queue_end(struct request *req, int error) { struct nvme_queue *nvmeq = req->end_io_data; @@ -1592,7 +1512,13 @@ static void nvme_del_cq_end(struct request *req, int error) if (!error) { unsigned long flags; - spin_lock_irqsave(&nvmeq->q_lock, flags); + /* + * We might be called with the AQ q_lock held + * 
and the I/O queue q_lock should always + * nest inside the AQ one. + */ + spin_lock_irqsave_nested(&nvmeq->q_lock, flags, + SINGLE_DEPTH_NESTING); nvme_process_cq(nvmeq); spin_unlock_irqrestore(&nvmeq->q_lock, flags); } @@ -1684,7 +1610,6 @@ static int nvme_dev_add(struct nvme_dev *dev) nvme_free_queues(dev, dev->online_queues); } - nvme_queue_scan(dev); return 0; } @@ -1797,8 +1722,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) } nvme_pci_disable(dev); - for (i = dev->queue_count - 1; i >= 0; i--) - nvme_clear_queue(dev->queues[i]); + blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_io, dev); + blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_io, dev); mutex_unlock(&dev->shutdown_lock); } @@ -1854,7 +1779,7 @@ static void nvme_reset_work(struct work_struct *work) struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); int result = -ENODEV; - if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags))) + if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING)) goto out; /* @@ -1864,11 +1789,9 @@ static void nvme_reset_work(struct work_struct *work) if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); - if (test_bit(NVME_CTRL_REMOVING, &dev->flags)) + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) goto out; - set_bit(NVME_CTRL_RESETTING, &dev->flags); - result = nvme_pci_enable(dev); if (result) goto out; @@ -1890,8 +1813,14 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; - queue_work(nvme_workq, &dev->async_work); + /* + * A controller that can not execute IO typically requires user + * intervention to correct. For such degraded controllers, the driver + * should not submit commands the user did not request, so skip + * registering for asynchronous event notification on this condition. 
+ */ + if (dev->online_queues > 1) + nvme_queue_async_events(&dev->ctrl); mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + HZ)); @@ -1901,13 +1830,20 @@ static void nvme_reset_work(struct work_struct *work) */ if (dev->online_queues < 2) { dev_warn(dev->ctrl.device, "IO queues not created\n"); + nvme_kill_queues(&dev->ctrl); nvme_remove_namespaces(&dev->ctrl); } else { nvme_start_queues(&dev->ctrl); nvme_dev_add(dev); } - clear_bit(NVME_CTRL_RESETTING, &dev->flags); + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { + dev_warn(dev->ctrl.device, "failed to mark controller live\n"); + goto out; + } + + if (dev->online_queues > 1) + nvme_queue_scan(&dev->ctrl); return; out: @@ -1955,13 +1891,6 @@ static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) return 0; } -static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl) -{ - struct nvme_dev *dev = to_nvme_dev(ctrl); - - return !dev->bar || dev->online_queues < 2; -} - static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) { return nvme_reset(to_nvme_dev(ctrl)); @@ -1972,9 +1901,10 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .reg_read32 = nvme_pci_reg_read32, .reg_write32 = nvme_pci_reg_write32, .reg_read64 = nvme_pci_reg_read64, - .io_incapable = nvme_pci_io_incapable, .reset_ctrl = nvme_pci_reset_ctrl, .free_ctrl = nvme_pci_free_ctrl, + .post_scan = nvme_pci_post_scan, + .submit_async_event = nvme_pci_submit_async_event, }; static int nvme_dev_map(struct nvme_dev *dev) @@ -2026,10 +1956,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto free; - INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->reset_work, nvme_reset_work); INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); - INIT_WORK(&dev->async_work, nvme_async_event_work); setup_timer(&dev->watchdog_timer, nvme_watchdog_timer, (unsigned long)dev); mutex_init(&dev->shutdown_lock); @@ -2086,15 +2014,12 @@ static void nvme_remove(struct pci_dev *pdev) { 
struct nvme_dev *dev = pci_get_drvdata(pdev); - set_bit(NVME_CTRL_REMOVING, &dev->flags); + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); + pci_set_drvdata(pdev, NULL); - flush_work(&dev->async_work); flush_work(&dev->reset_work); - flush_work(&dev->scan_work); - nvme_remove_namespaces(&dev->ctrl); nvme_uninit_ctrl(&dev->ctrl); nvme_dev_disable(dev, true); - flush_work(&dev->reset_work); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); nvme_release_cmb(dev); |