From a5f3d8a5eaaf917878f07998e6f1ea46024e6bab Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Tue, 26 Sep 2017 17:54:12 +0800
Subject: bcache: use llist_for_each_entry_safe() in __closure_wake_up()

Commit 09b3efec ("bcache: Don't reinvent the wheel but use existing llist
API") replaces the following while loop by llist_for_each_entry(),

-
-	while (reverse) {
-		cl = container_of(reverse, struct closure, list);
-		reverse = llist_next(reverse);
-
+	llist_for_each_entry(cl, reverse, list) {
 		closure_set_waiting(cl, 0);
 		closure_sub(cl, CLOSURE_WAITING + 1);
 	}

This modification introduces a potential race by iterating a corrupted
list. Here is how it happens.

In the above modification, closure_sub() may wake up a process which is
waiting on reverse list. If this process decides to wait again by calling
closure_wait(), its cl->list will be added to another wait list. Then
when llist_for_each_entry() continues to iterate next node, it will travel
on another new wait list which is added in closure_wait(), not the
original reverse list in __closure_wake_up(). It is more probably to
happen on UP machine because the waked up process may preempt the process
which wakes up it.

Use llist_for_each_entry_safe() will fix the issue, the safe version fetch
next node before waking up a process. Then the copy of next node will make
sure list iteration stays on original reverse list.

Fixes: 09b3efec81de ("bcache: Don't reinvent the wheel but use existing llist API")
Signed-off-by: Coly Li <colyli@suse.de>
Reported-by: Michael Lyle <mlyle@lyle.org>
Reviewed-by: Byungchul Park <byungchul.park@lge.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/closure.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 7d5286b05036..1841d0359bac 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(closure_put);
 void __closure_wake_up(struct closure_waitlist *wait_list)
 {
 	struct llist_node *list;
-	struct closure *cl;
+	struct closure *cl, *t;
 	struct llist_node *reverse = NULL;
 
 	list = llist_del_all(&wait_list->list);
@@ -73,7 +73,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list)
 	reverse = llist_reverse_order(list);
 
 	/* Then do the wakeups */
-	llist_for_each_entry(cl, reverse, list) {
+	llist_for_each_entry_safe(cl, t, reverse, list) {
 		closure_set_waiting(cl, 0);
 		closure_sub(cl, CLOSURE_WAITING + 1);
 	}
-- 
cgit v1.2.3


From 007a61ae2f35c7fcf767313285c4924e81f11983 Mon Sep 17 00:00:00 2001
From: Martin Wilck <mwilck@suse.com>
Date: Thu, 28 Sep 2017 21:33:23 +0200
Subject: nvme: fix visibility of "uuid" ns attribute

"uuid" must be invisible if both ns->uuid and ns->nguid are unset,
not if either one is.

Fixes: d934f9848a77 "nvme: provide UUID value to userspace"
Signed-off-by: Martin Wilck <mwilck@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index bb2aad078637..5a14cc7f28ee 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2136,7 +2136,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
 	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
 
 	if (a == &dev_attr_uuid.attr) {
-		if (uuid_is_null(&ns->uuid) ||
+		if (uuid_is_null(&ns->uuid) &&
 		    !memchr_inv(ns->nguid, 0, sizeof(ns->nguid)))
 			return 0;
 	}
-- 
cgit v1.2.3


From 6e60a3bbb45bd8b307269d6a821ee2c72d815846 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Mon, 2 Oct 2017 16:22:08 -0400
Subject: nbd: fix -ERESTARTSYS handling

Christoph made it so that if we return'ed BLK_STS_RESOURCE whenever we
got ERESTARTSYS from sending our packets we'd return BLK_STS_OK, which
means we'd never requeue and just hang.  We really need to return the
right value from the upper layer.

Fixes: fc17b6534eb8 ("blk-mq: switch ->queue_rq return value to blk_status_t")
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/nbd.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 3684e21d543f..883dfebd3014 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -820,9 +820,13 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
 	 * appropriate.
 	 */
 	ret = nbd_handle_cmd(cmd, hctx->queue_num);
+	if (ret < 0)
+		ret = BLK_STS_IOERR;
+	else if (!ret)
+		ret = BLK_STS_OK;
 	complete(&cmd->send_complete);
 
-	return ret < 0 ? BLK_STS_IOERR : BLK_STS_OK;
+	return ret;
 }
 
 static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
-- 
cgit v1.2.3


From 38b249bc0ca26d57dac65f5f659b39d88899d23d Mon Sep 17 00:00:00 2001
From: Wouter Verhelst <w@uter.be>
Date: Fri, 22 Sep 2017 12:09:54 +0200
Subject: MAINTAINERS: update list for NBD

nbd-general@sourceforge.net becomes nbd@other.debian.org, because
sourceforge is just a spamtrap these days.

Signed-off-by: Wouter Verhelst <w@uter.be>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 6671f375f7fc..17a643f670a4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9348,7 +9348,7 @@ NETWORK BLOCK DEVICE (NBD)
 M:	Josef Bacik <jbacik@fb.com>
 S:	Maintained
 L:	linux-block@vger.kernel.org
-L:	nbd-general@lists.sourceforge.net
+L:	nbd@other.debian.org
 F:	Documentation/blockdev/nbd.txt
 F:	drivers/block/nbd.c
 F:	include/uapi/linux/nbd.h
-- 
cgit v1.2.3


From 4f02fb7617ba12ac15d261c654b9759ea8f1f1ef Mon Sep 17 00:00:00 2001
From: Joseph Qi <qijiang.qj@alibaba-inc.com>
Date: Sat, 30 Sep 2017 14:38:49 +0800
Subject: blk-throttle: fix possible io stall when upgrade to max

There is a case which will lead to io stall. The case is described as
follows.
/test1
  |-subtest1
/test2
  |-subtest2
And subtest1 and subtest2 each has 32 queued bios already.

Now upgrade to max. In throtl_upgrade_state, it will try to dispatch
bios as follows:
1) tg=subtest1, do nothing;
2) tg=test1, transfer 32 queued bios from subtest1 to test1; no pending
left, no need to schedule next dispatch;
3) tg=subtest2, do nothing;
4) tg=test2, transfer 32 queued bios from subtest2 to test2; no pending
left, no need to schedule next dispatch;
5) tg=/, transfer 8 queued bios from test1 to /, 8 queued bios from
test2 to /, 8 queued bios from test1 to /, and 8 queued bios from test2
to /; note that test1 and test2 each still has 16 queued bios left;
6) tg=/, try to schedule next dispatch, but since disptime is now
(update in tg_update_disptime, wait=0), pending timer is not scheduled
in fact;
7) In throtl_upgrade_state it totally dispatches 32 queued bios and with
32 left. test1 and test2 each has 16 queued bios;
8) throtl_pending_timer_fn sees the left over bios, but could do
nothing, because throtl_select_dispatch returns 0, and test1/test2 has
no pending tg.

The blktrace shows the following:
8,32   0        0     2.539007641     0  m   N throtl upgrade to max
8,32   0        0     2.539072267     0  m   N throtl /test2 dispatch nr_queued=16 read=0 write=16
8,32   7        0     2.539077142     0  m   N throtl /test1 dispatch nr_queued=16 read=0 write=16

So force schedule dispatch if there are pending children.

Reviewed-by: Shaohua Li <shli@fb.com>
Signed-off-by: Joseph Qi <qijiang.qj@alibaba-inc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-throttle.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 0fea76aa0f3f..17816a028dcb 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1911,11 +1911,11 @@ static void throtl_upgrade_state(struct throtl_data *td)
 
 		tg->disptime = jiffies - 1;
 		throtl_select_dispatch(sq);
-		throtl_schedule_next_dispatch(sq, false);
+		throtl_schedule_next_dispatch(sq, true);
 	}
 	rcu_read_unlock();
 	throtl_select_dispatch(&td->service_queue);
-	throtl_schedule_next_dispatch(&td->service_queue, false);
+	throtl_schedule_next_dispatch(&td->service_queue, true);
 	queue_work(kthrotld_workqueue, &td->dispatch_work);
 }
 
-- 
cgit v1.2.3


From 6cd1a6fef7058de15405b13d6587538853279c7b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 3 Oct 2017 15:58:15 -0600
Subject: null_blk: change configfs dependency to select

A recent commit made null_blk depend on configfs, which is kind of
annoying since you now have to find this dependency and enable that
as well. Discovered this since I no longer had null_blk available
on a box I needed to debug, since it got killed when the config
updated after the configfs change was merged.

Fixes: 3bf2bd20734e ("nullb: add configfs interface")
Reviewed-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 4a438b8abe27..2dfe99b328f8 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -17,7 +17,7 @@ if BLK_DEV
 
 config BLK_DEV_NULL_BLK
 	tristate "Null test block driver"
-	depends on CONFIGFS_FS
+	select CONFIGFS_FS
 
 config BLK_DEV_FD
 	tristate "Normal floppy disk support"
-- 
cgit v1.2.3


From 70e62f4bacdf31ea8a59f241c9229120cd06d9d1 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 3 Oct 2017 14:57:16 -0700
Subject: blk-mq-debugfs: fix device sched directory for default scheduler

In blk_mq_debugfs_register(), I remembered to set up the per-hctx sched
directories if a default scheduler was already configured by
blk_mq_sched_init() from blk_mq_init_allocated_queue(), but I didn't do
the same for the device-wide sched directory. Fix it.

Fixes: d332ce091813 ("blk-mq-debugfs: allow schedulers to register debugfs attributes")
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 980e73095643..de294d775acf 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -815,10 +815,14 @@ int blk_mq_debugfs_register(struct request_queue *q)
 		goto err;
 
 	/*
-	 * blk_mq_init_hctx() attempted to do this already, but q->debugfs_dir
+	 * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir
 	 * didn't exist yet (because we don't know what to name the directory
 	 * until the queue is registered to a gendisk).
 	 */
+	if (q->elevator && !q->sched_debugfs_dir)
+		blk_mq_debugfs_register_sched(q);
+
+	/* Similarly, blk_mq_init_hctx() couldn't do this previously. */
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (!hctx->debugfs_dir && blk_mq_debugfs_register_hctx(q, hctx))
 			goto err;
-- 
cgit v1.2.3


From 8969f1f8291762c13147c1ba89d46238af01675b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 1 Oct 2017 09:37:35 +0200
Subject: nvme-pci: Use PCI bus address for data/queues in CMB

Currently, NVMe PCI host driver is programming CMB dma address as
I/O SQs addresses. This results in failures on systems where 1:1
outbound mapping is not used (example Broadcom iProc SOCs) because
CMB BAR will be progammed with PCI bus address but NVMe PCI EP will
try to access CMB using dma address.

To have CMB working on systems without 1:1 outbound mapping, we
program PCI bus address for I/O SQs instead of dma address. This
approach will work on systems with/without 1:1 outbound mapping.

Based on a report and previous patch from Abhishek Shah.

Fixes: 8ffaadf7 ("NVMe: Use CMB for the IO SQes if available")
Cc: stable@vger.kernel.org
Reported-by: Abhishek Shah <abhishek.shah@broadcom.com>
Tested-by: Abhishek Shah <abhishek.shah@broadcom.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index cb73bc8cad3b..3f5a04c586ce 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -94,7 +94,7 @@ struct nvme_dev {
 	struct mutex shutdown_lock;
 	bool subsystem;
 	void __iomem *cmb;
-	dma_addr_t cmb_dma_addr;
+	pci_bus_addr_t cmb_bus_addr;
 	u64 cmb_size;
 	u32 cmbsz;
 	u32 cmbloc;
@@ -1226,7 +1226,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
 	if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
 		unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
 						      dev->ctrl.page_size);
-		nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
+		nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
 		nvmeq->sq_cmds_io = dev->cmb + offset;
 	} else {
 		nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
@@ -1527,7 +1527,7 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
 	resource_size_t bar_size;
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 	void __iomem *cmb;
-	dma_addr_t dma_addr;
+	int bar;
 
 	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
 	if (!(NVME_CMB_SZ(dev->cmbsz)))
@@ -1540,7 +1540,8 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
 	szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
 	size = szu * NVME_CMB_SZ(dev->cmbsz);
 	offset = szu * NVME_CMB_OFST(dev->cmbloc);
-	bar_size = pci_resource_len(pdev, NVME_CMB_BIR(dev->cmbloc));
+	bar = NVME_CMB_BIR(dev->cmbloc);
+	bar_size = pci_resource_len(pdev, bar);
 
 	if (offset > bar_size)
 		return NULL;
@@ -1553,12 +1554,11 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
 	if (size > bar_size - offset)
 		size = bar_size - offset;
 
-	dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(dev->cmbloc)) + offset;
-	cmb = ioremap_wc(dma_addr, size);
+	cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
 	if (!cmb)
 		return NULL;
 
-	dev->cmb_dma_addr = dma_addr;
+	dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
 	dev->cmb_size = size;
 	return cmb;
 }
-- 
cgit v1.2.3


From eab40cf336065e8d765e006b81ff48c5c114b365 Mon Sep 17 00:00:00 2001
From: Benjamin Block <bblock@linux.vnet.ibm.com>
Date: Tue, 3 Oct 2017 12:48:37 +0200
Subject: bsg-lib: fix use-after-free under memory-pressure

When under memory-pressure it is possible that the mempool which backs
the 'struct request_queue' will make use of up to BLKDEV_MIN_RQ count
emergency buffers - in case it can't get a regular allocation. These
buffers are preallocated and once they are also used, they are
re-supplied with old finished requests from the same request_queue (see
mempool_free()).

The bug is, when re-supplying the emergency pool, the old requests are
not again ran through the callback mempool_t->alloc(), and thus also not
through the callback bsg_init_rq(). Thus we skip initialization, and
while the sense-buffer still should be good, scsi_request->cmd might
have become to be an invalid pointer in the meantime. When the request
is initialized in bsg.c, and the user's CDB is larger than BLK_MAX_CDB,
bsg will replace it with a custom allocated buffer, which is freed when
the user's command is finished, thus it dangles afterwards. When next a
command is sent by the user that has a smaller/similar CDB as
BLK_MAX_CDB, bsg will assume that scsi_request->cmd is backed by
scsi_request->__cmd, will not make a custom allocation, and write into
undefined memory.

Fix this by splitting bsg_init_rq() into two functions:
 - bsg_init_rq() is changed to only do the allocation of the
   sense-buffer, which is used to back the bsg job's reply buffer. This
   pointer should never change during the lifetime of a scsi_request, so
   it doesn't need re-initialization.
 - bsg_initialize_rq() is a new function that makes use of
   'struct request_queue's initialize_rq_fn callback (which was
   introduced in v4.12). This is always called before the request is
   given out via blk_get_request(). This function does the remaining
   initialization that was previously done in bsg_init_rq(), and will
   also do it when the request is taken from the emergency-pool of the
   backing mempool.

Fixes: 50b4d485528d ("bsg-lib: fix kernel panic resulting from missing allocation of reply-buffer")
Cc: <stable@vger.kernel.org> # 4.11+
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Benjamin Block <bblock@linux.vnet.ibm.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bsg-lib.c | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index dbddff8174e5..15d25ccd51a5 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -207,20 +207,34 @@ static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp)
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
 	struct scsi_request *sreq = &job->sreq;
 
+	/* called right after the request is allocated for the request_queue */
+
+	sreq->sense = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp);
+	if (!sreq->sense)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void bsg_initialize_rq(struct request *req)
+{
+	struct bsg_job *job = blk_mq_rq_to_pdu(req);
+	struct scsi_request *sreq = &job->sreq;
+	void *sense = sreq->sense;
+
+	/* called right before the request is given to the request_queue user */
+
 	memset(job, 0, sizeof(*job));
 
 	scsi_req_init(sreq);
+
+	sreq->sense = sense;
 	sreq->sense_len = SCSI_SENSE_BUFFERSIZE;
-	sreq->sense = kzalloc(sreq->sense_len, gfp);
-	if (!sreq->sense)
-		return -ENOMEM;
 
 	job->req = req;
-	job->reply = sreq->sense;
+	job->reply = sense;
 	job->reply_len = sreq->sense_len;
 	job->dd_data = job + 1;
-
-	return 0;
 }
 
 static void bsg_exit_rq(struct request_queue *q, struct request *req)
@@ -251,6 +265,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 	q->cmd_size = sizeof(struct bsg_job) + dd_job_size;
 	q->init_rq_fn = bsg_init_rq;
 	q->exit_rq_fn = bsg_exit_rq;
+	q->initialize_rq_fn = bsg_initialize_rq;
 	q->request_fn = bsg_request_fn;
 
 	ret = blk_init_allocated_queue(q);
-- 
cgit v1.2.3