From 57c36519e4b949f89381053f7283f5d605595b42 Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Wed, 16 Jan 2019 18:53:26 -0500
Subject: dm: fix clone_bio() to trigger blk_recount_segments()

DM's clone_bio() now benefits from using bio_trim() by fixing the fact
that clone_bio() wasn't clearing BIO_SEG_VALID like bio_trim() does;
which triggers blk_recount_segments() via bio_phys_segments().

Reviewed-by: Ming Lei
Signed-off-by: Mike Snitzer
---
 drivers/md/dm.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'drivers/md/dm.c')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index d67c95ef8d7e..fcb97b0a5743 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1320,7 +1320,7 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
 
 	__bio_clone_fast(clone, bio);
 
-	if (unlikely(bio_integrity(bio) != NULL)) {
+	if (bio_integrity(bio)) {
 		int r;
 
 		if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
@@ -1336,11 +1336,7 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
 		return r;
 	}
 
-	bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
-	clone->bi_iter.bi_size = to_bytes(len);
-
-	if (unlikely(bio_integrity(bio) != NULL))
-		bio_integrity_trim(clone);
+	bio_trim(clone, sector - clone->bi_iter.bi_sector, len);
 
 	return 0;
 }
-- cgit v1.2.3


From a1e1cb72d96491277ede8d257ce6b48a381dd336 Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Thu, 17 Jan 2019 10:48:01 -0500
Subject: dm: fix redundant IO accounting for bios that need splitting

The risk of redundant IO accounting was not taken into consideration
when commit 18a25da84354 ("dm: ensure bio submission follows a
depth-first tree walk") introduced IO splitting in terms of recursion
via generic_make_request().

Fix this by subtracting the split bio's payload from the IO stats that
were already accounted for by start_io_acct() upon dm_make_request()
entry.  This repeat oscillation of the IO accounting, up then down,
isn't ideal but refactoring DM core's IO splitting to pre-split bios
_before_ they are accounted turned out to be an excessive amount of
change that will need a full development cycle to refine and verify.

Before this fix:

/dev/mapper/stripe_dev is a 4-way stripe using a 32k chunksize, so
bios are split on 32k boundaries.

# fio --name=16M --filename=/dev/mapper/stripe_dev --rw=write --bs=64k --size=16M \
      --iodepth=1 --ioengine=libaio --direct=1 --refill_buffers

with debugging added:
[103898.310264] device-mapper: core: start_io_acct: dm-2 WRITE bio->bi_iter.bi_sector=0 len=128
[103898.318704] device-mapper: core: __split_and_process_bio: recursing for following split bio:
[103898.329136] device-mapper: core: start_io_acct: dm-2 WRITE bio->bi_iter.bi_sector=64 len=64
...
16M written yet 136M (278528 * 512b) accounted:
# cat /sys/block/dm-2/stat | awk '{ print $7 }'
278528

After this fix:

16M written and 16M (32768 * 512b) accounted:
# cat /sys/block/dm-2/stat | awk '{ print $7 }'
32768

Fixes: 18a25da84354 ("dm: ensure bio submission follows a depth-first tree walk")
Cc: stable@vger.kernel.org # 4.16+
Reported-by: Bryan Gurney
Reviewed-by: Ming Lei
Signed-off-by: Mike Snitzer
---
 drivers/md/dm.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'drivers/md/dm.c')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index fcb97b0a5743..fbadda68e23b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1584,6 +1584,9 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
 	ci->sector = bio->bi_iter.bi_sector;
 }
 
+#define __dm_part_stat_sub(part, field, subnd) \
+	(part_stat_get(part, field) -= (subnd))
+
 /*
  * Entry point to split a bio into clones and submit them to the targets.
  */
@@ -1638,6 +1641,19 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
 			struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
 						  GFP_NOIO, &md->queue->bio_split);
 			ci.io->orig_bio = b;
+
+			/*
+			 * Adjust IO stats for each split, otherwise upon queue
+			 * reentry there will be redundant IO accounting.
+			 * NOTE: this is a stop-gap fix, a proper fix involves
+			 * significant refactoring of DM core's bio splitting
+			 * (by eliminating DM's splitting and just using bio_split)
+			 */
+			part_stat_lock();
+			__dm_part_stat_sub(&dm_disk(md)->part0,
+					   sectors[op_stat_group(bio_op(bio))], ci.sector_count);
+			part_stat_unlock();
+
 			bio_chain(b, bio);
 			ret = generic_make_request(bio);
 			break;
-- cgit v1.2.3


From 6548c7c538e5658cbce686c2dd1a9b4f5398bf34 Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Thu, 17 Jan 2019 14:33:01 -0500
Subject: dm: fix dm_wq_work() to only use __split_and_process_bio() if appropriate

Otherwise targets that don't support/expect IO splitting could resubmit
bios using code paths with unnecessary IO splitting complexity.

Depends-on: 24113d487843 ("dm: avoid indirect call in __dm_make_request")
Fixes: 978e51ba38e00 ("dm: optimize bio-based NVMe IO submission")
Signed-off-by: Mike Snitzer
---
 drivers/md/dm.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'drivers/md/dm.c')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index fbadda68e23b..a010002d892b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1725,6 +1725,15 @@ out:
 	return ret;
 }
 
+static blk_qc_t dm_process_bio(struct mapped_device *md,
+			       struct dm_table *map, struct bio *bio)
+{
+	if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
+		return __process_bio(md, map, bio);
+	else
+		return __split_and_process_bio(md, map, bio);
+}
+
 static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct mapped_device *md = q->queuedata;
@@ -1745,10 +1754,7 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
 		return ret;
 	}
 
-	if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
-		ret = __process_bio(md, map, bio);
-	else
-		ret = __split_and_process_bio(md, map, bio);
+	ret = dm_process_bio(md, map, bio);
 
 	dm_put_live_table(md, srcu_idx);
 	return ret;
@@ -2427,9 +2433,9 @@ static void dm_wq_work(struct work_struct *work)
 			break;
 
 		if (dm_request_based(md))
-			generic_make_request(c);
+			(void) generic_make_request(c);
 		else
-			__split_and_process_bio(md, map, c);
+			(void) dm_process_bio(md, map, c);
 	}
 
 	dm_put_live_table(md, srcu_idx);
-- cgit v1.2.3


From 075c18c3e124a1511ebc10a89f1858c8a77dcb01 Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Fri, 18 Jan 2019 01:21:11 -0500
Subject: dm: add missing trace_block_split() to __split_and_process_bio()

Provides useful context about bio splits in blktrace.

Signed-off-by: Mike Snitzer
---
 drivers/md/dm.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md/dm.c')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a010002d892b..2b53c3841b53 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1655,6 +1655,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
 			part_stat_unlock();
 
 			bio_chain(b, bio);
+			trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
 			ret = generic_make_request(bio);
 			break;
 		}
-- cgit v1.2.3


From 645efa84f6c7566ea863ed37a8b3247247f72e02 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Tue, 5 Feb 2019 05:09:00 -0500
Subject: dm: add memory barrier before waitqueue_active

Block core changes to switch bio-based IO accounting to be percpu had a
side-effect of altering DM core to now rely on calling waitqueue_active
(in both bio-based and request-based) to check if another task is in
dm_wait_for_completion().

A memory barrier is needed before calling waitqueue_active().  DM core
doesn't piggyback on a preceding memory barrier so it must explicitly
use its own.

For more details on why using waitqueue_active() without a preceding
barrier is unsafe, please see the comment before the waitqueue_active()
definition in include/linux/wait.h.

Add the missing memory barrier by switching to using wq_has_sleeper().

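For illustration only: a minimal sketch of the waker/waiter pairing that the
include/linux/wait.h comment describes. The names done, wq, complete_work()
and wait_for_work() are placeholders, not DM's actual md->wait machinery; the
point is that wq_has_sleeper() is waitqueue_active() preceded by smp_mb().

#include <linux/wait.h>
#include <linux/sched.h>

static bool done;
static DECLARE_WAIT_QUEUE_HEAD(wq);

/* Waker side (e.g. an IO completion path). */
static void complete_work(void)
{
	done = true;
	/*
	 * wq_has_sleeper() issues smp_mb() before waitqueue_active(), so
	 * the store to "done" is ordered before the lockless check of the
	 * wait queue; a bare waitqueue_active() provides no such ordering
	 * and the wakeup can be missed.
	 */
	if (wq_has_sleeper(&wq))
		wake_up(&wq);
}

/* Waiter side (a dm_wait_for_completion()-style loop). */
static void wait_for_work(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
		if (done)
			break;
		io_schedule();
	}
	finish_wait(&wq, &wait);
}
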
Fixes: 6f75723190d8 ("dm: remove the pending IO accounting")
Fixes: c4576aed8d85 ("dm: fix request-based dm's use of dm_wait_for_completion")
Signed-off-by: Mikulas Patocka
Signed-off-by: Mike Snitzer
---
 drivers/md/dm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/dm.c')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2b53c3841b53..a0972a9301de 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -699,7 +699,7 @@ static void end_io_acct(struct dm_io *io)
 				    true, duration, &io->stats_aux);
 
 	/* nudge anyone waiting on suspend queue */
-	if (unlikely(waitqueue_active(&md->wait)))
+	if (unlikely(wq_has_sleeper(&md->wait)))
 		wake_up(&md->wait);
 }
 
-- cgit v1.2.3


From fa8db4948f5224dae33a0e783e7dec682e145f88 Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Tue, 5 Feb 2019 17:07:58 -0500
Subject: dm: don't use bio_trim() afterall

bio_trim() has an early return, which makes it _not_ idempotent, if the
offset is 0 and the bio's bi_size already matches the requested size.

Prior to DM, all users of bio_trim() were fine with this.  But DM has
exposed the fact that bio_trim()'s early return is incompatible with a
cloned bio whose integrity payload must be trimmed via
bio_integrity_trim().

Fix this by reverting DM back to doing the equivalent of bio_trim() but
in an idempotent manner (so bio_integrity_trim is always performed).

Follow-on work is needed to assess what benefit bio_trim()'s early
return is providing to its existing callers.

Reported-by: Milan Broz
Fixes: 57c36519e4b94 ("dm: fix clone_bio() to trigger blk_recount_segments()")
Signed-off-by: Mike Snitzer
---
 drivers/md/dm.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'drivers/md/dm.c')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a0972a9301de..515e6af9bed2 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1336,7 +1336,11 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio,
 		return r;
 	}
 
-	bio_trim(clone, sector - clone->bi_iter.bi_sector, len);
+	bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
+	clone->bi_iter.bi_size = to_bytes(len);
+
+	if (bio_integrity(bio))
+		bio_integrity_trim(clone);
 
 	return 0;
 }
-- cgit v1.2.3
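
For reference, a sketch of the early-return shape described above. This
approximates bio_trim() from block/bio.c of that era and is not a verbatim
copy; when offset is 0 and the requested size already matches bi_size, it
returns before reaching bio_integrity_trim(), which is exactly the step DM's
cloned bio still needs.

/*
 * Approximate sketch of bio_trim() as it looked at the time; not a
 * verbatim copy of block/bio.c.  offset and size are in sectors.
 */
void bio_trim(struct bio *bio, int offset, int size)
{
	size <<= 9;
	if (offset == 0 && size == bio->bi_iter.bi_size)
		return;	/* early return: the integrity trim below is skipped */

	bio_clear_flag(bio, BIO_SEG_VALID);

	bio_advance(bio, offset << 9);
	bio->bi_iter.bi_size = size;

	if (bio_integrity(bio))
		bio_integrity_trim(bio);
}

The open-coded trim restored by the patch above performs the advance, the
resize and bio_integrity_trim() unconditionally, which is what makes it
idempotent where bio_trim() is not.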