From aa306ab703e9452b1e25cc8e8f04b8df523d0bb8 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 24 Jul 2019 11:48:39 +0800 Subject: blk-mq: introduce blk_mq_request_completed() NVMe needs this function to decide if one request to be aborted has been completed in normal IO path already. So introduce it. Cc: Max Gurtovoy Cc: Sagi Grimberg Cc: Keith Busch Cc: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 3fa1fa59f9b2..baac2926e54a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -296,6 +296,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) int blk_mq_request_started(struct request *rq); +int blk_mq_request_completed(struct request *rq); void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); -- cgit v1.2.3 From f9934a80f91dba8c7029ba7601459e41ea7770aa Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 24 Jul 2019 11:48:40 +0800 Subject: blk-mq: introduce blk_mq_tagset_wait_completed_request() blk-mq may schedule to call queue's complete function on remote CPU via IPI, but doesn't provide any way to synchronize the request's complete fn. The current queue freeze interface can't provide the synchonization because aborted requests stay at blk-mq queues during EH. In some driver's EH(such as NVMe), hardware queue's resource may be freed & re-allocated. If the completed request's complete fn is run finally after the hardware queue's resource is released, kernel crash will be triggered. Prepare for fixing this kind of issue by introducing blk_mq_tagset_wait_completed_request(). Cc: Max Gurtovoy Cc: Sagi Grimberg Cc: Keith Busch Cc: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index baac2926e54a..ee0719b649b6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -322,6 +322,7 @@ bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); +void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_freeze_queue_start(struct request_queue *q); -- cgit v1.2.3 From a87ccce0b5a06ee546931859fa62e10f1bce54f9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 24 Jul 2019 11:48:43 +0800 Subject: blk-mq: remove blk_mq_complete_request_sync blk_mq_tagset_wait_completed_request() has been applied for waiting for completed request's fn, so not necessary to use blk_mq_complete_request_sync() any more. Cc: Max Gurtovoy Cc: Sagi Grimberg Cc: Keith Busch Cc: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ee0719b649b6..1cdd2788cfa6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -305,7 +305,6 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); bool blk_mq_complete_request(struct request *rq); -void blk_mq_complete_request_sync(struct request *rq); bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); bool blk_mq_queue_stopped(struct request_queue *q); -- cgit v1.2.3 From af2c68fe94e8c0a628519b60ba070c5cf6526a99 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 1 Aug 2019 15:50:40 -0700 Subject: block: Declare several function pointer arguments 'const' Make it clear to the compiler and also to humans that the functions that query request queue properties do not modify any member of the request_queue data structure. Reviewed-by: Johannes Thumshirn Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1ef375dafb1c..96a29a72fd4a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1232,42 +1232,42 @@ enum blk_default_limits { BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; -static inline unsigned long queue_segment_boundary(struct request_queue *q) +static inline unsigned long queue_segment_boundary(const struct request_queue *q) { return q->limits.seg_boundary_mask; } -static inline unsigned long queue_virt_boundary(struct request_queue *q) +static inline unsigned long queue_virt_boundary(const struct request_queue *q) { return q->limits.virt_boundary_mask; } -static inline unsigned int queue_max_sectors(struct request_queue *q) +static inline unsigned int queue_max_sectors(const struct request_queue *q) { return q->limits.max_sectors; } -static inline unsigned int queue_max_hw_sectors(struct request_queue *q) +static inline unsigned int queue_max_hw_sectors(const struct request_queue *q) { return q->limits.max_hw_sectors; } -static inline unsigned short queue_max_segments(struct request_queue *q) +static inline unsigned short queue_max_segments(const struct request_queue *q) { return q->limits.max_segments; } -static inline unsigned short queue_max_discard_segments(struct request_queue *q) +static inline unsigned short queue_max_discard_segments(const struct request_queue *q) { return q->limits.max_discard_segments; } -static inline unsigned int queue_max_segment_size(struct request_queue *q) +static inline unsigned int queue_max_segment_size(const struct request_queue *q) { return q->limits.max_segment_size; } -static inline unsigned short queue_logical_block_size(struct request_queue *q) +static inline unsigned short queue_logical_block_size(const struct request_queue *q) { int retval = 512; @@ -1282,7 +1282,7 @@ static inline unsigned short bdev_logical_block_size(struct block_device *bdev) return queue_logical_block_size(bdev_get_queue(bdev)); } -static inline unsigned int queue_physical_block_size(struct request_queue *q) +static inline unsigned int queue_physical_block_size(const struct request_queue *q) { return q->limits.physical_block_size; } @@ -1292,7 +1292,7 @@ static inline unsigned int bdev_physical_block_size(struct block_device *bdev) return queue_physical_block_size(bdev_get_queue(bdev)); } -static inline unsigned int queue_io_min(struct request_queue *q) +static inline unsigned int queue_io_min(const struct request_queue *q) { return q->limits.io_min; } @@ -1302,7 +1302,7 @@ static inline int bdev_io_min(struct block_device *bdev) return queue_io_min(bdev_get_queue(bdev)); } -static inline unsigned int queue_io_opt(struct request_queue *q) +static inline unsigned int queue_io_opt(const struct request_queue *q) { return q->limits.io_opt; } @@ -1312,7 +1312,7 @@ static inline int bdev_io_opt(struct block_device *bdev) return queue_io_opt(bdev_get_queue(bdev)); } -static inline int queue_alignment_offset(struct request_queue *q) +static inline int queue_alignment_offset(const struct request_queue *q) { if (q->limits.misaligned) return -1; @@ -1342,7 +1342,7 @@ static inline int bdev_alignment_offset(struct block_device *bdev) return q->limits.alignment_offset; } -static inline int queue_discard_alignment(struct request_queue *q) +static inline int queue_discard_alignment(const struct request_queue *q) { if (q->limits.discard_misaligned) return -1; @@ -1432,7 +1432,7 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev) return 0; } -static inline int queue_dma_alignment(struct request_queue *q) +static inline int queue_dma_alignment(const struct request_queue *q) { return q ? q->dma_alignment : 511; } @@ -1543,7 +1543,7 @@ static inline void blk_queue_max_integrity_segments(struct request_queue *q, } static inline unsigned short -queue_max_integrity_segments(struct request_queue *q) +queue_max_integrity_segments(const struct request_queue *q) { return q->limits.max_integrity_segments; } @@ -1626,7 +1626,7 @@ static inline void blk_queue_max_integrity_segments(struct request_queue *q, unsigned int segs) { } -static inline unsigned short queue_max_integrity_segments(struct request_queue *q) +static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { return 0; } -- cgit v1.2.3 From 012d4a652ca172d93315cb69f2adf7df37ea77e6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 1 Aug 2019 15:39:07 -0700 Subject: block: Fix spelling in the header above blkg_lookup() See also commit 8f4236d9008b ("block: remove QUEUE_FLAG_BYPASS and ->bypass") # v5.0. Cc: Christoph Hellwig Cc: Hannes Reinecke Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 12811091fd50..0bb79d858a13 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -375,7 +375,7 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read loc. + * under RCU read lock. */ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) -- cgit v1.2.3 From e84e8f0663956f45c747df5629046794cff93893 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 1 Aug 2019 10:26:35 -0700 Subject: block: add req op to reset all zones and flag This patch introduces a new request operation REQ_OP_ZONE_RESET_ALL. This is useful for the applications like mkfs where it needs to reset all the zones present on the underlying block device. As part for this patch we also introduce new QUEUE_FLAG_ZONE_RESETALL which indicates the queue zone reset all capability and corresponding helper macro. Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1b1fa1557e68..d6ce7b3ec8b1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -282,6 +282,8 @@ enum req_opf { REQ_OP_ZONE_RESET = 6, /* write the same sector many times */ REQ_OP_WRITE_SAME = 7, + /* reset all the zone present on the device */ + REQ_OP_ZONE_RESET_ALL = 8, /* write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = 9, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 96a29a72fd4a..167bf879f072 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -611,6 +611,7 @@ struct request_queue { #define QUEUE_FLAG_SCSI_PASSTHROUGH 23 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ +#define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP)) @@ -630,6 +631,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) +#define blk_queue_zone_resetall(q) \ + test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) -- cgit v1.2.3 From 226b4fc75c78f9c497c5182d939101b260cfb9f3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 25 Jul 2019 10:04:59 +0800 Subject: blk-mq: add callback of .cleanup_rq SCSI maintains its own driver private data hooked off of each SCSI request, and the pridate data won't be freed after scsi_queue_rq() returns BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE. An upper layer driver (e.g. dm-rq) may need to retry these SCSI requests, before SCSI has fully dispatched them, due to a lower level SCSI driver's resource limitation identified in scsi_queue_rq(). Currently SCSI's per-request private data is leaked when the upper layer driver (dm-rq) frees and then retries these requests in response to BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE returns from scsi_queue_rq(). This usecase is so specialized that it doesn't warrant training an existing blk-mq interface (e.g. blk_mq_free_request) to allow SCSI to account for freeing its driver private data -- doing so would add an extra branch for handling a special case that all other consumers of SCSI (and blk-mq) won't ever need to worry about. So the most pragmatic way forward is to delegate freeing SCSI driver private data to the upper layer driver (dm-rq). Do so by adding new .cleanup_rq callback and calling a new blk_mq_cleanup_rq() method from dm-rq. A following commit will implement the .cleanup_rq() hook in scsi_mq_ops. Cc: Ewan D. Milne Cc: Bart Van Assche Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Mike Snitzer Cc: dm-devel@redhat.com Cc: Fixes: 396eaf21ee17 ("blk-mq: improve DM's blk-mq IO merging via blk_insert_cloned_request feedback") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1cdd2788cfa6..21cebe901ac0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -140,6 +140,7 @@ typedef int (poll_fn)(struct blk_mq_hw_ctx *); typedef int (map_queues_fn)(struct blk_mq_tag_set *set); typedef bool (busy_fn)(struct request_queue *); typedef void (complete_fn)(struct request *); +typedef void (cleanup_rq_fn)(struct request *); struct blk_mq_ops { @@ -200,6 +201,12 @@ struct blk_mq_ops { /* Called from inside blk_get_request() */ void (*initialize_rq_fn)(struct request *rq); + /* + * Called before freeing one request which isn't completed yet, + * and usually for freeing the driver private data + */ + cleanup_rq_fn *cleanup_rq; + /* * If set, returns whether or not this queue currently is busy */ @@ -367,4 +374,10 @@ static inline blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, BLK_QC_T_INTERNAL; } +static inline void blk_mq_cleanup_rq(struct request *rq) +{ + if (rq->q->mq_ops->cleanup_rq) + rq->q->mq_ops->cleanup_rq(rq); +} + #endif -- cgit v1.2.3 From 98d87f70f4ab84b9e50e16b7848937ae07518cd4 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Wed, 31 Jul 2019 11:41:33 +0200 Subject: lightnvm: remove nvm_submit_io_sync_fn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the redundant sync handling interface and wait for a completion in the lightnvm core instead. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Hans Holmberg Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 4d0d5655c7b2..8891647b24b1 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -89,7 +89,6 @@ typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int, struct nvm_chk_meta *); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int); typedef void (nvm_destroy_dma_pool_fn)(void *); typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, @@ -104,7 +103,6 @@ struct nvm_dev_ops { nvm_get_chk_meta_fn *get_chk_meta; nvm_submit_io_fn *submit_io; - nvm_submit_io_sync_fn *submit_io_sync; nvm_create_dma_pool_fn *create_dma_pool; nvm_destroy_dma_pool_fn *destroy_dma_pool; -- cgit v1.2.3 From 48e5da725581c1f7444e45cccbafc33e11430b48 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Wed, 31 Jul 2019 11:41:34 +0200 Subject: lightnvm: move metadata mapping to lower level driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that blk_rq_map_kern can map both kmem and vmem, move internal metadata mapping down to the lower level driver. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Hans Holmberg Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 8891647b24b1..ee8ec2e68055 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -88,7 +88,7 @@ typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int, struct nvm_chk_meta *); -typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *, void *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int); typedef void (nvm_destroy_dma_pool_fn)(void *); typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, @@ -680,8 +680,8 @@ extern int nvm_get_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr, int, struct nvm_chk_meta *); extern int nvm_set_chunk_meta(struct nvm_tgt_dev *, struct ppa_addr *, int, int); -extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); -extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *); +extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *, void *); +extern int nvm_submit_io_sync(struct nvm_tgt_dev *, struct nvm_rq *, void *); extern void nvm_end_io(struct nvm_rq *); #else /* CONFIG_NVM */ -- cgit v1.2.3 From b8e24a9300b0836a9d39f6b20746766b3b81f1bd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 8 Aug 2019 15:03:00 -0400 Subject: block: annotate refault stalls from IO submission psi tracks the time tasks wait for refaulting pages to become uptodate, but it does not track the time spent submitting the IO. The submission part can be significant if backing storage is contended or when cgroup throttling (io.latency) is in effect - a lot of time is spent in submit_bio(). In that case, we underreport memory pressure. Annotate submit_bio() to account submission time as memory stall when the bio is reading userspace workingset pages. Tested-by: Suren Baghdasaryan Signed-off-by: Johannes Weiner Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d6ce7b3ec8b1..5a1118d4ef7e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -209,6 +209,7 @@ enum { BIO_BOUNCED, /* bio is a bounce bio */ BIO_USER_MAPPED, /* contains user pages */ BIO_NULL_MAPPED, /* contains invalid user pages */ + BIO_WORKINGSET, /* contains userspace workingset pages */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ -- cgit v1.2.3 From 988721db93b2f5e6477cb0ea0b64ba9bcfd67778 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 16 Aug 2019 14:12:33 -0700 Subject: block: remove struct request_queue queue_head The dispatch list is not used any more, as the legacy block IO stack has been removed. Reviewed-by: Bart Van Assche Reviewed-by: Ming Lei Signed-off-by: Junxiao Bi Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 167bf879f072..4798bb25f1ee 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -391,10 +391,6 @@ static inline int blkdev_reset_zones_ioctl(struct block_device *bdev, #endif /* CONFIG_BLK_DEV_ZONED */ struct request_queue { - /* - * Together with queue_head for cacheline sharing - */ - struct list_head queue_head; struct request *last_merge; struct elevator_queue *elevator; -- cgit v1.2.3 From 5b9cce4c7eb0696558dfd4946074ae1fb9d8f05d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2019 09:06:52 -0700 Subject: writeback: Generalize and expose wb_completion wb_completion is used to track writeback completions. We want to use it from memcg side for foreign inode flushes. This patch updates it to remember the target waitq instead of assuming bdi->wb_waitq and expose it outside of fs-writeback.c. Reviewed-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 20 ++++++++++++++++++++ include/linux/backing-dev.h | 2 ++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 6a1a8a314d85..8fb740178d5d 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -67,6 +67,26 @@ enum wb_reason { WB_REASON_MAX, }; +struct wb_completion { + atomic_t cnt; + wait_queue_head_t *waitq; +}; + +#define __WB_COMPLETION_INIT(_waitq) \ + (struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) } + +/* + * If one wants to wait for one or more wb_writeback_works, each work's + * ->done should be set to a wb_completion defined using the following + * macro. Once all work items are issued with wb_queue_work(), the caller + * can wait for the completion of all using wb_wait_for_completion(). Work + * items which are waited upon aren't freed automatically on completion. + */ +#define WB_COMPLETION_INIT(bdi) __WB_COMPLETION_INIT(&(bdi)->wb_waitq) + +#define DEFINE_WB_COMPLETION(cmpl, bdi) \ + struct wb_completion cmpl = WB_COMPLETION_INIT(bdi) + /* * For cgroup writeback, multiple wb's may map to the same blkcg. Those * wb's can operate mostly independently but should share the congested diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 35b31d176f74..02650b1253a2 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -44,6 +44,8 @@ void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); +void wb_wait_for_completion(struct wb_completion *done); + extern spinlock_t bdi_lock; extern struct list_head bdi_list; -- cgit v1.2.3 From 34f8fe501f0624de115d087680c84000b5d9abc9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2019 09:06:53 -0700 Subject: bdi: Add bdi->id There currently is no way to universally identify and lookup a bdi without holding a reference and pointer to it. This patch adds an non-recycling bdi->id and implements bdi_get_by_id() which looks up bdis by their ids. This will be used by memcg foreign inode flushing. I left bdi_list alone for simplicity and because while rb_tree does support rcu assignment it doesn't seem to guarantee lossless walk when walk is racing aginst tree rebalance operations. Reviewed-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 2 ++ include/linux/backing-dev.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8fb740178d5d..1075f2552cfc 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -185,6 +185,8 @@ struct bdi_writeback { }; struct backing_dev_info { + u64 id; + struct rb_node rb_node; /* keyed by ->id */ struct list_head bdi_list; unsigned long ra_pages; /* max readahead in PAGE_SIZE units */ unsigned long io_pages; /* max allowed IO size */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 02650b1253a2..84cdcfbc763f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -24,6 +24,7 @@ static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) return bdi; } +struct backing_dev_info *bdi_get_by_id(u64 id); void bdi_put(struct backing_dev_info *bdi); __printf(2, 3) -- cgit v1.2.3 From ed288dc0d4aa29f65bd25b31b5cb866aa5664ff9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2019 09:06:54 -0700 Subject: writeback: Separate out wb_get_lookup() from wb_get_create() Separate out wb_get_lookup() which doesn't try to create one if there isn't already one from wb_get_create(). This will be used by later patches. Reviewed-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 84cdcfbc763f..97967ce06de3 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -230,6 +230,8 @@ static inline int bdi_sched_wait(void *word) struct bdi_writeback_congested * wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp); void wb_congested_put(struct bdi_writeback_congested *congested); +struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); -- cgit v1.2.3 From d62241c7a406f0680d702bd974f6f17e28ab8e5d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2019 09:06:55 -0700 Subject: writeback, memcg: Implement cgroup_writeback_by_id() Implement cgroup_writeback_by_id() which initiates cgroup writeback from bdi and memcg IDs. This will be used by memcg foreign inode flushing. v2: Use wb_get_lookup() instead of wb_get_create() to avoid creating spurious wbs. v3: Interpret 0 @nr as 1.25 * nr_dirty to implement best-effort flushing while avoding possible livelocks. Reviewed-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/writeback.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 8945aac31392..a19d845dd7eb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -217,6 +217,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes); +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages, + enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(void); /** -- cgit v1.2.3 From 97b27821b4854ca744946dae32a3f2fd55bcd5bc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2019 09:06:56 -0700 Subject: writeback, memcg: Implement foreign dirty flushing There's an inherent mismatch between memcg and writeback. The former trackes ownership per-page while the latter per-inode. This was a deliberate design decision because honoring per-page ownership in the writeback path is complicated, may lead to higher CPU and IO overheads and deemed unnecessary given that write-sharing an inode across different cgroups isn't a common use-case. Combined with inode majority-writer ownership switching, this works well enough in most cases but there are some pathological cases. For example, let's say there are two cgroups A and B which keep writing to different but confined parts of the same inode. B owns the inode and A's memory is limited far below B's. A's dirty ratio can rise enough to trigger balance_dirty_pages() sleeps but B's can be low enough to avoid triggering background writeback. A will be slowed down without a way to make writeback of the dirty pages happen. This patch implements foreign dirty recording and foreign mechanism so that when a memcg encounters a condition as above it can trigger flushes on bdi_writebacks which can clean its pages. Please see the comment on top of mem_cgroup_track_foreign_dirty_slowpath() for details. A reproducer follows. write-range.c:: #include #include #include #include #include static const char *usage = "write-range FILE START SIZE\n"; int main(int argc, char **argv) { int fd; unsigned long start, size, end, pos; char *endp; char buf[4096]; if (argc < 4) { fprintf(stderr, usage); return 1; } fd = open(argv[1], O_WRONLY); if (fd < 0) { perror("open"); return 1; } start = strtoul(argv[2], &endp, 0); if (*endp != '\0') { fprintf(stderr, usage); return 1; } size = strtoul(argv[3], &endp, 0); if (*endp != '\0') { fprintf(stderr, usage); return 1; } end = start + size; while (1) { for (pos = start; pos < end; ) { long bread, bwritten = 0; if (lseek(fd, pos, SEEK_SET) < 0) { perror("lseek"); return 1; } bread = read(0, buf, sizeof(buf) < end - pos ? sizeof(buf) : end - pos); if (bread < 0) { perror("read"); return 1; } if (bread == 0) return 0; while (bwritten < bread) { long this; this = write(fd, buf + bwritten, bread - bwritten); if (this < 0) { perror("write"); return 1; } bwritten += this; pos += bwritten; } } } } repro.sh:: #!/bin/bash set -e set -x sysctl -w vm.dirty_expire_centisecs=300000 sysctl -w vm.dirty_writeback_centisecs=300000 sysctl -w vm.dirtytime_expire_seconds=300000 echo 3 > /proc/sys/vm/drop_caches TEST=/sys/fs/cgroup/test A=$TEST/A B=$TEST/B mkdir -p $A $B echo "+memory +io" > $TEST/cgroup.subtree_control echo $((1<<30)) > $A/memory.high echo $((32<<30)) > $B/memory.high rm -f testfile touch testfile fallocate -l 4G testfile echo "Starting B" (echo $BASHPID > $B/cgroup.procs pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) & echo "Waiting 10s to ensure B claims the testfile inode" sleep 5 sync sleep 5 sync echo "Starting A" (echo $BASHPID > $A/cgroup.procs pv < /dev/urandom | ./write-range testfile 0 $((2<<30))) v2: Added comments explaining why the specific intervals are being used. v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort flushing while avoding possible livelocks. v4: Use get_jiffies_64() and time_before/after64() instead of raw jiffies_64 and arthimetic comparisons as suggested by Jan. Reviewed-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + include/linux/memcontrol.h | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 1075f2552cfc..4fc87dee005a 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -63,6 +63,7 @@ enum wb_reason { * so it has a mismatch name. */ WB_REASON_FORKER_THREAD, + WB_REASON_FOREIGN_FLUSH, WB_REASON_MAX, }; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 44c41462be33..bc69d5725760 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -183,6 +183,23 @@ struct memcg_padding { #define MEMCG_PADDING(name) #endif +/* + * Remember four most recent foreign writebacks with dirty pages in this + * cgroup. Inode sharing is expected to be uncommon and, even if we miss + * one in a given round, we're likely to catch it later if it keeps + * foreign-dirtying, so a fairly low count should be enough. + * + * See mem_cgroup_track_foreign_dirty_slowpath() for details. + */ +#define MEMCG_CGWB_FRN_CNT 4 + +struct memcg_cgwb_frn { + u64 bdi_id; /* bdi->id of the foreign inode */ + int memcg_id; /* memcg->css.id of foreign inode */ + u64 at; /* jiffies_64 at the time of dirtying */ + struct wb_completion done; /* tracks in-flight foreign writebacks */ +}; + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -307,6 +324,7 @@ struct mem_cgroup { #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; + struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; #endif /* List of events which userspace want to receive */ @@ -1218,6 +1236,18 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback); +void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, + struct bdi_writeback *wb); + +static inline void mem_cgroup_track_foreign_dirty(struct page *page, + struct bdi_writeback *wb) +{ + if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) + mem_cgroup_track_foreign_dirty_slowpath(page, wb); +} + +void mem_cgroup_flush_foreign(struct bdi_writeback *wb); + #else /* CONFIG_CGROUP_WRITEBACK */ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) @@ -1233,6 +1263,15 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, { } +static inline void mem_cgroup_track_foreign_dirty(struct page *page, + struct bdi_writeback *wb) +{ +} + +static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; -- cgit v1.2.3 From 9685b2270211628e27ea7880a02b52efd4524099 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 27 Aug 2019 19:01:44 +0800 Subject: block: Remove blk_mq_register_dev() This function has no callers. Hence remove it. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 21cebe901ac0..62a3bb715899 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -253,7 +253,6 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags); -int blk_mq_register_dev(struct device *, struct request_queue *); void blk_mq_unregister_dev(struct device *, struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); -- cgit v1.2.3 From 58c898ba370e68d39470cd0d932b524682c1f9be Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Aug 2019 19:01:47 +0800 Subject: block: add helper for checking if queue is registered There are 4 users which check if queue is registered, so add one helper to check it. Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Greg KH Cc: Mike Snitzer Cc: Bart Van Assche Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4798bb25f1ee..d5077f3fdfd6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -643,6 +643,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags) +#define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -- cgit v1.2.3 From cecf5d87ff2035127bb5a9ee054d0023a4a7cad3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 27 Aug 2019 19:01:48 +0800 Subject: block: split .sysfs_lock into two locks The kernfs built-in lock of 'kn->count' is held in sysfs .show/.store path. Meantime, inside block's .show/.store callback, q->sysfs_lock is required. However, when mq & iosched kobjects are removed via blk_mq_unregister_dev() & elv_unregister_queue(), q->sysfs_lock is held too. This way causes AB-BA lock because the kernfs built-in lock of 'kn-count' is required inside kobject_del() too, see the lockdep warning[1]. On the other hand, it isn't necessary to acquire q->sysfs_lock for both blk_mq_unregister_dev() & elv_unregister_queue() because clearing REGISTERED flag prevents storing to 'queue/scheduler' from being happened. Also sysfs write(store) is exclusive, so no necessary to hold the lock for elv_unregister_queue() when it is called in switching elevator path. So split .sysfs_lock into two: one is still named as .sysfs_lock for covering sync .store, the other one is named as .sysfs_dir_lock for covering kobjects and related status change. sysfs itself can handle the race between add/remove kobjects and showing/storing attributes under kobjects. For switching scheduler via storing to 'queue/scheduler', we use the queue flag of QUEUE_FLAG_REGISTERED with .sysfs_lock for avoiding the race, then we can avoid to hold .sysfs_lock during removing/adding kobjects. [1] lockdep warning ====================================================== WARNING: possible circular locking dependency detected 5.3.0-rc3-00044-g73277fc75ea0 #1380 Not tainted ------------------------------------------------------ rmmod/777 is trying to acquire lock: 00000000ac50e981 (kn->count#202){++++}, at: kernfs_remove_by_name_ns+0x59/0x72 but task is already holding lock: 00000000fb16ae21 (&q->sysfs_lock){+.+.}, at: blk_unregister_queue+0x78/0x10b which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&q->sysfs_lock){+.+.}: __lock_acquire+0x95f/0xa2f lock_acquire+0x1b4/0x1e8 __mutex_lock+0x14a/0xa9b blk_mq_hw_sysfs_show+0x63/0xb6 sysfs_kf_seq_show+0x11f/0x196 seq_read+0x2cd/0x5f2 vfs_read+0xc7/0x18c ksys_read+0xc4/0x13e do_syscall_64+0xa7/0x295 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (kn->count#202){++++}: check_prev_add+0x5d2/0xc45 validate_chain+0xed3/0xf94 __lock_acquire+0x95f/0xa2f lock_acquire+0x1b4/0x1e8 __kernfs_remove+0x237/0x40b kernfs_remove_by_name_ns+0x59/0x72 remove_files+0x61/0x96 sysfs_remove_group+0x81/0xa4 sysfs_remove_groups+0x3b/0x44 kobject_del+0x44/0x94 blk_mq_unregister_dev+0x83/0xdd blk_unregister_queue+0xa0/0x10b del_gendisk+0x259/0x3fa null_del_dev+0x8b/0x1c3 [null_blk] null_exit+0x5c/0x95 [null_blk] __se_sys_delete_module+0x204/0x337 do_syscall_64+0xa7/0x295 entry_SYSCALL_64_after_hwframe+0x49/0xbe other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&q->sysfs_lock); lock(kn->count#202); lock(&q->sysfs_lock); lock(kn->count#202); *** DEADLOCK *** 2 locks held by rmmod/777: #0: 00000000e69bd9de (&lock){+.+.}, at: null_exit+0x2e/0x95 [null_blk] #1: 00000000fb16ae21 (&q->sysfs_lock){+.+.}, at: blk_unregister_queue+0x78/0x10b stack backtrace: CPU: 0 PID: 777 Comm: rmmod Not tainted 5.3.0-rc3-00044-g73277fc75ea0 #1380 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ?-20180724_192412-buildhw-07.phx4 Call Trace: dump_stack+0x9a/0xe6 check_noncircular+0x207/0x251 ? print_circular_bug+0x32a/0x32a ? find_usage_backwards+0x84/0xb0 check_prev_add+0x5d2/0xc45 validate_chain+0xed3/0xf94 ? check_prev_add+0xc45/0xc45 ? mark_lock+0x11b/0x804 ? check_usage_forwards+0x1ca/0x1ca __lock_acquire+0x95f/0xa2f lock_acquire+0x1b4/0x1e8 ? kernfs_remove_by_name_ns+0x59/0x72 __kernfs_remove+0x237/0x40b ? kernfs_remove_by_name_ns+0x59/0x72 ? kernfs_next_descendant_post+0x7d/0x7d ? strlen+0x10/0x23 ? strcmp+0x22/0x44 kernfs_remove_by_name_ns+0x59/0x72 remove_files+0x61/0x96 sysfs_remove_group+0x81/0xa4 sysfs_remove_groups+0x3b/0x44 kobject_del+0x44/0x94 blk_mq_unregister_dev+0x83/0xdd blk_unregister_queue+0xa0/0x10b del_gendisk+0x259/0x3fa ? disk_events_poll_msecs_store+0x12b/0x12b ? check_flags+0x1ea/0x204 ? mark_held_locks+0x1f/0x7a null_del_dev+0x8b/0x1c3 [null_blk] null_exit+0x5c/0x95 [null_blk] __se_sys_delete_module+0x204/0x337 ? free_module+0x39f/0x39f ? blkcg_maybe_throttle_current+0x8a/0x718 ? rwlock_bug+0x62/0x62 ? __blkcg_punt_bio_submit+0xd0/0xd0 ? trace_hardirqs_on_thunk+0x1a/0x20 ? mark_held_locks+0x1f/0x7a ? do_syscall_64+0x4c/0x295 do_syscall_64+0xa7/0x295 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7fb696cdbe6b Code: 73 01 c3 48 8b 0d 1d 20 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 008 RSP: 002b:00007ffec9588788 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559e589137c0 RCX: 00007fb696cdbe6b RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559e58913828 RBP: 0000000000000000 R08: 00007ffec9587701 R09: 0000000000000000 R10: 00007fb696d4eae0 R11: 0000000000000206 R12: 00007ffec95889b0 R13: 00007ffec95896b3 R14: 0000559e58913260 R15: 0000559e589137c0 Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Greg KH Cc: Mike Snitzer Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d5077f3fdfd6..1ac790178787 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -535,6 +535,7 @@ struct request_queue { struct delayed_work requeue_work; struct mutex sysfs_lock; + struct mutex sysfs_dir_lock; /* * for reusing dead hctx instance in case of updating -- cgit v1.2.3 From cf09a8ee19ad1f78b4e18cdde9f2a61133efacf5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 28 Aug 2019 15:05:51 -0700 Subject: blkcg: pass @q and @blkcg into blkcg_pol_alloc_pd_fn() Instead of @node, pass in @q and @blkcg so that the alloc function has more context. This doesn't cause any behavior change and will be used by io.weight implementation. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 0bb79d858a13..261248e88eb1 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -149,7 +149,8 @@ typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); -typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); +typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, + struct request_queue *q, struct blkcg *blkcg); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); -- cgit v1.2.3 From 015d254cb02b6d8eec4b3366274bf4672f9e0b64 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 28 Aug 2019 15:05:53 -0700 Subject: blkcg: separate blkcg_conf_get_disk() out of blkg_conf_prep() Separate out blkcg_conf_get_disk() so that it can be used by blkcg policy interface file input parsers before the policy is actually enabled. This doesn't introduce any functional changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 261248e88eb1..bed9e43f9426 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -234,6 +234,7 @@ struct blkg_conf_ctx { char *body; }; +struct gendisk *blkcg_conf_get_disk(char **inputp); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); -- cgit v1.2.3 From 6f816b4b746c2241540e537682d30d8e9997d674 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 28 Aug 2019 15:05:57 -0700 Subject: blk-mq: add optional request->alloc_time_ns There are currently two start time timestamps - start_time_ns and io_start_time_ns. The former marks the request allocation and and the second issue-to-device time. The planned io.weight controller needs to measure the total time bios take to execute after it leaves rq_qos including the time spent waiting for request to become available, which can easily dominate on saturated devices. This patch adds request->alloc_time_ns which records when the request allocation attempt started. As it isn't used for the usual stats, make it optional behind CONFIG_BLK_RQ_ALLOC_TIME and QUEUE_FLAG_RQ_ALLOC_TIME so that it can be compiled out when there are no users and it's active only on queues which need it even when compiled in. v2: s/pre_start_time/alloc_time/ and add CONFIG_BLK_RQ_ALLOC_TIME gating as suggested by Jens. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1ac790178787..d0ad21e4771b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -194,7 +194,11 @@ struct request { struct gendisk *rq_disk; struct hd_struct *part; - /* Time that I/O was submitted to the kernel. */ +#ifdef CONFIG_BLK_RQ_ALLOC_TIME + /* Time that the first bio started allocating this request. */ + u64 alloc_time_ns; +#endif + /* Time that this request was allocated for this IO. */ u64 start_time_ns; /* Time that I/O was submitted to the device. */ u64 io_start_time_ns; @@ -609,6 +613,7 @@ struct request_queue { #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ #define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ +#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP)) @@ -637,6 +642,12 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags) #define blk_queue_pci_p2pdma(q) \ test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) +#ifdef CONFIG_BLK_RQ_ALLOC_TIME +#define blk_queue_rq_alloc_time(q) \ + test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags) +#else +#define blk_queue_rq_alloc_time(q) false +#endif #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ -- cgit v1.2.3 From 7caa47151ab2e644dd221f741ec7578d9532c9a3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 28 Aug 2019 15:05:58 -0700 Subject: blkcg: implement blk-iocost This patchset implements IO cost model based work-conserving proportional controller. While io.latency provides the capability to comprehensively prioritize and protect IOs depending on the cgroups, its protection is binary - the lowest latency target cgroup which is suffering is protected at the cost of all others. In many use cases including stacking multiple workload containers in a single system, it's necessary to distribute IO capacity with better granularity. One challenge of controlling IO resources is the lack of trivially observable cost metric. The most common metrics - bandwidth and iops - can be off by orders of magnitude depending on the device type and IO pattern. However, the cost isn't a complete mystery. Given several key attributes, we can make fairly reliable predictions on how expensive a given stream of IOs would be, at least compared to other IO patterns. The function which determines the cost of a given IO is the IO cost model for the device. This controller distributes IO capacity based on the costs estimated by such model. The more accurate the cost model the better but the controller adapts based on IO completion latency and as long as the relative costs across differents IO patterns are consistent and sensible, it'll adapt to the actual performance of the device. Currently, the only implemented cost model is a simple linear one with a few sets of default parameters for different classes of device. This covers most common devices reasonably well. All the infrastructure to tune and add different cost models is already in place and a later patch will also allow using bpf progs for cost models. Please see the top comment in blk-iocost.c and documentation for more details. v2: Rebased on top of RQ_ALLOC_TIME changes and folded in Rik's fix for a divide-by-zero bug in current_hweight() triggered by zero inuse_sum. Signed-off-by: Tejun Heo Cc: Andy Newell Cc: Josef Bacik Cc: Rik van Riel Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 3 + include/trace/events/iocost.h | 174 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 177 insertions(+) create mode 100644 include/trace/events/iocost.h (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 5a1118d4ef7e..d688b96d1d63 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -169,6 +169,9 @@ struct bio { */ struct blkcg_gq *bi_blkg; struct bio_issue bi_issue; +#ifdef CONFIG_BLK_CGROUP_IOCOST + u64 bi_iocost_cost; +#endif #endif union { #if defined(CONFIG_BLK_DEV_INTEGRITY) diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h new file mode 100644 index 000000000000..ec2217dd57ac --- /dev/null +++ b/include/trace/events/iocost.h @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM iocost + +#if !defined(_TRACE_BLK_IOCOST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BLK_IOCOST_H + +#include + +TRACE_EVENT(iocost_iocg_activate, + + TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, + u64 last_period, u64 cur_period, u64 vtime), + + TP_ARGS(iocg, path, now, last_period, cur_period, vtime), + + TP_STRUCT__entry ( + __string(devname, ioc_name(iocg->ioc)) + __string(cgroup, path) + __field(u64, now) + __field(u64, vnow) + __field(u64, vrate) + __field(u64, last_period) + __field(u64, cur_period) + __field(u64, last_vtime) + __field(u64, vtime) + __field(u32, weight) + __field(u32, inuse) + __field(u64, hweight_active) + __field(u64, hweight_inuse) + ), + + TP_fast_assign( + __assign_str(devname, ioc_name(iocg->ioc)); + __assign_str(cgroup, path); + __entry->now = now->now; + __entry->vnow = now->vnow; + __entry->vrate = now->vrate; + __entry->last_period = last_period; + __entry->cur_period = cur_period; + __entry->last_vtime = iocg->last_vtime; + __entry->vtime = vtime; + __entry->weight = iocg->weight; + __entry->inuse = iocg->inuse; + __entry->hweight_active = iocg->hweight_active; + __entry->hweight_inuse = iocg->hweight_inuse; + ), + + TP_printk("[%s:%s] now=%llu:%llu vrate=%llu " + "period=%llu->%llu vtime=%llu->%llu " + "weight=%u/%u hweight=%llu/%llu", + __get_str(devname), __get_str(cgroup), + __entry->now, __entry->vnow, __entry->vrate, + __entry->last_period, __entry->cur_period, + __entry->last_vtime, __entry->vtime, + __entry->inuse, __entry->weight, + __entry->hweight_inuse, __entry->hweight_active + ) +); + +DECLARE_EVENT_CLASS(iocg_inuse_update, + + TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, + u32 old_inuse, u32 new_inuse, + u64 old_hw_inuse, u64 new_hw_inuse), + + TP_ARGS(iocg, path, now, old_inuse, new_inuse, + old_hw_inuse, new_hw_inuse), + + TP_STRUCT__entry ( + __string(devname, ioc_name(iocg->ioc)) + __string(cgroup, path) + __field(u64, now) + __field(u32, old_inuse) + __field(u32, new_inuse) + __field(u64, old_hweight_inuse) + __field(u64, new_hweight_inuse) + ), + + TP_fast_assign( + __assign_str(devname, ioc_name(iocg->ioc)); + __assign_str(cgroup, path); + __entry->now = now->now; + __entry->old_inuse = old_inuse; + __entry->new_inuse = new_inuse; + __entry->old_hweight_inuse = old_hw_inuse; + __entry->new_hweight_inuse = new_hw_inuse; + ), + + TP_printk("[%s:%s] now=%llu inuse=%u->%u hw_inuse=%llu->%llu", + __get_str(devname), __get_str(cgroup), __entry->now, + __entry->old_inuse, __entry->new_inuse, + __entry->old_hweight_inuse, __entry->new_hweight_inuse + ) +); + +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback, + + TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, + u32 old_inuse, u32 new_inuse, + u64 old_hw_inuse, u64 new_hw_inuse), + + TP_ARGS(iocg, path, now, old_inuse, new_inuse, + old_hw_inuse, new_hw_inuse) +); + +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway, + + TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, + u32 old_inuse, u32 new_inuse, + u64 old_hw_inuse, u64 new_hw_inuse), + + TP_ARGS(iocg, path, now, old_inuse, new_inuse, + old_hw_inuse, new_hw_inuse) +); + +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset, + + TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, + u32 old_inuse, u32 new_inuse, + u64 old_hw_inuse, u64 new_hw_inuse), + + TP_ARGS(iocg, path, now, old_inuse, new_inuse, + old_hw_inuse, new_hw_inuse) +); + +TRACE_EVENT(iocost_ioc_vrate_adj, + + TP_PROTO(struct ioc *ioc, u64 new_vrate, u32 (*missed_ppm)[2], + u32 rq_wait_pct, int nr_lagging, int nr_shortages, + int nr_surpluses), + + TP_ARGS(ioc, new_vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages, + nr_surpluses), + + TP_STRUCT__entry ( + __string(devname, ioc_name(ioc)) + __field(u64, old_vrate) + __field(u64, new_vrate) + __field(int, busy_level) + __field(u32, read_missed_ppm) + __field(u32, write_missed_ppm) + __field(u32, rq_wait_pct) + __field(int, nr_lagging) + __field(int, nr_shortages) + __field(int, nr_surpluses) + ), + + TP_fast_assign( + __assign_str(devname, ioc_name(ioc)); + __entry->old_vrate = atomic64_read(&ioc->vtime_rate);; + __entry->new_vrate = new_vrate; + __entry->busy_level = ioc->busy_level; + __entry->read_missed_ppm = (*missed_ppm)[READ]; + __entry->write_missed_ppm = (*missed_ppm)[WRITE]; + __entry->rq_wait_pct = rq_wait_pct; + __entry->nr_lagging = nr_lagging; + __entry->nr_shortages = nr_shortages; + __entry->nr_surpluses = nr_surpluses; + ), + + TP_printk("[%s] vrate=%llu->%llu busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d surpluses=%d", + __get_str(devname), __entry->old_vrate, __entry->new_vrate, + __entry->busy_level, + __entry->read_missed_ppm, __entry->write_missed_ppm, + __entry->rq_wait_pct, __entry->nr_lagging, __entry->nr_shortages, + __entry->nr_surpluses + ) +); + +#endif /* _TRACE_BLK_IOCOST_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 8d1c1560c383004e09c6a39498094671cc664e6b Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 29 Aug 2019 09:43:34 -0600 Subject: blkcg: blk-iocost: predeclare used structs Fixes: 7caa47151ab2 ("blkcg: implement blk-iocost") Acked-by: Tejun Heo Signed-off-by: Stephen Rothwell Signed-off-by: Jens Axboe --- include/trace/events/iocost.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index ec2217dd57ac..7ecaa65b7106 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -2,6 +2,10 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM iocost +struct ioc; +struct ioc_now; +struct ioc_gq; + #if !defined(_TRACE_BLK_IOCOST_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BLK_IOCOST_H -- cgit v1.2.3 From c638984521f19ba218477d5ef9f10f9a6206bab6 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 4 Aug 2019 16:50:47 +0900 Subject: nvme: add Get LBA Status command opcode NVMe 1.4 added Get LBA Status command with opcode 0x86. Signed-off-by: Minwoo Im Signed-off-by: Sagi Grimberg --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 01aa6a6c241d..a01277501eae 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -814,6 +814,7 @@ enum nvme_admin_opcode { nvme_admin_security_send = 0x81, nvme_admin_security_recv = 0x82, nvme_admin_sanitize_nvm = 0x84, + nvme_admin_get_lba_status = 0x86, }; #define nvme_admin_opcode_name(opcode) { opcode, #opcode } -- cgit v1.2.3 From a5ef757204bab6f80268a7437556cb57744ab7d4 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 4 Aug 2019 16:50:48 +0900 Subject: nvme: trace: support for Get LBA Status opcode parsed This patch adds Get LBA Status command's opcode to the macro that is used by the trace feature. Now we can see "get_lba_status" instead of the opcode value itself. Signed-off-by: Minwoo Im Signed-off-by: Sagi Grimberg --- include/linux/nvme.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a01277501eae..32c25b46ae63 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -841,7 +841,8 @@ enum nvme_admin_opcode { nvme_admin_opcode_name(nvme_admin_format_nvm), \ nvme_admin_opcode_name(nvme_admin_security_send), \ nvme_admin_opcode_name(nvme_admin_security_recv), \ - nvme_admin_opcode_name(nvme_admin_sanitize_nvm)) + nvme_admin_opcode_name(nvme_admin_sanitize_nvm), \ + nvme_admin_opcode_name(nvme_admin_get_lba_status)) enum { NVME_QUEUE_PHYS_CONTIG = (1 << 0), -- cgit v1.2.3 From c1e0cc7e1d319936271dfdd0a9405275c8091381 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 7 Aug 2019 17:51:20 +1000 Subject: nvme-pci: Add support for variable IO SQ element size The size of a submission queue element should always be 6 (64 bytes) by spec. However some controllers such as Apple's are not properly implementing the standard and require a different size. This provides the ground work for the subsequent quirks for these controllers. Signed-off-by: Benjamin Herrenschmidt Reviewed-by: Minwoo Im Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 32c25b46ae63..f61d6906e59d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -140,6 +140,7 @@ enum { * Submission and Completion Queue Entry Sizes for the NVM command set. * (In bytes and specified as a power of two (2^n)). */ +#define NVME_ADM_SQES 6 #define NVME_NVM_IOSQES 6 #define NVME_NVM_IOCQES 4 -- cgit v1.2.3 From 3a8e9ac89e6a5106cfb6b85d4c9cf9bfa3519bc7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 29 Aug 2019 15:47:19 -0700 Subject: writeback: add tracepoints for cgroup foreign writebacks cgroup foreign inode handling has quite a bit of heuristics and internal states which sometimes makes it difficult to understand what's going on. Add tracepoints to improve visibility. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/trace/events/writeback.h | 123 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) (limited to 'include') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index aa7f3aeac740..3dc9fb9e7c78 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -176,6 +176,129 @@ static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *w #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CREATE_TRACE_POINTS */ +#ifdef CONFIG_CGROUP_WRITEBACK +TRACE_EVENT(inode_foreign_history, + + TP_PROTO(struct inode *inode, struct writeback_control *wbc, + unsigned int history), + + TP_ARGS(inode, wbc, history), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned long, ino) + __field(unsigned int, cgroup_ino) + __field(unsigned int, history) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(inode_to_bdi(inode)->dev), 32); + __entry->ino = inode->i_ino; + __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); + __entry->history = history; + ), + + TP_printk("bdi %s: ino=%lu cgroup_ino=%u history=0x%x", + __entry->name, + __entry->ino, + __entry->cgroup_ino, + __entry->history + ) +); + +TRACE_EVENT(inode_switch_wbs, + + TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, + struct bdi_writeback *new_wb), + + TP_ARGS(inode, old_wb, new_wb), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned long, ino) + __field(unsigned int, old_cgroup_ino) + __field(unsigned int, new_cgroup_ino) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(old_wb->bdi->dev), 32); + __entry->ino = inode->i_ino; + __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); + __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); + ), + + TP_printk("bdi %s: ino=%lu old_cgroup_ino=%u new_cgroup_ino=%u", + __entry->name, + __entry->ino, + __entry->old_cgroup_ino, + __entry->new_cgroup_ino + ) +); + +TRACE_EVENT(track_foreign_dirty, + + TP_PROTO(struct page *page, struct bdi_writeback *wb), + + TP_ARGS(page, wb), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(u64, bdi_id) + __field(unsigned long, ino) + __field(unsigned int, memcg_id) + __field(unsigned int, cgroup_ino) + __field(unsigned int, page_cgroup_ino) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + __entry->bdi_id = wb->bdi->id; + __entry->ino = page->mapping->host->i_ino; + __entry->memcg_id = wb->memcg_css->id; + __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); + __entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino; + ), + + TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%u page_cgroup_ino=%u", + __entry->name, + __entry->bdi_id, + __entry->ino, + __entry->memcg_id, + __entry->cgroup_ino, + __entry->page_cgroup_ino + ) +); + +TRACE_EVENT(flush_foreign, + + TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id, + unsigned int frn_memcg_id), + + TP_ARGS(wb, frn_bdi_id, frn_memcg_id), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned int, cgroup_ino) + __field(unsigned int, frn_bdi_id) + __field(unsigned int, frn_memcg_id) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); + __entry->frn_bdi_id = frn_bdi_id; + __entry->frn_memcg_id = frn_memcg_id; + ), + + TP_printk("bdi %s: cgroup_ino=%u frn_bdi_id=%u frn_memcg_id=%u", + __entry->name, + __entry->cgroup_ino, + __entry->frn_bdi_id, + __entry->frn_memcg_id + ) +); +#endif + DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), -- cgit v1.2.3 From 0feacaa21634014148068035b02eade71f853496 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 30 Aug 2019 16:39:54 -0700 Subject: writeback: don't access page->mapping directly in track_foreign_dirty TP page->mapping may encode different values in it and page_mapping() should always be used to access the mapping pointer. track_foreign_dirty tracepoint was incorrectly accessing page->mapping directly. Use page_mapping() instead. Also, add NULL checks while at it. Fixes: 3a8e9ac89e6a ("writeback: add tracepoints for cgroup foreign writebacks") Reported-by: Jan Kara Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/trace/events/writeback.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 3dc9fb9e7c78..3a27335fce2c 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -251,9 +251,12 @@ TRACE_EVENT(track_foreign_dirty, ), TP_fast_assign( + struct address_space *mapping = page_mapping(page); + struct inode *inode = mapping ? mapping->host : NULL; + strncpy(__entry->name, dev_name(wb->bdi->dev), 32); __entry->bdi_id = wb->bdi->id; - __entry->ino = page->mapping->host->i_ino; + __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->page_cgroup_ino = page->mem_cgroup->css.cgroup->kn->id.ino; -- cgit v1.2.3 From 68c43f133a754c7bf5cb1018bb16dc0821cc43a1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 5 Sep 2019 18:51:31 +0900 Subject: block: Introduce elevator features Introduce the definition of elevator features through the elevator_features flags in the elevator_type structure. Each flag can represent a feature supported by an elevator. The first feature defined by this patch is support for zoned block device sequential write constraint with the flag ELEVATOR_F_ZBD_SEQ_WRITE, which is implemented by the mq-deadline elevator using zone write locking. Other possible features are IO priorities, write hints, latency targets or single-LUN dual-actuator disks (for which the elevator could maintain one LBA ordered list per actuator). The required_elevator_features field is also added to the request_queue structure to allow a device driver to specify elevator feature flags that an elevator must support for the correct operation of the device (e.g. device drivers for zoned block devices can have the ELEVATOR_F_ZBD_SEQ_WRITE flag as a required feature). The helper function blk_queue_required_elevator_features() is defined for setting this new field. With these two new fields in place, the elevator functions elevator_match() and elevator_find() are modified to allow a user to set only an elevator with a set of features that satisfies the device required features. Elevators not matching the device requirements are not shown in the device sysfs queue/scheduler file to prevent their use. The "none" elevator can always be selected as before. Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++++ include/linux/elevator.h | 8 ++++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d0ad21e4771b..b196124e3240 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -496,6 +496,8 @@ struct request_queue { struct queue_limits limits; + unsigned int required_elevator_features; + #ifdef CONFIG_BLK_DEV_ZONED /* * Zoned block device information for request dispatch control. @@ -1097,6 +1099,8 @@ extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); +extern void blk_queue_required_elevator_features(struct request_queue *q, + unsigned int features); /* * Number of physical segments as sent to the device. diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 1dd014c9c87b..901bda352dcb 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -76,6 +76,7 @@ struct elevator_type struct elv_fs_entry *elevator_attrs; const char *elevator_name; const char *elevator_alias; + const unsigned int elevator_features; struct module *elevator_owner; #ifdef CONFIG_BLK_DEBUG_FS const struct blk_mq_debugfs_attr *queue_debugfs_attrs; @@ -165,5 +166,12 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) #define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist) +/* + * Elevator features. + */ + +/* Supports zoned block devices sequential write constraint */ +#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) + #endif /* CONFIG_BLOCK */ #endif -- cgit v1.2.3 From 737eb78e82d52d35df166d29af32bf61992de71d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 5 Sep 2019 18:51:33 +0900 Subject: block: Delay default elevator initialization When elevator_init_mq() is called from blk_mq_init_allocated_queue(), the only information known about the device is the number of hardware queues as the block device scan by the device driver is not completed yet for most drivers. The device type and elevator required features are not set yet, preventing to correctly select the default elevator most suitable for the device. This currently affects all multi-queue zoned block devices which default to the "none" elevator instead of the required "mq-deadline" elevator. These drives currently include host-managed SMR disks connected to a smartpqi HBA and null_blk block devices with zoned mode enabled. Upcoming NVMe Zoned Namespace devices will also be affected. Fix this by adding the boolean elevator_init argument to blk_mq_init_allocated_queue() to control the execution of elevator_init_mq(). Two cases exist: 1) elevator_init = false is used for calls to blk_mq_init_allocated_queue() within blk_mq_init_queue(). In this case, a call to elevator_init_mq() is added to __device_add_disk(), resulting in the delayed initialization of the queue elevator after the device driver finished probing the device information. This effectively allows elevator_init_mq() access to more information about the device. 2) elevator_init = true preserves the current behavior of initializing the elevator directly from blk_mq_init_allocated_queue(). This case is used for the special request based DM devices where the device gendisk is created before the queue initialization and device information (e.g. queue limits) is already known when the queue initialization is executed. Additionally, to make sure that the elevator initialization is never done while requests are in-flight (there should be none when the device driver calls device_add_disk()), freeze and quiesce the device request queue before calling blk_mq_init_sched() in elevator_init_mq(). Reviewed-by: Ming Lei Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 62a3bb715899..0bf056de5cc3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -248,7 +248,8 @@ enum { struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q); + struct request_queue *q, + bool elevator_init); struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, -- cgit v1.2.3 From 33f2c35a54dfd75ad0e7e86918dcbe4de799a56c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 9 Sep 2019 16:52:29 +1000 Subject: md: add feature flag MD_FEATURE_RAID0_LAYOUT Due to a bug introduced in Linux 3.14 we cannot determine the correctly layout for a multi-zone RAID0 array - there are two possibilities. It is possible to tell the kernel which to chose using a module parameter, but this can be clumsy to use. It would be best if the choice were recorded in the metadata. So add a feature flag for this purpose. If it is set, then the 'layout' field of the superblock is used to determine which layout to use. If this flag is not set, then mddev->layout gets set to -1, which causes the module parameter to be required. Acked-by: Guoqing Jiang Signed-off-by: NeilBrown Signed-off-by: Song Liu --- include/uapi/linux/raid/md_p.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index b0d15c73f6d7..1f2d8c81f0e0 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -329,6 +329,7 @@ struct mdp_superblock_1 { #define MD_FEATURE_JOURNAL 512 /* support write cache */ #define MD_FEATURE_PPL 1024 /* support PPL */ #define MD_FEATURE_MULTIPLE_PPLS 2048 /* support for multiple PPLs */ +#define MD_FEATURE_RAID0_LAYOUT 4096 /* layout is meaningful for RAID0 */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ @@ -341,6 +342,7 @@ struct mdp_superblock_1 { |MD_FEATURE_JOURNAL \ |MD_FEATURE_PPL \ |MD_FEATURE_MULTIPLE_PPLS \ + |MD_FEATURE_RAID0_LAYOUT \ ) struct r5l_payload_header { -- cgit v1.2.3 From 3d24430694077313c75c6b89f618db09943621e4 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 21 May 2019 15:59:03 +0800 Subject: block: make rq sector size accessible for block stats Currently rq->data_len will be decreased by partial completion or zeroed by completion, so when blk_stat_add() is invoked, data_len will be zero and there will never be samples in poll_cb because blk_mq_poll_stats_bkt() will return -1 if data_len is zero. We could move blk_stat_add() back to __blk_mq_complete_request(), but that would make the effort of trying to call ktime_get_ns() once in vain. Instead we can reuse throtl_size field, and use it for both block stats and block throttle, and adjust the logic in blk_mq_poll_stats_bkt() accordingly. Fixes: 4bc6339a583c ("block: move blk_stat_add() to __blk_mq_end_request()") Tested-by: Pavel Begunkov Signed-off-by: Hou Tao Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b196124e3240..3094f2d513b2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -206,9 +206,12 @@ struct request { #ifdef CONFIG_BLK_WBT unsigned short wbt_flags; #endif -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - unsigned short throtl_size; -#endif + /* + * rq sectors used for blk stats. It has the same value + * with blk_rq_sectors(rq), except that it never be zeroed + * by completion. + */ + unsigned short stats_sectors; /* * Number of scatter-gather DMA addr+len pairs after @@ -917,6 +920,7 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) * blk_rq_err_bytes() : bytes left till the next error boundary * blk_rq_sectors() : sectors left in the entire request * blk_rq_cur_sectors() : sectors left in the current segment + * blk_rq_stats_sectors() : sectors of the entire request used for stats */ static inline sector_t blk_rq_pos(const struct request *rq) { @@ -945,6 +949,11 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; } +static inline unsigned int blk_rq_stats_sectors(const struct request *rq) +{ + return rq->stats_sectors; +} + #ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_rq_zone_no(struct request *rq) { -- cgit v1.2.3