From 08a9ad8bf607388d768a341957d53eae64250c2d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:55 -0700 Subject: block/mq-deadline: Add cgroup support Maintain statistics per cgroup and export these to user space. These statistics are essential for verifying whether the proper I/O priorities have been assigned to requests. An example of the statistics data with this patch applied: $ cat /sys/fs/cgroup/io.stat 11:2 rbytes=0 wbytes=0 rios=3 wios=0 dbytes=0 dios=0 [NONE] dispatched=0 inserted=0 merged=171 [RT] dispatched=0 inserted=0 merged=0 [BE] dispatched=0 inserted=0 merged=0 [IDLE] dispatched=0 inserted=0 merged=0 8:32 rbytes=2142720 wbytes=0 rios=105 wios=0 dbytes=0 dios=0 [NONE] dispatched=0 inserted=0 merged=171 [RT] dispatched=0 inserted=0 merged=0 [BE] dispatched=0 inserted=0 merged=0 [IDLE] dispatched=0 inserted=0 merged=0 Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-16-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 6 + block/Makefile | 2 + block/mq-deadline-cgroup.c | 126 +++++ block/mq-deadline-cgroup.h | 114 +++++ block/mq-deadline-main.c | 1141 ++++++++++++++++++++++++++++++++++++++++++++ block/mq-deadline.c | 1095 ------------------------------------------ 6 files changed, 1389 insertions(+), 1095 deletions(-) create mode 100644 block/mq-deadline-cgroup.c create mode 100644 block/mq-deadline-cgroup.h create mode 100644 block/mq-deadline-main.c delete mode 100644 block/mq-deadline.c (limited to 'block') diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 2f2158e05a91..64053d67a97b 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -9,6 +9,12 @@ config MQ_IOSCHED_DEADLINE help MQ version of the deadline IO scheduler. +config MQ_IOSCHED_DEADLINE_CGROUP + tristate + default y + depends on MQ_IOSCHED_DEADLINE + depends on BLK_CGROUP + config MQ_IOSCHED_KYBER tristate "Kyber I/O scheduler" default y diff --git a/block/Makefile b/block/Makefile index af3d044abaf1..b9db5d4edfc8 100644 --- a/block/Makefile +++ b/block/Makefile @@ -21,6 +21,8 @@ obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o +mq-deadline-y += mq-deadline-main.o +mq-deadline-$(CONFIG_MQ_IOSCHED_DEADLINE_CGROUP)+= mq-deadline-cgroup.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o diff --git a/block/mq-deadline-cgroup.c b/block/mq-deadline-cgroup.c new file mode 100644 index 000000000000..3b4bfddec39f --- /dev/null +++ b/block/mq-deadline-cgroup.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include "mq-deadline-cgroup.h" + +static struct blkcg_policy dd_blkcg_policy; + +static struct blkcg_policy_data *dd_cpd_alloc(gfp_t gfp) +{ + struct dd_blkcg *pd; + + pd = kzalloc(sizeof(*pd), gfp); + if (!pd) + return NULL; + pd->stats = alloc_percpu_gfp(typeof(*pd->stats), + GFP_KERNEL | __GFP_ZERO); + if (!pd->stats) { + kfree(pd); + return NULL; + } + return &pd->cpd; +} + +static void dd_cpd_free(struct blkcg_policy_data *cpd) +{ + struct dd_blkcg *dd_blkcg = container_of(cpd, typeof(*dd_blkcg), cpd); + + free_percpu(dd_blkcg->stats); + kfree(dd_blkcg); +} + +static struct dd_blkcg *dd_blkcg_from_pd(struct blkg_policy_data *pd) +{ + return container_of(blkcg_to_cpd(pd->blkg->blkcg, &dd_blkcg_policy), + struct dd_blkcg, cpd); +} + +/* + * Convert an association between a block cgroup and a request queue into a + * pointer to the mq-deadline information associated with a (blkcg, queue) pair. + */ +struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio) +{ + struct blkg_policy_data *pd; + + pd = blkg_to_pd(bio->bi_blkg, &dd_blkcg_policy); + if (!pd) + return NULL; + + return dd_blkcg_from_pd(pd); +} + +static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +{ + static const char *const prio_class_name[] = { + [IOPRIO_CLASS_NONE] = "NONE", + [IOPRIO_CLASS_RT] = "RT", + [IOPRIO_CLASS_BE] = "BE", + [IOPRIO_CLASS_IDLE] = "IDLE", + }; + struct dd_blkcg *blkcg = dd_blkcg_from_pd(pd); + int res = 0; + u8 prio; + + for (prio = 0; prio < ARRAY_SIZE(blkcg->stats->stats); prio++) + res += scnprintf(buf + res, size - res, + " [%s] dispatched=%u inserted=%u merged=%u", + prio_class_name[prio], + ddcg_sum(blkcg, dispatched, prio) + + ddcg_sum(blkcg, merged, prio) - + ddcg_sum(blkcg, completed, prio), + ddcg_sum(blkcg, inserted, prio) - + ddcg_sum(blkcg, completed, prio), + ddcg_sum(blkcg, merged, prio)); + + return res; +} + +static struct blkg_policy_data *dd_pd_alloc(gfp_t gfp, struct request_queue *q, + struct blkcg *blkcg) +{ + struct dd_blkg *pd; + + pd = kzalloc(sizeof(*pd), gfp); + if (!pd) + return NULL; + return &pd->pd; +} + +static void dd_pd_free(struct blkg_policy_data *pd) +{ + struct dd_blkg *dd_blkg = container_of(pd, typeof(*dd_blkg), pd); + + kfree(dd_blkg); +} + +static struct blkcg_policy dd_blkcg_policy = { + .cpd_alloc_fn = dd_cpd_alloc, + .cpd_free_fn = dd_cpd_free, + + .pd_alloc_fn = dd_pd_alloc, + .pd_free_fn = dd_pd_free, + .pd_stat_fn = dd_pd_stat, +}; + +int dd_activate_policy(struct request_queue *q) +{ + return blkcg_activate_policy(q, &dd_blkcg_policy); +} + +void dd_deactivate_policy(struct request_queue *q) +{ + blkcg_deactivate_policy(q, &dd_blkcg_policy); +} + +int __init dd_blkcg_init(void) +{ + return blkcg_policy_register(&dd_blkcg_policy); +} + +void __exit dd_blkcg_exit(void) +{ + blkcg_policy_unregister(&dd_blkcg_policy); +} diff --git a/block/mq-deadline-cgroup.h b/block/mq-deadline-cgroup.h new file mode 100644 index 000000000000..0143fd74f3ce --- /dev/null +++ b/block/mq-deadline-cgroup.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#if !defined(_MQ_DEADLINE_CGROUP_H_) +#define _MQ_DEADLINE_CGROUP_H_ + +#include + +struct request_queue; + +/** + * struct io_stats_per_prio - I/O statistics per I/O priority class. + * @inserted: Number of inserted requests. + * @merged: Number of merged requests. + * @dispatched: Number of dispatched requests. + * @completed: Number of I/O completions. + */ +struct io_stats_per_prio { + local_t inserted; + local_t merged; + local_t dispatched; + local_t completed; +}; + +/* I/O statistics per I/O cgroup per I/O priority class (IOPRIO_CLASS_*). */ +struct blkcg_io_stats { + struct io_stats_per_prio stats[4]; +}; + +/** + * struct dd_blkcg - Per cgroup data. + * @cpd: blkcg_policy_data structure. + * @stats: I/O statistics. + */ +struct dd_blkcg { + struct blkcg_policy_data cpd; /* must be the first member */ + struct blkcg_io_stats __percpu *stats; +}; + +/* + * Count one event of type 'event_type' and with I/O priority class + * 'prio_class'. + */ +#define ddcg_count(ddcg, event_type, prio_class) do { \ +if (ddcg) { \ + struct blkcg_io_stats *io_stats = get_cpu_ptr((ddcg)->stats); \ + \ + BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \ + BUILD_BUG_ON(!__same_type((prio_class), u8)); \ + local_inc(&io_stats->stats[(prio_class)].event_type); \ + put_cpu_ptr(io_stats); \ +} \ +} while (0) + +/* + * Returns the total number of ddcg_count(ddcg, event_type, prio_class) calls + * across all CPUs. No locking or barriers since it is fine if the returned + * sum is slightly outdated. + */ +#define ddcg_sum(ddcg, event_type, prio) ({ \ + unsigned int cpu; \ + u32 sum = 0; \ + \ + BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \ + BUILD_BUG_ON(!__same_type((prio), u8)); \ + for_each_present_cpu(cpu) \ + sum += local_read(&per_cpu_ptr((ddcg)->stats, cpu)-> \ + stats[(prio)].event_type); \ + sum; \ +}) + +#ifdef CONFIG_BLK_CGROUP + +/** + * struct dd_blkg - Per (cgroup, request queue) data. + * @pd: blkg_policy_data structure. + */ +struct dd_blkg { + struct blkg_policy_data pd; /* must be the first member */ +}; + +struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio); +int dd_activate_policy(struct request_queue *q); +void dd_deactivate_policy(struct request_queue *q); +int __init dd_blkcg_init(void); +void __exit dd_blkcg_exit(void); + +#else /* CONFIG_BLK_CGROUP */ + +static inline struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio) +{ + return NULL; +} + +static inline int dd_activate_policy(struct request_queue *q) +{ + return 0; +} + +static inline void dd_deactivate_policy(struct request_queue *q) +{ +} + +static inline int dd_blkcg_init(void) +{ + return 0; +} + +static inline void dd_blkcg_exit(void) +{ +} + +#endif /* CONFIG_BLK_CGROUP */ + +#endif /* _MQ_DEADLINE_CGROUP_H_ */ diff --git a/block/mq-deadline-main.c b/block/mq-deadline-main.c new file mode 100644 index 000000000000..58a401ea8f56 --- /dev/null +++ b/block/mq-deadline-main.c @@ -0,0 +1,1141 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, + * for the blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-debugfs.h" +#include "blk-mq-tag.h" +#include "blk-mq-sched.h" +#include "mq-deadline-cgroup.h" + +/* + * See Documentation/block/deadline-iosched.rst + */ +static const int read_expire = HZ / 2; /* max time before a read is submitted. */ +static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ +static const int writes_starved = 2; /* max times reads can starve a write */ +static const int fifo_batch = 16; /* # of sequential requests treated as one + by the above parameters. For throughput. */ + +enum dd_data_dir { + DD_READ = READ, + DD_WRITE = WRITE, +}; + +enum { DD_DIR_COUNT = 2 }; + +enum dd_prio { + DD_RT_PRIO = 0, + DD_BE_PRIO = 1, + DD_IDLE_PRIO = 2, + DD_PRIO_MAX = 2, +}; + +enum { DD_PRIO_COUNT = 3 }; + +/* I/O statistics for all I/O priorities (enum dd_prio). */ +struct io_stats { + struct io_stats_per_prio stats[DD_PRIO_COUNT]; +}; + +/* + * Deadline scheduler data per I/O priority (enum dd_prio). Requests are + * present on both sort_list[] and fifo_list[]. + */ +struct dd_per_prio { + struct list_head dispatch; + struct rb_root sort_list[DD_DIR_COUNT]; + struct list_head fifo_list[DD_DIR_COUNT]; + /* Next request in FIFO order. Read, write or both are NULL. */ + struct request *next_rq[DD_DIR_COUNT]; +}; + +struct deadline_data { + /* + * run time data + */ + + /* Request queue that owns this data structure. */ + struct request_queue *queue; + + struct dd_per_prio per_prio[DD_PRIO_COUNT]; + + /* Data direction of latest dispatched request. */ + enum dd_data_dir last_dir; + unsigned int batching; /* number of sequential requests made */ + unsigned int starved; /* times reads have starved writes */ + + struct io_stats __percpu *stats; + + /* + * settings that change how the i/o scheduler behaves + */ + int fifo_expire[DD_DIR_COUNT]; + int fifo_batch; + int writes_starved; + int front_merges; + u32 async_depth; + + spinlock_t lock; + spinlock_t zone_lock; +}; + +/* Count one event of type 'event_type' and with I/O priority 'prio' */ +#define dd_count(dd, event_type, prio) do { \ + struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \ + \ + BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ + BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ + local_inc(&io_stats->stats[(prio)].event_type); \ + put_cpu_ptr(io_stats); \ +} while (0) + +/* + * Returns the total number of dd_count(dd, event_type, prio) calls across all + * CPUs. No locking or barriers since it is fine if the returned sum is slightly + * outdated. + */ +#define dd_sum(dd, event_type, prio) ({ \ + unsigned int cpu; \ + u32 sum = 0; \ + \ + BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ + BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ + for_each_present_cpu(cpu) \ + sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \ + stats[(prio)].event_type); \ + sum; \ +}) + +/* Maps an I/O priority class to a deadline scheduler priority. */ +static const enum dd_prio ioprio_class_to_prio[] = { + [IOPRIO_CLASS_NONE] = DD_BE_PRIO, + [IOPRIO_CLASS_RT] = DD_RT_PRIO, + [IOPRIO_CLASS_BE] = DD_BE_PRIO, + [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, +}; + +static inline struct rb_root * +deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) +{ + return &per_prio->sort_list[rq_data_dir(rq)]; +} + +/* + * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a + * request. + */ +static u8 dd_rq_ioclass(struct request *rq) +{ + return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); +} + +/* + * get the request after `rq' in sector-sorted order + */ +static inline struct request * +deadline_latter_request(struct request *rq) +{ + struct rb_node *node = rb_next(&rq->rb_node); + + if (node) + return rb_entry_rq(node); + + return NULL; +} + +static void +deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) +{ + struct rb_root *root = deadline_rb_root(per_prio, rq); + + elv_rb_add(root, rq); +} + +static inline void +deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) +{ + const enum dd_data_dir data_dir = rq_data_dir(rq); + + if (per_prio->next_rq[data_dir] == rq) + per_prio->next_rq[data_dir] = deadline_latter_request(rq); + + elv_rb_del(deadline_rb_root(per_prio, rq), rq); +} + +/* + * remove rq from rbtree and fifo. + */ +static void deadline_remove_request(struct request_queue *q, + struct dd_per_prio *per_prio, + struct request *rq) +{ + list_del_init(&rq->queuelist); + + /* + * We might not be on the rbtree, if we are doing an insert merge + */ + if (!RB_EMPTY_NODE(&rq->rb_node)) + deadline_del_rq_rb(per_prio, rq); + + elv_rqhash_del(q, rq); + if (q->last_merge == rq) + q->last_merge = NULL; +} + +static void dd_request_merged(struct request_queue *q, struct request *req, + enum elv_merge type) +{ + struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(req); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + /* + * if the merge was a front merge, we need to reposition request + */ + if (type == ELEVATOR_FRONT_MERGE) { + elv_rb_del(deadline_rb_root(per_prio, req), req); + deadline_add_rq_rb(per_prio, req); + } +} + +/* + * Callback function that is invoked after @next has been merged into @req. + */ +static void dd_merged_requests(struct request_queue *q, struct request *req, + struct request *next) +{ + struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(next); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_blkcg *blkcg = next->elv.priv[0]; + + dd_count(dd, merged, prio); + ddcg_count(blkcg, merged, ioprio_class); + + /* + * if next expires before rq, assign its expire time to rq + * and move into next position (next will be deleted) in fifo + */ + if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { + if (time_before((unsigned long)next->fifo_time, + (unsigned long)req->fifo_time)) { + list_move(&req->queuelist, &next->queuelist); + req->fifo_time = next->fifo_time; + } + } + + /* + * kill knowledge of next, this one is a goner + */ + deadline_remove_request(q, &dd->per_prio[prio], next); +} + +/* + * move an entry to dispatch queue + */ +static void +deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + struct request *rq) +{ + const enum dd_data_dir data_dir = rq_data_dir(rq); + + per_prio->next_rq[data_dir] = deadline_latter_request(rq); + + /* + * take it off the sort and fifo list + */ + deadline_remove_request(rq->q, per_prio, rq); +} + +/* Number of requests queued for a given priority level. */ +static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) +{ + return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio); +} + +/* + * deadline_check_fifo returns 0 if there are no expired requests on the fifo, + * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) + */ +static inline int deadline_check_fifo(struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) +{ + struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); + + /* + * rq is expired! + */ + if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) + return 1; + + return 0; +} + +/* + * For the specified data direction, return the next request to + * dispatch using arrival ordered lists. + */ +static struct request * +deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) +{ + struct request *rq; + unsigned long flags; + + if (list_empty(&per_prio->fifo_list[data_dir])) + return NULL; + + rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); + if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + spin_lock_irqsave(&dd->zone_lock, flags); + list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { + if (blk_req_can_dispatch_to_zone(rq)) + goto out; + } + rq = NULL; +out: + spin_unlock_irqrestore(&dd->zone_lock, flags); + + return rq; +} + +/* + * For the specified data direction, return the next request to + * dispatch using sector position sorted lists. + */ +static struct request * +deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) +{ + struct request *rq; + unsigned long flags; + + rq = per_prio->next_rq[data_dir]; + if (!rq) + return NULL; + + if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + spin_lock_irqsave(&dd->zone_lock, flags); + while (rq) { + if (blk_req_can_dispatch_to_zone(rq)) + break; + rq = deadline_latter_request(rq); + } + spin_unlock_irqrestore(&dd->zone_lock, flags); + + return rq; +} + +/* + * deadline_dispatch_requests selects the best request according to + * read/write expire, fifo_batch, etc + */ +static struct request *__dd_dispatch_request(struct deadline_data *dd, + struct dd_per_prio *per_prio) +{ + struct request *rq, *next_rq; + enum dd_data_dir data_dir; + struct dd_blkcg *blkcg; + enum dd_prio prio; + u8 ioprio_class; + + lockdep_assert_held(&dd->lock); + + if (!list_empty(&per_prio->dispatch)) { + rq = list_first_entry(&per_prio->dispatch, struct request, + queuelist); + list_del_init(&rq->queuelist); + goto done; + } + + /* + * batches are currently reads XOR writes + */ + rq = deadline_next_request(dd, per_prio, dd->last_dir); + if (rq && dd->batching < dd->fifo_batch) + /* we have a next request are still entitled to batch */ + goto dispatch_request; + + /* + * at this point we are not running a batch. select the appropriate + * data direction (read / write) + */ + + if (!list_empty(&per_prio->fifo_list[DD_READ])) { + BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ])); + + if (deadline_fifo_request(dd, per_prio, DD_WRITE) && + (dd->starved++ >= dd->writes_starved)) + goto dispatch_writes; + + data_dir = DD_READ; + + goto dispatch_find_request; + } + + /* + * there are either no reads or writes have been starved + */ + + if (!list_empty(&per_prio->fifo_list[DD_WRITE])) { +dispatch_writes: + BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE])); + + dd->starved = 0; + + data_dir = DD_WRITE; + + goto dispatch_find_request; + } + + return NULL; + +dispatch_find_request: + /* + * we are not running a batch, find best request for selected data_dir + */ + next_rq = deadline_next_request(dd, per_prio, data_dir); + if (deadline_check_fifo(per_prio, data_dir) || !next_rq) { + /* + * A deadline has expired, the last request was in the other + * direction, or we have run out of higher-sectored requests. + * Start again from the request with the earliest expiry time. + */ + rq = deadline_fifo_request(dd, per_prio, data_dir); + } else { + /* + * The last req was the same dir and we have a next request in + * sort order. No expired requests so continue on from here. + */ + rq = next_rq; + } + + /* + * For a zoned block device, if we only have writes queued and none of + * them can be dispatched, rq will be NULL. + */ + if (!rq) + return NULL; + + dd->last_dir = data_dir; + dd->batching = 0; + +dispatch_request: + /* + * rq is the selected appropriate request. + */ + dd->batching++; + deadline_move_request(dd, per_prio, rq); +done: + ioprio_class = dd_rq_ioclass(rq); + prio = ioprio_class_to_prio[ioprio_class]; + dd_count(dd, dispatched, prio); + blkcg = rq->elv.priv[0]; + ddcg_count(blkcg, dispatched, ioprio_class); + /* + * If the request needs its target zone locked, do it. + */ + blk_req_zone_write_lock(rq); + rq->rq_flags |= RQF_STARTED; + return rq; +} + +/* + * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). + * + * One confusing aspect here is that we get called for a specific + * hardware queue, but we may return a request that is for a + * different hardware queue. This is because mq-deadline has shared + * state for all hardware queues, in terms of sorting, FIFOs, etc. + */ +static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + struct request *rq; + enum dd_prio prio; + + spin_lock(&dd->lock); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); + if (rq) + break; + } + spin_unlock(&dd->lock); + + return rq; +} + +/* + * Called by __blk_mq_alloc_request(). The shallow_depth value set by this + * function is used by __blk_mq_get_tag(). + */ +static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +{ + struct deadline_data *dd = data->q->elevator->elevator_data; + + /* Do not throttle synchronous reads. */ + if (op_is_sync(op) && !op_is_write(op)) + return; + + /* + * Throttle asynchronous requests and writes such that these requests + * do not block the allocation of synchronous requests. + */ + data->shallow_depth = dd->async_depth; +} + +/* Called by blk_mq_update_nr_requests(). */ +static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; + + dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth); +} + +/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ +static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + dd_depth_updated(hctx); + return 0; +} + +static void dd_exit_sched(struct elevator_queue *e) +{ + struct deadline_data *dd = e->elevator_data; + enum dd_prio prio; + + dd_deactivate_policy(dd->queue); + + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); + WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); + } + + free_percpu(dd->stats); + + kfree(dd); +} + +/* + * Initialize elevator private data (deadline_data) and associate with blkcg. + */ +static int dd_init_sched(struct request_queue *q, struct elevator_type *e) +{ + struct deadline_data *dd; + struct elevator_queue *eq; + enum dd_prio prio; + int ret = -ENOMEM; + + /* + * Initialization would be very tricky if the queue is not frozen, + * hence the warning statement below. + */ + WARN_ON_ONCE(!percpu_ref_is_zero(&q->q_usage_counter)); + + eq = elevator_alloc(q, e); + if (!eq) + return ret; + + dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); + if (!dd) + goto put_eq; + + eq->elevator_data = dd; + + dd->stats = alloc_percpu_gfp(typeof(*dd->stats), + GFP_KERNEL | __GFP_ZERO); + if (!dd->stats) + goto free_dd; + + dd->queue = q; + + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + INIT_LIST_HEAD(&per_prio->dispatch); + INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); + INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); + per_prio->sort_list[DD_READ] = RB_ROOT; + per_prio->sort_list[DD_WRITE] = RB_ROOT; + } + dd->fifo_expire[DD_READ] = read_expire; + dd->fifo_expire[DD_WRITE] = write_expire; + dd->writes_starved = writes_starved; + dd->front_merges = 1; + dd->last_dir = DD_WRITE; + dd->fifo_batch = fifo_batch; + spin_lock_init(&dd->lock); + spin_lock_init(&dd->zone_lock); + + ret = dd_activate_policy(q); + if (ret) + goto free_stats; + + ret = 0; + q->elevator = eq; + return 0; + +free_stats: + free_percpu(dd->stats); + +free_dd: + kfree(dd); + +put_eq: + kobject_put(&eq->kobj); + return ret; +} + +/* + * Try to merge @bio into an existing request. If @bio has been merged into + * an existing request, store the pointer to that request into *@rq. + */ +static int dd_request_merge(struct request_queue *q, struct request **rq, + struct bio *bio) +{ + struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + sector_t sector = bio_end_sector(bio); + struct request *__rq; + + if (!dd->front_merges) + return ELEVATOR_NO_MERGE; + + __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector); + if (__rq) { + BUG_ON(sector != blk_rq_pos(__rq)); + + if (elv_bio_merge_ok(__rq, bio)) { + *rq = __rq; + return ELEVATOR_FRONT_MERGE; + } + } + + return ELEVATOR_NO_MERGE; +} + +/* + * Attempt to merge a bio into an existing request. This function is called + * before @bio is associated with a request. + */ +static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) +{ + struct deadline_data *dd = q->elevator->elevator_data; + struct request *free = NULL; + bool ret; + + spin_lock(&dd->lock); + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock(&dd->lock); + + if (free) + blk_mq_free_request(free); + + return ret; +} + +/* + * add rq to rbtree and fifo + */ +static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + const enum dd_data_dir data_dir = rq_data_dir(rq); + u16 ioprio = req_get_ioprio(rq); + u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); + struct dd_per_prio *per_prio; + enum dd_prio prio; + struct dd_blkcg *blkcg; + + lockdep_assert_held(&dd->lock); + + /* + * This may be a requeue of a write request that has locked its + * target zone. If it is the case, this releases the zone lock. + */ + blk_req_zone_write_unlock(rq); + + /* + * If a block cgroup has been associated with the submitter and if an + * I/O priority has been set in the associated block cgroup, use the + * lowest of the cgroup priority and the request priority for the + * request. If no priority has been set in the request, use the cgroup + * priority. + */ + prio = ioprio_class_to_prio[ioprio_class]; + dd_count(dd, inserted, prio); + blkcg = dd_blkcg_from_bio(rq->bio); + ddcg_count(blkcg, inserted, ioprio_class); + WARN_ON_ONCE(rq->elv.priv[0]); + rq->elv.priv[0] = blkcg; + + if (blk_mq_sched_try_insert_merge(q, rq)) + return; + + trace_block_rq_insert(rq); + + per_prio = &dd->per_prio[prio]; + if (at_head) { + list_add(&rq->queuelist, &per_prio->dispatch); + } else { + deadline_add_rq_rb(per_prio, rq); + + if (rq_mergeable(rq)) { + elv_rqhash_add(q, rq); + if (!q->last_merge) + q->last_merge = rq; + } + + /* + * set expire time and add to fifo list + */ + rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; + list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); + } +} + +/* + * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests(). + */ +static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *list, bool at_head) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + + spin_lock(&dd->lock); + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + dd_insert_request(hctx, rq, at_head); + } + spin_unlock(&dd->lock); +} + +/* Callback from inside blk_mq_rq_ctx_init(). */ +static void dd_prepare_request(struct request *rq) +{ + rq->elv.priv[0] = NULL; +} + +/* + * Callback from inside blk_mq_free_request(). + * + * For zoned block devices, write unlock the target zone of + * completed write requests. Do this while holding the zone lock + * spinlock so that the zone is never unlocked while deadline_fifo_request() + * or deadline_next_request() are executing. This function is called for + * all requests, whether or not these requests complete successfully. + * + * For a zoned block device, __dd_dispatch_request() may have stopped + * dispatching requests if all the queued requests are write requests directed + * at zones that are already locked due to on-going write requests. To ensure + * write request dispatch progress in this case, mark the queue as needing a + * restart to ensure that the queue is run again after completion of the + * request and zones being unlocked. + */ +static void dd_finish_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct deadline_data *dd = q->elevator->elevator_data; + struct dd_blkcg *blkcg = rq->elv.priv[0]; + const u8 ioprio_class = dd_rq_ioclass(rq); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + dd_count(dd, completed, prio); + ddcg_count(blkcg, completed, ioprio_class); + + if (blk_queue_is_zoned(q)) { + unsigned long flags; + + spin_lock_irqsave(&dd->zone_lock, flags); + blk_req_zone_write_unlock(rq); + if (!list_empty(&per_prio->fifo_list[DD_WRITE])) + blk_mq_sched_mark_restart_hctx(rq->mq_hctx); + spin_unlock_irqrestore(&dd->zone_lock, flags); + } +} + +static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) +{ + return !list_empty_careful(&per_prio->dispatch) || + !list_empty_careful(&per_prio->fifo_list[DD_READ]) || + !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); +} + +static bool dd_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + enum dd_prio prio; + + for (prio = 0; prio <= DD_PRIO_MAX; prio++) + if (dd_has_work_for_prio(&dd->per_prio[prio])) + return true; + + return false; +} + +/* + * sysfs parts below + */ +#define SHOW_INT(__FUNC, __VAR) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + \ + return sysfs_emit(page, "%d\n", __VAR); \ +} +#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) +SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); +SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); +SHOW_INT(deadline_writes_starved_show, dd->writes_starved); +SHOW_INT(deadline_front_merges_show, dd->front_merges); +SHOW_INT(deadline_async_depth_show, dd->front_merges); +SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); +#undef SHOW_INT +#undef SHOW_JIFFIES + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + int __data, __ret; \ + \ + __ret = kstrtoint(page, 0, &__data); \ + if (__ret < 0) \ + return __ret; \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + *(__PTR) = __CONV(__data); \ + return count; \ +} +#define STORE_INT(__FUNC, __PTR, MIN, MAX) \ + STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, ) +#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \ + STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) +STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); +STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); +STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); +STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); +STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX); +STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); +#undef STORE_FUNCTION +#undef STORE_INT +#undef STORE_JIFFIES + +#define DD_ATTR(name) \ + __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) + +static struct elv_fs_entry deadline_attrs[] = { + DD_ATTR(read_expire), + DD_ATTR(write_expire), + DD_ATTR(writes_starved), + DD_ATTR(front_merges), + DD_ATTR(async_depth), + DD_ATTR(fifo_batch), + __ATTR_NULL +}; + +#ifdef CONFIG_BLK_DEBUG_FS +#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \ +static void *deadline_##name##_fifo_start(struct seq_file *m, \ + loff_t *pos) \ + __acquires(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + spin_lock(&dd->lock); \ + return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \ +} \ + \ +static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ + loff_t *pos) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \ +} \ + \ +static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ + __releases(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + \ + spin_unlock(&dd->lock); \ +} \ + \ +static const struct seq_operations deadline_##name##_fifo_seq_ops = { \ + .start = deadline_##name##_fifo_start, \ + .next = deadline_##name##_fifo_next, \ + .stop = deadline_##name##_fifo_stop, \ + .show = blk_mq_debugfs_rq_show, \ +}; \ + \ +static int deadline_##name##_next_rq_show(void *data, \ + struct seq_file *m) \ +{ \ + struct request_queue *q = data; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + struct request *rq = per_prio->next_rq[data_dir]; \ + \ + if (rq) \ + __blk_mq_debugfs_rq_show(m, rq); \ + return 0; \ +} + +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2); +#undef DEADLINE_DEBUGFS_DDIR_ATTRS + +static int deadline_batching_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u\n", dd->batching); + return 0; +} + +static int deadline_starved_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u\n", dd->starved); + return 0; +} + +static int dd_async_depth_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u\n", dd->async_depth); + return 0; +} + +static int dd_queued_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO), + dd_queued(dd, DD_BE_PRIO), + dd_queued(dd, DD_IDLE_PRIO)); + return 0; +} + +/* Number of requests owned by the block driver for a given priority. */ +static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) +{ + return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio) + - dd_sum(dd, completed, prio); +} + +static int dd_owned_by_driver_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO), + dd_owned_by_driver(dd, DD_BE_PRIO), + dd_owned_by_driver(dd, DD_IDLE_PRIO)); + return 0; +} + +#define DEADLINE_DISPATCH_ATTR(prio) \ +static void *deadline_dispatch##prio##_start(struct seq_file *m, \ + loff_t *pos) \ + __acquires(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + spin_lock(&dd->lock); \ + return seq_list_start(&per_prio->dispatch, *pos); \ +} \ + \ +static void *deadline_dispatch##prio##_next(struct seq_file *m, \ + void *v, loff_t *pos) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + return seq_list_next(v, &per_prio->dispatch, pos); \ +} \ + \ +static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ + __releases(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + \ + spin_unlock(&dd->lock); \ +} \ + \ +static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ + .start = deadline_dispatch##prio##_start, \ + .next = deadline_dispatch##prio##_next, \ + .stop = deadline_dispatch##prio##_stop, \ + .show = blk_mq_debugfs_rq_show, \ +} + +DEADLINE_DISPATCH_ATTR(0); +DEADLINE_DISPATCH_ATTR(1); +DEADLINE_DISPATCH_ATTR(2); +#undef DEADLINE_DISPATCH_ATTR + +#define DEADLINE_QUEUE_DDIR_ATTRS(name) \ + {#name "_fifo_list", 0400, \ + .seq_ops = &deadline_##name##_fifo_seq_ops} +#define DEADLINE_NEXT_RQ_ATTR(name) \ + {#name "_next_rq", 0400, deadline_##name##_next_rq_show} +static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { + DEADLINE_QUEUE_DDIR_ATTRS(read0), + DEADLINE_QUEUE_DDIR_ATTRS(write0), + DEADLINE_QUEUE_DDIR_ATTRS(read1), + DEADLINE_QUEUE_DDIR_ATTRS(write1), + DEADLINE_QUEUE_DDIR_ATTRS(read2), + DEADLINE_QUEUE_DDIR_ATTRS(write2), + DEADLINE_NEXT_RQ_ATTR(read0), + DEADLINE_NEXT_RQ_ATTR(write0), + DEADLINE_NEXT_RQ_ATTR(read1), + DEADLINE_NEXT_RQ_ATTR(write1), + DEADLINE_NEXT_RQ_ATTR(read2), + DEADLINE_NEXT_RQ_ATTR(write2), + {"batching", 0400, deadline_batching_show}, + {"starved", 0400, deadline_starved_show}, + {"async_depth", 0400, dd_async_depth_show}, + {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, + {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, + {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, + {"owned_by_driver", 0400, dd_owned_by_driver_show}, + {"queued", 0400, dd_queued_show}, + {}, +}; +#undef DEADLINE_QUEUE_DDIR_ATTRS +#endif + +static struct elevator_type mq_deadline = { + .ops = { + .depth_updated = dd_depth_updated, + .limit_depth = dd_limit_depth, + .insert_requests = dd_insert_requests, + .dispatch_request = dd_dispatch_request, + .prepare_request = dd_prepare_request, + .finish_request = dd_finish_request, + .next_request = elv_rb_latter_request, + .former_request = elv_rb_former_request, + .bio_merge = dd_bio_merge, + .request_merge = dd_request_merge, + .requests_merged = dd_merged_requests, + .request_merged = dd_request_merged, + .has_work = dd_has_work, + .init_sched = dd_init_sched, + .exit_sched = dd_exit_sched, + .init_hctx = dd_init_hctx, + }, + +#ifdef CONFIG_BLK_DEBUG_FS + .queue_debugfs_attrs = deadline_queue_debugfs_attrs, +#endif + .elevator_attrs = deadline_attrs, + .elevator_name = "mq-deadline", + .elevator_alias = "deadline", + .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, + .elevator_owner = THIS_MODULE, +}; +MODULE_ALIAS("mq-deadline-iosched"); + +static int __init deadline_init(void) +{ + int ret; + + ret = elv_register(&mq_deadline); + if (ret) + goto out; + ret = dd_blkcg_init(); + if (ret) + goto unreg; + +out: + return ret; + +unreg: + elv_unregister(&mq_deadline); + goto out; +} + +static void __exit deadline_exit(void) +{ + dd_blkcg_exit(); + elv_unregister(&mq_deadline); +} + +module_init(deadline_init); +module_exit(deadline_exit); + +MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MQ deadline IO scheduler"); diff --git a/block/mq-deadline.c b/block/mq-deadline.c deleted file mode 100644 index 04d9d6b3745b..000000000000 --- a/block/mq-deadline.c +++ /dev/null @@ -1,1095 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, - * for the blk-mq scheduling framework - * - * Copyright (C) 2016 Jens Axboe - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "blk.h" -#include "blk-mq.h" -#include "blk-mq-debugfs.h" -#include "blk-mq-tag.h" -#include "blk-mq-sched.h" - -/* - * See Documentation/block/deadline-iosched.rst - */ -static const int read_expire = HZ / 2; /* max time before a read is submitted. */ -static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ -static const int writes_starved = 2; /* max times reads can starve a write */ -static const int fifo_batch = 16; /* # of sequential requests treated as one - by the above parameters. For throughput. */ - -enum dd_data_dir { - DD_READ = READ, - DD_WRITE = WRITE, -}; - -enum { DD_DIR_COUNT = 2 }; - -enum dd_prio { - DD_RT_PRIO = 0, - DD_BE_PRIO = 1, - DD_IDLE_PRIO = 2, - DD_PRIO_MAX = 2, -}; - -enum { DD_PRIO_COUNT = 3 }; - -/* I/O statistics per I/O priority. */ -struct io_stats_per_prio { - local_t inserted; - local_t merged; - local_t dispatched; - local_t completed; -}; - -/* I/O statistics for all I/O priorities (enum dd_prio). */ -struct io_stats { - struct io_stats_per_prio stats[DD_PRIO_COUNT]; -}; - -/* - * Deadline scheduler data per I/O priority (enum dd_prio). Requests are - * present on both sort_list[] and fifo_list[]. - */ -struct dd_per_prio { - struct list_head dispatch; - struct rb_root sort_list[DD_DIR_COUNT]; - struct list_head fifo_list[DD_DIR_COUNT]; - /* Next request in FIFO order. Read, write or both are NULL. */ - struct request *next_rq[DD_DIR_COUNT]; -}; - -struct deadline_data { - /* - * run time data - */ - - struct dd_per_prio per_prio[DD_PRIO_COUNT]; - - /* Data direction of latest dispatched request. */ - enum dd_data_dir last_dir; - unsigned int batching; /* number of sequential requests made */ - unsigned int starved; /* times reads have starved writes */ - - struct io_stats __percpu *stats; - - /* - * settings that change how the i/o scheduler behaves - */ - int fifo_expire[DD_DIR_COUNT]; - int fifo_batch; - int writes_starved; - int front_merges; - u32 async_depth; - - spinlock_t lock; - spinlock_t zone_lock; -}; - -/* Count one event of type 'event_type' and with I/O priority 'prio' */ -#define dd_count(dd, event_type, prio) do { \ - struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \ - \ - BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ - BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ - local_inc(&io_stats->stats[(prio)].event_type); \ - put_cpu_ptr(io_stats); \ -} while (0) - -/* - * Returns the total number of dd_count(dd, event_type, prio) calls across all - * CPUs. No locking or barriers since it is fine if the returned sum is slightly - * outdated. - */ -#define dd_sum(dd, event_type, prio) ({ \ - unsigned int cpu; \ - u32 sum = 0; \ - \ - BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ - BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ - for_each_present_cpu(cpu) \ - sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \ - stats[(prio)].event_type); \ - sum; \ -}) - -/* Maps an I/O priority class to a deadline scheduler priority. */ -static const enum dd_prio ioprio_class_to_prio[] = { - [IOPRIO_CLASS_NONE] = DD_BE_PRIO, - [IOPRIO_CLASS_RT] = DD_RT_PRIO, - [IOPRIO_CLASS_BE] = DD_BE_PRIO, - [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, -}; - -static inline struct rb_root * -deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) -{ - return &per_prio->sort_list[rq_data_dir(rq)]; -} - -/* - * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a - * request. - */ -static u8 dd_rq_ioclass(struct request *rq) -{ - return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); -} - -/* - * get the request after `rq' in sector-sorted order - */ -static inline struct request * -deadline_latter_request(struct request *rq) -{ - struct rb_node *node = rb_next(&rq->rb_node); - - if (node) - return rb_entry_rq(node); - - return NULL; -} - -static void -deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) -{ - struct rb_root *root = deadline_rb_root(per_prio, rq); - - elv_rb_add(root, rq); -} - -static inline void -deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) -{ - const enum dd_data_dir data_dir = rq_data_dir(rq); - - if (per_prio->next_rq[data_dir] == rq) - per_prio->next_rq[data_dir] = deadline_latter_request(rq); - - elv_rb_del(deadline_rb_root(per_prio, rq), rq); -} - -/* - * remove rq from rbtree and fifo. - */ -static void deadline_remove_request(struct request_queue *q, - struct dd_per_prio *per_prio, - struct request *rq) -{ - list_del_init(&rq->queuelist); - - /* - * We might not be on the rbtree, if we are doing an insert merge - */ - if (!RB_EMPTY_NODE(&rq->rb_node)) - deadline_del_rq_rb(per_prio, rq); - - elv_rqhash_del(q, rq); - if (q->last_merge == rq) - q->last_merge = NULL; -} - -static void dd_request_merged(struct request_queue *q, struct request *req, - enum elv_merge type) -{ - struct deadline_data *dd = q->elevator->elevator_data; - const u8 ioprio_class = dd_rq_ioclass(req); - const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - struct dd_per_prio *per_prio = &dd->per_prio[prio]; - - /* - * if the merge was a front merge, we need to reposition request - */ - if (type == ELEVATOR_FRONT_MERGE) { - elv_rb_del(deadline_rb_root(per_prio, req), req); - deadline_add_rq_rb(per_prio, req); - } -} - -/* - * Callback function that is invoked after @next has been merged into @req. - */ -static void dd_merged_requests(struct request_queue *q, struct request *req, - struct request *next) -{ - struct deadline_data *dd = q->elevator->elevator_data; - const u8 ioprio_class = dd_rq_ioclass(next); - const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - - dd_count(dd, merged, prio); - - /* - * if next expires before rq, assign its expire time to rq - * and move into next position (next will be deleted) in fifo - */ - if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { - if (time_before((unsigned long)next->fifo_time, - (unsigned long)req->fifo_time)) { - list_move(&req->queuelist, &next->queuelist); - req->fifo_time = next->fifo_time; - } - } - - /* - * kill knowledge of next, this one is a goner - */ - deadline_remove_request(q, &dd->per_prio[prio], next); -} - -/* - * move an entry to dispatch queue - */ -static void -deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, - struct request *rq) -{ - const enum dd_data_dir data_dir = rq_data_dir(rq); - - per_prio->next_rq[data_dir] = deadline_latter_request(rq); - - /* - * take it off the sort and fifo list - */ - deadline_remove_request(rq->q, per_prio, rq); -} - -/* Number of requests queued for a given priority level. */ -static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) -{ - return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio); -} - -/* - * deadline_check_fifo returns 0 if there are no expired requests on the fifo, - * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) - */ -static inline int deadline_check_fifo(struct dd_per_prio *per_prio, - enum dd_data_dir data_dir) -{ - struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); - - /* - * rq is expired! - */ - if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) - return 1; - - return 0; -} - -/* - * For the specified data direction, return the next request to - * dispatch using arrival ordered lists. - */ -static struct request * -deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, - enum dd_data_dir data_dir) -{ - struct request *rq; - unsigned long flags; - - if (list_empty(&per_prio->fifo_list[data_dir])) - return NULL; - - rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. - */ - spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { - if (blk_req_can_dispatch_to_zone(rq)) - goto out; - } - rq = NULL; -out: - spin_unlock_irqrestore(&dd->zone_lock, flags); - - return rq; -} - -/* - * For the specified data direction, return the next request to - * dispatch using sector position sorted lists. - */ -static struct request * -deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, - enum dd_data_dir data_dir) -{ - struct request *rq; - unsigned long flags; - - rq = per_prio->next_rq[data_dir]; - if (!rq) - return NULL; - - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. - */ - spin_lock_irqsave(&dd->zone_lock, flags); - while (rq) { - if (blk_req_can_dispatch_to_zone(rq)) - break; - rq = deadline_latter_request(rq); - } - spin_unlock_irqrestore(&dd->zone_lock, flags); - - return rq; -} - -/* - * deadline_dispatch_requests selects the best request according to - * read/write expire, fifo_batch, etc - */ -static struct request *__dd_dispatch_request(struct deadline_data *dd, - struct dd_per_prio *per_prio) -{ - struct request *rq, *next_rq; - enum dd_data_dir data_dir; - enum dd_prio prio; - u8 ioprio_class; - - lockdep_assert_held(&dd->lock); - - if (!list_empty(&per_prio->dispatch)) { - rq = list_first_entry(&per_prio->dispatch, struct request, - queuelist); - list_del_init(&rq->queuelist); - goto done; - } - - /* - * batches are currently reads XOR writes - */ - rq = deadline_next_request(dd, per_prio, dd->last_dir); - if (rq && dd->batching < dd->fifo_batch) - /* we have a next request are still entitled to batch */ - goto dispatch_request; - - /* - * at this point we are not running a batch. select the appropriate - * data direction (read / write) - */ - - if (!list_empty(&per_prio->fifo_list[DD_READ])) { - BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ])); - - if (deadline_fifo_request(dd, per_prio, DD_WRITE) && - (dd->starved++ >= dd->writes_starved)) - goto dispatch_writes; - - data_dir = DD_READ; - - goto dispatch_find_request; - } - - /* - * there are either no reads or writes have been starved - */ - - if (!list_empty(&per_prio->fifo_list[DD_WRITE])) { -dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE])); - - dd->starved = 0; - - data_dir = DD_WRITE; - - goto dispatch_find_request; - } - - return NULL; - -dispatch_find_request: - /* - * we are not running a batch, find best request for selected data_dir - */ - next_rq = deadline_next_request(dd, per_prio, data_dir); - if (deadline_check_fifo(per_prio, data_dir) || !next_rq) { - /* - * A deadline has expired, the last request was in the other - * direction, or we have run out of higher-sectored requests. - * Start again from the request with the earliest expiry time. - */ - rq = deadline_fifo_request(dd, per_prio, data_dir); - } else { - /* - * The last req was the same dir and we have a next request in - * sort order. No expired requests so continue on from here. - */ - rq = next_rq; - } - - /* - * For a zoned block device, if we only have writes queued and none of - * them can be dispatched, rq will be NULL. - */ - if (!rq) - return NULL; - - dd->last_dir = data_dir; - dd->batching = 0; - -dispatch_request: - /* - * rq is the selected appropriate request. - */ - dd->batching++; - deadline_move_request(dd, per_prio, rq); -done: - ioprio_class = dd_rq_ioclass(rq); - prio = ioprio_class_to_prio[ioprio_class]; - dd_count(dd, dispatched, prio); - /* - * If the request needs its target zone locked, do it. - */ - blk_req_zone_write_lock(rq); - rq->rq_flags |= RQF_STARTED; - return rq; -} - -/* - * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). - * - * One confusing aspect here is that we get called for a specific - * hardware queue, but we may return a request that is for a - * different hardware queue. This is because mq-deadline has shared - * state for all hardware queues, in terms of sorting, FIFOs, etc. - */ -static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) -{ - struct deadline_data *dd = hctx->queue->elevator->elevator_data; - struct request *rq; - enum dd_prio prio; - - spin_lock(&dd->lock); - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { - rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); - if (rq) - break; - } - spin_unlock(&dd->lock); - - return rq; -} - -/* - * Called by __blk_mq_alloc_request(). The shallow_depth value set by this - * function is used by __blk_mq_get_tag(). - */ -static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) -{ - struct deadline_data *dd = data->q->elevator->elevator_data; - - /* Do not throttle synchronous reads. */ - if (op_is_sync(op) && !op_is_write(op)) - return; - - /* - * Throttle asynchronous requests and writes such that these requests - * do not block the allocation of synchronous requests. - */ - data->shallow_depth = dd->async_depth; -} - -/* Called by blk_mq_update_nr_requests(). */ -static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) -{ - struct request_queue *q = hctx->queue; - struct deadline_data *dd = q->elevator->elevator_data; - struct blk_mq_tags *tags = hctx->sched_tags; - - dd->async_depth = max(1UL, 3 * q->nr_requests / 4); - - sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth); -} - -/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ -static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) -{ - dd_depth_updated(hctx); - return 0; -} - -static void dd_exit_sched(struct elevator_queue *e) -{ - struct deadline_data *dd = e->elevator_data; - enum dd_prio prio; - - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { - struct dd_per_prio *per_prio = &dd->per_prio[prio]; - - WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); - WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); - } - - free_percpu(dd->stats); - - kfree(dd); -} - -/* - * initialize elevator private data (deadline_data). - */ -static int dd_init_sched(struct request_queue *q, struct elevator_type *e) -{ - struct deadline_data *dd; - struct elevator_queue *eq; - enum dd_prio prio; - int ret = -ENOMEM; - - eq = elevator_alloc(q, e); - if (!eq) - return ret; - - dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); - if (!dd) - goto put_eq; - - eq->elevator_data = dd; - - dd->stats = alloc_percpu_gfp(typeof(*dd->stats), - GFP_KERNEL | __GFP_ZERO); - if (!dd->stats) - goto free_dd; - - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { - struct dd_per_prio *per_prio = &dd->per_prio[prio]; - - INIT_LIST_HEAD(&per_prio->dispatch); - INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); - INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); - per_prio->sort_list[DD_READ] = RB_ROOT; - per_prio->sort_list[DD_WRITE] = RB_ROOT; - } - dd->fifo_expire[DD_READ] = read_expire; - dd->fifo_expire[DD_WRITE] = write_expire; - dd->writes_starved = writes_starved; - dd->front_merges = 1; - dd->last_dir = DD_WRITE; - dd->fifo_batch = fifo_batch; - spin_lock_init(&dd->lock); - spin_lock_init(&dd->zone_lock); - - q->elevator = eq; - return 0; - -free_dd: - kfree(dd); - -put_eq: - kobject_put(&eq->kobj); - return ret; -} - -/* - * Try to merge @bio into an existing request. If @bio has been merged into - * an existing request, store the pointer to that request into *@rq. - */ -static int dd_request_merge(struct request_queue *q, struct request **rq, - struct bio *bio) -{ - struct deadline_data *dd = q->elevator->elevator_data; - const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); - const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - struct dd_per_prio *per_prio = &dd->per_prio[prio]; - sector_t sector = bio_end_sector(bio); - struct request *__rq; - - if (!dd->front_merges) - return ELEVATOR_NO_MERGE; - - __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector); - if (__rq) { - BUG_ON(sector != blk_rq_pos(__rq)); - - if (elv_bio_merge_ok(__rq, bio)) { - *rq = __rq; - return ELEVATOR_FRONT_MERGE; - } - } - - return ELEVATOR_NO_MERGE; -} - -/* - * Attempt to merge a bio into an existing request. This function is called - * before @bio is associated with a request. - */ -static bool dd_bio_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs) -{ - struct deadline_data *dd = q->elevator->elevator_data; - struct request *free = NULL; - bool ret; - - spin_lock(&dd->lock); - ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); - spin_unlock(&dd->lock); - - if (free) - blk_mq_free_request(free); - - return ret; -} - -/* - * add rq to rbtree and fifo - */ -static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head) -{ - struct request_queue *q = hctx->queue; - struct deadline_data *dd = q->elevator->elevator_data; - const enum dd_data_dir data_dir = rq_data_dir(rq); - u16 ioprio = req_get_ioprio(rq); - u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); - struct dd_per_prio *per_prio; - enum dd_prio prio; - - lockdep_assert_held(&dd->lock); - - /* - * This may be a requeue of a write request that has locked its - * target zone. If it is the case, this releases the zone lock. - */ - blk_req_zone_write_unlock(rq); - - prio = ioprio_class_to_prio[ioprio_class]; - dd_count(dd, inserted, prio); - - if (blk_mq_sched_try_insert_merge(q, rq)) - return; - - trace_block_rq_insert(rq); - - per_prio = &dd->per_prio[prio]; - if (at_head) { - list_add(&rq->queuelist, &per_prio->dispatch); - } else { - deadline_add_rq_rb(per_prio, rq); - - if (rq_mergeable(rq)) { - elv_rqhash_add(q, rq); - if (!q->last_merge) - q->last_merge = rq; - } - - /* - * set expire time and add to fifo list - */ - rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; - list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); - } -} - -/* - * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests(). - */ -static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, - struct list_head *list, bool at_head) -{ - struct request_queue *q = hctx->queue; - struct deadline_data *dd = q->elevator->elevator_data; - - spin_lock(&dd->lock); - while (!list_empty(list)) { - struct request *rq; - - rq = list_first_entry(list, struct request, queuelist); - list_del_init(&rq->queuelist); - dd_insert_request(hctx, rq, at_head); - } - spin_unlock(&dd->lock); -} - -/* - * Nothing to do here. This is defined only to ensure that .finish_request - * method is called upon request completion. - */ -static void dd_prepare_request(struct request *rq) -{ -} - -/* - * Callback from inside blk_mq_free_request(). - * - * For zoned block devices, write unlock the target zone of - * completed write requests. Do this while holding the zone lock - * spinlock so that the zone is never unlocked while deadline_fifo_request() - * or deadline_next_request() are executing. This function is called for - * all requests, whether or not these requests complete successfully. - * - * For a zoned block device, __dd_dispatch_request() may have stopped - * dispatching requests if all the queued requests are write requests directed - * at zones that are already locked due to on-going write requests. To ensure - * write request dispatch progress in this case, mark the queue as needing a - * restart to ensure that the queue is run again after completion of the - * request and zones being unlocked. - */ -static void dd_finish_request(struct request *rq) -{ - struct request_queue *q = rq->q; - struct deadline_data *dd = q->elevator->elevator_data; - const u8 ioprio_class = dd_rq_ioclass(rq); - const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - struct dd_per_prio *per_prio = &dd->per_prio[prio]; - - dd_count(dd, completed, prio); - - if (blk_queue_is_zoned(q)) { - unsigned long flags; - - spin_lock_irqsave(&dd->zone_lock, flags); - blk_req_zone_write_unlock(rq); - if (!list_empty(&per_prio->fifo_list[DD_WRITE])) - blk_mq_sched_mark_restart_hctx(rq->mq_hctx); - spin_unlock_irqrestore(&dd->zone_lock, flags); - } -} - -static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) -{ - return !list_empty_careful(&per_prio->dispatch) || - !list_empty_careful(&per_prio->fifo_list[DD_READ]) || - !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); -} - -static bool dd_has_work(struct blk_mq_hw_ctx *hctx) -{ - struct deadline_data *dd = hctx->queue->elevator->elevator_data; - enum dd_prio prio; - - for (prio = 0; prio <= DD_PRIO_MAX; prio++) - if (dd_has_work_for_prio(&dd->per_prio[prio])) - return true; - - return false; -} - -/* - * sysfs parts below - */ -#define SHOW_INT(__FUNC, __VAR) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct deadline_data *dd = e->elevator_data; \ - \ - return sysfs_emit(page, "%d\n", __VAR); \ -} -#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) -SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); -SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); -SHOW_INT(deadline_writes_starved_show, dd->writes_starved); -SHOW_INT(deadline_front_merges_show, dd->front_merges); -SHOW_INT(deadline_async_depth_show, dd->front_merges); -SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); -#undef SHOW_INT -#undef SHOW_JIFFIES - -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct deadline_data *dd = e->elevator_data; \ - int __data, __ret; \ - \ - __ret = kstrtoint(page, 0, &__data); \ - if (__ret < 0) \ - return __ret; \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ - *(__PTR) = __CONV(__data); \ - return count; \ -} -#define STORE_INT(__FUNC, __PTR, MIN, MAX) \ - STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, ) -#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \ - STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) -STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); -STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); -STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); -STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); -STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX); -STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); -#undef STORE_FUNCTION -#undef STORE_INT -#undef STORE_JIFFIES - -#define DD_ATTR(name) \ - __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) - -static struct elv_fs_entry deadline_attrs[] = { - DD_ATTR(read_expire), - DD_ATTR(write_expire), - DD_ATTR(writes_starved), - DD_ATTR(front_merges), - DD_ATTR(async_depth), - DD_ATTR(fifo_batch), - __ATTR_NULL -}; - -#ifdef CONFIG_BLK_DEBUG_FS -#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \ -static void *deadline_##name##_fifo_start(struct seq_file *m, \ - loff_t *pos) \ - __acquires(&dd->lock) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - \ - spin_lock(&dd->lock); \ - return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \ -} \ - \ -static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ - loff_t *pos) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - \ - return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \ -} \ - \ -static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ - __releases(&dd->lock) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - \ - spin_unlock(&dd->lock); \ -} \ - \ -static const struct seq_operations deadline_##name##_fifo_seq_ops = { \ - .start = deadline_##name##_fifo_start, \ - .next = deadline_##name##_fifo_next, \ - .stop = deadline_##name##_fifo_stop, \ - .show = blk_mq_debugfs_rq_show, \ -}; \ - \ -static int deadline_##name##_next_rq_show(void *data, \ - struct seq_file *m) \ -{ \ - struct request_queue *q = data; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - struct request *rq = per_prio->next_rq[data_dir]; \ - \ - if (rq) \ - __blk_mq_debugfs_rq_show(m, rq); \ - return 0; \ -} - -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0); -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0); -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1); -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1); -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2); -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2); -#undef DEADLINE_DEBUGFS_DDIR_ATTRS - -static int deadline_batching_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct deadline_data *dd = q->elevator->elevator_data; - - seq_printf(m, "%u\n", dd->batching); - return 0; -} - -static int deadline_starved_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct deadline_data *dd = q->elevator->elevator_data; - - seq_printf(m, "%u\n", dd->starved); - return 0; -} - -static int dd_async_depth_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct deadline_data *dd = q->elevator->elevator_data; - - seq_printf(m, "%u\n", dd->async_depth); - return 0; -} - -static int dd_queued_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct deadline_data *dd = q->elevator->elevator_data; - - seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO), - dd_queued(dd, DD_BE_PRIO), - dd_queued(dd, DD_IDLE_PRIO)); - return 0; -} - -/* Number of requests owned by the block driver for a given priority. */ -static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) -{ - return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio) - - dd_sum(dd, completed, prio); -} - -static int dd_owned_by_driver_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct deadline_data *dd = q->elevator->elevator_data; - - seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO), - dd_owned_by_driver(dd, DD_BE_PRIO), - dd_owned_by_driver(dd, DD_IDLE_PRIO)); - return 0; -} - -#define DEADLINE_DISPATCH_ATTR(prio) \ -static void *deadline_dispatch##prio##_start(struct seq_file *m, \ - loff_t *pos) \ - __acquires(&dd->lock) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - \ - spin_lock(&dd->lock); \ - return seq_list_start(&per_prio->dispatch, *pos); \ -} \ - \ -static void *deadline_dispatch##prio##_next(struct seq_file *m, \ - void *v, loff_t *pos) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - \ - return seq_list_next(v, &per_prio->dispatch, pos); \ -} \ - \ -static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ - __releases(&dd->lock) \ -{ \ - struct request_queue *q = m->private; \ - struct deadline_data *dd = q->elevator->elevator_data; \ - \ - spin_unlock(&dd->lock); \ -} \ - \ -static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ - .start = deadline_dispatch##prio##_start, \ - .next = deadline_dispatch##prio##_next, \ - .stop = deadline_dispatch##prio##_stop, \ - .show = blk_mq_debugfs_rq_show, \ -} - -DEADLINE_DISPATCH_ATTR(0); -DEADLINE_DISPATCH_ATTR(1); -DEADLINE_DISPATCH_ATTR(2); -#undef DEADLINE_DISPATCH_ATTR - -#define DEADLINE_QUEUE_DDIR_ATTRS(name) \ - {#name "_fifo_list", 0400, \ - .seq_ops = &deadline_##name##_fifo_seq_ops} -#define DEADLINE_NEXT_RQ_ATTR(name) \ - {#name "_next_rq", 0400, deadline_##name##_next_rq_show} -static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { - DEADLINE_QUEUE_DDIR_ATTRS(read0), - DEADLINE_QUEUE_DDIR_ATTRS(write0), - DEADLINE_QUEUE_DDIR_ATTRS(read1), - DEADLINE_QUEUE_DDIR_ATTRS(write1), - DEADLINE_QUEUE_DDIR_ATTRS(read2), - DEADLINE_QUEUE_DDIR_ATTRS(write2), - DEADLINE_NEXT_RQ_ATTR(read0), - DEADLINE_NEXT_RQ_ATTR(write0), - DEADLINE_NEXT_RQ_ATTR(read1), - DEADLINE_NEXT_RQ_ATTR(write1), - DEADLINE_NEXT_RQ_ATTR(read2), - DEADLINE_NEXT_RQ_ATTR(write2), - {"batching", 0400, deadline_batching_show}, - {"starved", 0400, deadline_starved_show}, - {"async_depth", 0400, dd_async_depth_show}, - {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, - {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, - {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, - {"owned_by_driver", 0400, dd_owned_by_driver_show}, - {"queued", 0400, dd_queued_show}, - {}, -}; -#undef DEADLINE_QUEUE_DDIR_ATTRS -#endif - -static struct elevator_type mq_deadline = { - .ops = { - .depth_updated = dd_depth_updated, - .limit_depth = dd_limit_depth, - .insert_requests = dd_insert_requests, - .dispatch_request = dd_dispatch_request, - .prepare_request = dd_prepare_request, - .finish_request = dd_finish_request, - .next_request = elv_rb_latter_request, - .former_request = elv_rb_former_request, - .bio_merge = dd_bio_merge, - .request_merge = dd_request_merge, - .requests_merged = dd_merged_requests, - .request_merged = dd_request_merged, - .has_work = dd_has_work, - .init_sched = dd_init_sched, - .exit_sched = dd_exit_sched, - .init_hctx = dd_init_hctx, - }, - -#ifdef CONFIG_BLK_DEBUG_FS - .queue_debugfs_attrs = deadline_queue_debugfs_attrs, -#endif - .elevator_attrs = deadline_attrs, - .elevator_name = "mq-deadline", - .elevator_alias = "deadline", - .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, - .elevator_owner = THIS_MODULE, -}; -MODULE_ALIAS("mq-deadline-iosched"); - -static int __init deadline_init(void) -{ - return elv_register(&mq_deadline); -} - -static void __exit deadline_exit(void) -{ - elv_unregister(&mq_deadline); -} - -module_init(deadline_init); -module_exit(deadline_exit); - -MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("MQ deadline IO scheduler"); -- cgit v1.2.3