diff options
Diffstat (limited to 'block/blk-iocost.c')
-rw-r--r-- | block/blk-iocost.c | 287 |
1 files changed, 155 insertions, 132 deletions
diff --git a/block/blk-iocost.c b/block/blk-iocost.c index bbe86d1199dc..ffa418c0dcb1 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -39,7 +39,7 @@ * On top of that, a size cost proportional to the length of the IO is * added. While simple, this model captures the operational * characteristics of a wide varienty of devices well enough. Default - * paramters for several different classes of devices are provided and the + * parameters for several different classes of devices are provided and the * parameters can be configured from userspace via * /sys/fs/cgroup/io.cost.model. * @@ -77,7 +77,7 @@ * * This constitutes the basis of IO capacity distribution. Each cgroup's * vtime is running at a rate determined by its hweight. A cgroup tracks - * the vtime consumed by past IOs and can issue a new IO iff doing so + * the vtime consumed by past IOs and can issue a new IO if doing so * wouldn't outrun the current device vtime. Otherwise, the IO is * suspended until the vtime has progressed enough to cover it. * @@ -155,7 +155,7 @@ * Instead of debugfs or other clumsy monitoring mechanisms, this * controller uses a drgn based monitoring script - * tools/cgroup/iocost_monitor.py. For details on drgn, please see - * https://github.com/osandov/drgn. The ouput looks like the following. + * https://github.com/osandov/drgn. The output looks like the following. * * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% * active weight hweight% inflt% dbt delay usages% @@ -370,8 +370,6 @@ enum { AUTOP_SSD_FAST, }; -struct ioc_gq; - struct ioc_params { u32 qos[NR_QOS_PARAMS]; u64 i_lcoefs[NR_I_LCOEFS]; @@ -492,7 +490,7 @@ struct ioc_gq { /* * `vtime` is this iocg's vtime cursor which progresses as IOs are * issued. If lagging behind device vtime, the delta represents - * the currently available IO budget. If runnning ahead, the + * the currently available IO budget. If running ahead, the * overage. * * `vtime_done` is the same but progressed on completion rather @@ -973,6 +971,58 @@ done: ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); } +static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, + int nr_lagging, int nr_shortages, + int prev_busy_level, u32 *missed_ppm) +{ + u64 vrate = ioc->vtime_base_rate; + u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; + + if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { + if (ioc->busy_level != prev_busy_level || nr_lagging) + trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), + missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + return; + } + + /* rq_wait signal is always reliable, ignore user vrate_min */ + if (rq_wait_pct > RQ_WAIT_BUSY_PCT) + vrate_min = VRATE_MIN; + + /* + * If vrate is out of bounds, apply clamp gradually as the + * bounds can change abruptly. Otherwise, apply busy_level + * based adjustment. + */ + if (vrate < vrate_min) { + vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100); + vrate = min(vrate, vrate_min); + } else if (vrate > vrate_max) { + vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); + vrate = max(vrate, vrate_max); + } else { + int idx = min_t(int, abs(ioc->busy_level), + ARRAY_SIZE(vrate_adj_pct) - 1); + u32 adj_pct = vrate_adj_pct[idx]; + + if (ioc->busy_level > 0) + adj_pct = 100 - adj_pct; + else + adj_pct = 100 + adj_pct; + + vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), + vrate_min, vrate_max); + } + + trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + ioc->vtime_base_rate = vrate; + ioc_refresh_margins(ioc); +} + /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { @@ -1046,7 +1096,7 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, /* * The delta between inuse and active sums indicates that - * that much of weight is being given away. Parent's inuse + * much of weight is being given away. Parent's inuse * and active should reflect the ratio. */ if (parent->child_active_sum) { @@ -2071,40 +2121,21 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, } } -static void ioc_timer_fn(struct timer_list *timer) +/* + * Check the active iocgs' state to avoid oversleeping and deactive + * idle iocgs. + * + * Since waiters determine the sleep durations based on the vrate + * they saw at the time of sleep, if vrate has increased, some + * waiters could be sleeping for too long. Wake up tardy waiters + * which should have woken up in the last period and expire idle + * iocgs. + */ +static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) { - struct ioc *ioc = container_of(timer, struct ioc, timer); + int nr_debtors = 0; struct ioc_gq *iocg, *tiocg; - struct ioc_now now; - LIST_HEAD(surpluses); - int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0; - u64 usage_us_sum = 0; - u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; - u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; - u32 missed_ppm[2], rq_wait_pct; - u64 period_vtime; - int prev_busy_level; - - /* how were the latencies during the period? */ - ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); - - /* take care of active iocgs */ - spin_lock_irq(&ioc->lock); - - ioc_now(ioc, &now); - period_vtime = now.vnow - ioc->period_at_vtime; - if (WARN_ON_ONCE(!period_vtime)) { - spin_unlock_irq(&ioc->lock); - return; - } - - /* - * Waiters determine the sleep durations based on the vrate they - * saw at the time of sleep. If vrate has increased, some waiters - * could be sleeping for too long. Wake up tardy waiters which - * should have woken up in the last period and expire idle iocgs. - */ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && !iocg->delay && !iocg_is_idle(iocg)) @@ -2114,24 +2145,24 @@ static void ioc_timer_fn(struct timer_list *timer) /* flush wait and indebt stat deltas */ if (iocg->wait_since) { - iocg->local_stat.wait_us += now.now - iocg->wait_since; - iocg->wait_since = now.now; + iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->wait_since = now->now; } if (iocg->indebt_since) { iocg->local_stat.indebt_us += - now.now - iocg->indebt_since; - iocg->indebt_since = now.now; + now->now - iocg->indebt_since; + iocg->indebt_since = now->now; } if (iocg->indelay_since) { iocg->local_stat.indelay_us += - now.now - iocg->indelay_since; - iocg->indelay_since = now.now; + now->now - iocg->indelay_since; + iocg->indelay_since = now->now; } if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ - iocg_kick_waitq(iocg, true, &now); + iocg_kick_waitq(iocg, true, now); if (iocg->abs_vdebt || iocg->delay) nr_debtors++; } else if (iocg_is_idle(iocg)) { @@ -2145,7 +2176,7 @@ static void ioc_timer_fn(struct timer_list *timer) * error and throw away. On reactivation, it'll start * with the target budget. */ - excess = now.vnow - vtime - ioc->margins.target; + excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { u32 old_hwi; @@ -2154,13 +2185,46 @@ static void ioc_timer_fn(struct timer_list *timer) WEIGHT_ONE); } - __propagate_weights(iocg, 0, 0, false, &now); + __propagate_weights(iocg, 0, 0, false, now); list_del_init(&iocg->active_list); } spin_unlock(&iocg->waitq.lock); } + commit_weights(ioc); + return nr_debtors; +} + +static void ioc_timer_fn(struct timer_list *timer) +{ + struct ioc *ioc = container_of(timer, struct ioc, timer); + struct ioc_gq *iocg, *tiocg; + struct ioc_now now; + LIST_HEAD(surpluses); + int nr_debtors, nr_shortages = 0, nr_lagging = 0; + u64 usage_us_sum = 0; + u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; + u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; + u32 missed_ppm[2], rq_wait_pct; + u64 period_vtime; + int prev_busy_level; + + /* how were the latencies during the period? */ + ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); + + /* take care of active iocgs */ + spin_lock_irq(&ioc->lock); + + ioc_now(ioc, &now); + + period_vtime = now.vnow - ioc->period_at_vtime; + if (WARN_ON_ONCE(!period_vtime)) { + spin_unlock_irq(&ioc->lock); + return; + } + + nr_debtors = ioc_check_iocgs(ioc, &now); /* * Wait and indebt stat are flushed above and the donation calculation @@ -2170,8 +2234,8 @@ static void ioc_timer_fn(struct timer_list *timer) /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, usage_us, usage_dur; - u32 usage, hw_active, hw_inuse; + u64 vdone, vtime, usage_us; + u32 hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent @@ -2202,30 +2266,32 @@ static void ioc_timer_fn(struct timer_list *timer) usage_us = iocg->usage_delta_us; usage_us_sum += usage_us; - if (vdone != vtime) { - u64 inflight_us = DIV64_U64_ROUND_UP( - cost_to_abs_cost(vtime - vdone, hw_inuse), - ioc->vtime_base_rate); - usage_us = max(usage_us, inflight_us); - } - - /* convert to hweight based usage ratio */ - if (time_after64(iocg->activated_at, ioc->period_at)) - usage_dur = max_t(u64, now.now - iocg->activated_at, 1); - else - usage_dur = max_t(u64, now.now - ioc->period_at, 1); - - usage = clamp_t(u32, - DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, - usage_dur), - 1, WEIGHT_ONE); - /* see whether there's surplus vtime */ WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.low))) { - u32 hwa, old_hwi, hwm, new_hwi; + u32 hwa, old_hwi, hwm, new_hwi, usage; + u64 usage_dur; + + if (vdone != vtime) { + u64 inflight_us = DIV64_U64_ROUND_UP( + cost_to_abs_cost(vtime - vdone, hw_inuse), + ioc->vtime_base_rate); + + usage_us = max(usage_us, inflight_us); + } + + /* convert to hweight based usage ratio */ + if (time_after64(iocg->activated_at, ioc->period_at)) + usage_dur = max_t(u64, now.now - iocg->activated_at, 1); + else + usage_dur = max_t(u64, now.now - ioc->period_at, 1); + + usage = clamp_t(u32, + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, + usage_dur), + 1, WEIGHT_ONE); /* * Already donating or accumulated enough to start. @@ -2309,51 +2375,8 @@ static void ioc_timer_fn(struct timer_list *timer) ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); - if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { - u64 vrate = ioc->vtime_base_rate; - u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; - - /* rq_wait signal is always reliable, ignore user vrate_min */ - if (rq_wait_pct > RQ_WAIT_BUSY_PCT) - vrate_min = VRATE_MIN; - - /* - * If vrate is out of bounds, apply clamp gradually as the - * bounds can change abruptly. Otherwise, apply busy_level - * based adjustment. - */ - if (vrate < vrate_min) { - vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), - 100); - vrate = min(vrate, vrate_min); - } else if (vrate > vrate_max) { - vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), - 100); - vrate = max(vrate, vrate_max); - } else { - int idx = min_t(int, abs(ioc->busy_level), - ARRAY_SIZE(vrate_adj_pct) - 1); - u32 adj_pct = vrate_adj_pct[idx]; - - if (ioc->busy_level > 0) - adj_pct = 100 - adj_pct; - else - adj_pct = 100 + adj_pct; - - vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), - vrate_min, vrate_max); - } - - trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, - nr_lagging, nr_shortages); - - ioc->vtime_base_rate = vrate; - ioc_refresh_margins(ioc); - } else if (ioc->busy_level != prev_busy_level || nr_lagging) { - trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), - missed_ppm, rq_wait_pct, nr_lagging, - nr_shortages); - } + ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages, + prev_busy_level, missed_ppm); ioc_refresh_params(ioc, false); @@ -2400,7 +2423,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, return cost; /* - * We only increase inuse during period and do so iff the margin has + * We only increase inuse during period and do so if the margin has * deteriorated since the previous adjustment. */ if (margin >= iocg->saved_margin || margin >= margins->low || @@ -3120,23 +3143,23 @@ static const match_table_t qos_tokens = { static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct gendisk *disk; + struct block_device *bdev; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; char *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(bdev->bd_disk->queue); if (ret) goto err; - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); } spin_lock_irq(&ioc->lock); @@ -3231,12 +3254,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return nbytes; einval: ret = -EINVAL; err: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return ret; } @@ -3287,23 +3310,23 @@ static const match_table_t i_lcoef_tokens = { static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct gendisk *disk; + struct block_device *bdev; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; char *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(bdev->bd_disk->queue); if (ret) goto err; - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); } spin_lock_irq(&ioc->lock); @@ -3356,13 +3379,13 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return nbytes; einval: ret = -EINVAL; err: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return ret; } |