summaryrefslogtreecommitdiff
path: root/block/blk-iocost.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/blk-iocost.c')
-rw-r--r--block/blk-iocost.c287
1 files changed, 155 insertions, 132 deletions
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index bbe86d1199dc..ffa418c0dcb1 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -39,7 +39,7 @@
* On top of that, a size cost proportional to the length of the IO is
* added. While simple, this model captures the operational
* characteristics of a wide varienty of devices well enough. Default
- * paramters for several different classes of devices are provided and the
+ * parameters for several different classes of devices are provided and the
* parameters can be configured from userspace via
* /sys/fs/cgroup/io.cost.model.
*
@@ -77,7 +77,7 @@
*
* This constitutes the basis of IO capacity distribution. Each cgroup's
* vtime is running at a rate determined by its hweight. A cgroup tracks
- * the vtime consumed by past IOs and can issue a new IO iff doing so
+ * the vtime consumed by past IOs and can issue a new IO if doing so
* wouldn't outrun the current device vtime. Otherwise, the IO is
* suspended until the vtime has progressed enough to cover it.
*
@@ -155,7 +155,7 @@
* Instead of debugfs or other clumsy monitoring mechanisms, this
* controller uses a drgn based monitoring script -
* tools/cgroup/iocost_monitor.py. For details on drgn, please see
- * https://github.com/osandov/drgn. The ouput looks like the following.
+ * https://github.com/osandov/drgn. The output looks like the following.
*
* sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
* active weight hweight% inflt% dbt delay usages%
@@ -370,8 +370,6 @@ enum {
AUTOP_SSD_FAST,
};
-struct ioc_gq;
-
struct ioc_params {
u32 qos[NR_QOS_PARAMS];
u64 i_lcoefs[NR_I_LCOEFS];
@@ -492,7 +490,7 @@ struct ioc_gq {
/*
* `vtime` is this iocg's vtime cursor which progresses as IOs are
* issued. If lagging behind device vtime, the delta represents
- * the currently available IO budget. If runnning ahead, the
+ * the currently available IO budget. If running ahead, the
* overage.
*
* `vtime_done` is the same but progressed on completion rather
@@ -973,6 +971,58 @@ done:
ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
}
+static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
+ int nr_lagging, int nr_shortages,
+ int prev_busy_level, u32 *missed_ppm)
+{
+ u64 vrate = ioc->vtime_base_rate;
+ u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
+
+ if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
+ if (ioc->busy_level != prev_busy_level || nr_lagging)
+ trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
+ missed_ppm, rq_wait_pct,
+ nr_lagging, nr_shortages);
+
+ return;
+ }
+
+ /* rq_wait signal is always reliable, ignore user vrate_min */
+ if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
+ vrate_min = VRATE_MIN;
+
+ /*
+ * If vrate is out of bounds, apply clamp gradually as the
+ * bounds can change abruptly. Otherwise, apply busy_level
+ * based adjustment.
+ */
+ if (vrate < vrate_min) {
+ vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
+ vrate = min(vrate, vrate_min);
+ } else if (vrate > vrate_max) {
+ vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
+ vrate = max(vrate, vrate_max);
+ } else {
+ int idx = min_t(int, abs(ioc->busy_level),
+ ARRAY_SIZE(vrate_adj_pct) - 1);
+ u32 adj_pct = vrate_adj_pct[idx];
+
+ if (ioc->busy_level > 0)
+ adj_pct = 100 - adj_pct;
+ else
+ adj_pct = 100 + adj_pct;
+
+ vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
+ vrate_min, vrate_max);
+ }
+
+ trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
+ nr_lagging, nr_shortages);
+
+ ioc->vtime_base_rate = vrate;
+ ioc_refresh_margins(ioc);
+}
+
/* take a snapshot of the current [v]time and vrate */
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{
@@ -1046,7 +1096,7 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
/*
* The delta between inuse and active sums indicates that
- * that much of weight is being given away. Parent's inuse
+ * much of weight is being given away. Parent's inuse
* and active should reflect the ratio.
*/
if (parent->child_active_sum) {
@@ -2071,40 +2121,21 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
}
}
-static void ioc_timer_fn(struct timer_list *timer)
+/*
+ * Check the active iocgs' state to avoid oversleeping and deactive
+ * idle iocgs.
+ *
+ * Since waiters determine the sleep durations based on the vrate
+ * they saw at the time of sleep, if vrate has increased, some
+ * waiters could be sleeping for too long. Wake up tardy waiters
+ * which should have woken up in the last period and expire idle
+ * iocgs.
+ */
+static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
{
- struct ioc *ioc = container_of(timer, struct ioc, timer);
+ int nr_debtors = 0;
struct ioc_gq *iocg, *tiocg;
- struct ioc_now now;
- LIST_HEAD(surpluses);
- int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0;
- u64 usage_us_sum = 0;
- u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
- u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
- u32 missed_ppm[2], rq_wait_pct;
- u64 period_vtime;
- int prev_busy_level;
-
- /* how were the latencies during the period? */
- ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
-
- /* take care of active iocgs */
- spin_lock_irq(&ioc->lock);
-
- ioc_now(ioc, &now);
- period_vtime = now.vnow - ioc->period_at_vtime;
- if (WARN_ON_ONCE(!period_vtime)) {
- spin_unlock_irq(&ioc->lock);
- return;
- }
-
- /*
- * Waiters determine the sleep durations based on the vrate they
- * saw at the time of sleep. If vrate has increased, some waiters
- * could be sleeping for too long. Wake up tardy waiters which
- * should have woken up in the last period and expire idle iocgs.
- */
list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
!iocg->delay && !iocg_is_idle(iocg))
@@ -2114,24 +2145,24 @@ static void ioc_timer_fn(struct timer_list *timer)
/* flush wait and indebt stat deltas */
if (iocg->wait_since) {
- iocg->local_stat.wait_us += now.now - iocg->wait_since;
- iocg->wait_since = now.now;
+ iocg->local_stat.wait_us += now->now - iocg->wait_since;
+ iocg->wait_since = now->now;
}
if (iocg->indebt_since) {
iocg->local_stat.indebt_us +=
- now.now - iocg->indebt_since;
- iocg->indebt_since = now.now;
+ now->now - iocg->indebt_since;
+ iocg->indebt_since = now->now;
}
if (iocg->indelay_since) {
iocg->local_stat.indelay_us +=
- now.now - iocg->indelay_since;
- iocg->indelay_since = now.now;
+ now->now - iocg->indelay_since;
+ iocg->indelay_since = now->now;
}
if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
iocg->delay) {
/* might be oversleeping vtime / hweight changes, kick */
- iocg_kick_waitq(iocg, true, &now);
+ iocg_kick_waitq(iocg, true, now);
if (iocg->abs_vdebt || iocg->delay)
nr_debtors++;
} else if (iocg_is_idle(iocg)) {
@@ -2145,7 +2176,7 @@ static void ioc_timer_fn(struct timer_list *timer)
* error and throw away. On reactivation, it'll start
* with the target budget.
*/
- excess = now.vnow - vtime - ioc->margins.target;
+ excess = now->vnow - vtime - ioc->margins.target;
if (excess > 0) {
u32 old_hwi;
@@ -2154,13 +2185,46 @@ static void ioc_timer_fn(struct timer_list *timer)
WEIGHT_ONE);
}
- __propagate_weights(iocg, 0, 0, false, &now);
+ __propagate_weights(iocg, 0, 0, false, now);
list_del_init(&iocg->active_list);
}
spin_unlock(&iocg->waitq.lock);
}
+
commit_weights(ioc);
+ return nr_debtors;
+}
+
+static void ioc_timer_fn(struct timer_list *timer)
+{
+ struct ioc *ioc = container_of(timer, struct ioc, timer);
+ struct ioc_gq *iocg, *tiocg;
+ struct ioc_now now;
+ LIST_HEAD(surpluses);
+ int nr_debtors, nr_shortages = 0, nr_lagging = 0;
+ u64 usage_us_sum = 0;
+ u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
+ u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
+ u32 missed_ppm[2], rq_wait_pct;
+ u64 period_vtime;
+ int prev_busy_level;
+
+ /* how were the latencies during the period? */
+ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
+
+ /* take care of active iocgs */
+ spin_lock_irq(&ioc->lock);
+
+ ioc_now(ioc, &now);
+
+ period_vtime = now.vnow - ioc->period_at_vtime;
+ if (WARN_ON_ONCE(!period_vtime)) {
+ spin_unlock_irq(&ioc->lock);
+ return;
+ }
+
+ nr_debtors = ioc_check_iocgs(ioc, &now);
/*
* Wait and indebt stat are flushed above and the donation calculation
@@ -2170,8 +2234,8 @@ static void ioc_timer_fn(struct timer_list *timer)
/* calc usage and see whether some weights need to be moved around */
list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
- u64 vdone, vtime, usage_us, usage_dur;
- u32 usage, hw_active, hw_inuse;
+ u64 vdone, vtime, usage_us;
+ u32 hw_active, hw_inuse;
/*
* Collect unused and wind vtime closer to vnow to prevent
@@ -2202,30 +2266,32 @@ static void ioc_timer_fn(struct timer_list *timer)
usage_us = iocg->usage_delta_us;
usage_us_sum += usage_us;
- if (vdone != vtime) {
- u64 inflight_us = DIV64_U64_ROUND_UP(
- cost_to_abs_cost(vtime - vdone, hw_inuse),
- ioc->vtime_base_rate);
- usage_us = max(usage_us, inflight_us);
- }
-
- /* convert to hweight based usage ratio */
- if (time_after64(iocg->activated_at, ioc->period_at))
- usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
- else
- usage_dur = max_t(u64, now.now - ioc->period_at, 1);
-
- usage = clamp_t(u32,
- DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
- usage_dur),
- 1, WEIGHT_ONE);
-
/* see whether there's surplus vtime */
WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
if (hw_inuse < hw_active ||
(!waitqueue_active(&iocg->waitq) &&
time_before64(vtime, now.vnow - ioc->margins.low))) {
- u32 hwa, old_hwi, hwm, new_hwi;
+ u32 hwa, old_hwi, hwm, new_hwi, usage;
+ u64 usage_dur;
+
+ if (vdone != vtime) {
+ u64 inflight_us = DIV64_U64_ROUND_UP(
+ cost_to_abs_cost(vtime - vdone, hw_inuse),
+ ioc->vtime_base_rate);
+
+ usage_us = max(usage_us, inflight_us);
+ }
+
+ /* convert to hweight based usage ratio */
+ if (time_after64(iocg->activated_at, ioc->period_at))
+ usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
+ else
+ usage_dur = max_t(u64, now.now - ioc->period_at, 1);
+
+ usage = clamp_t(u32,
+ DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
+ usage_dur),
+ 1, WEIGHT_ONE);
/*
* Already donating or accumulated enough to start.
@@ -2309,51 +2375,8 @@ static void ioc_timer_fn(struct timer_list *timer)
ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
- if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
- u64 vrate = ioc->vtime_base_rate;
- u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
-
- /* rq_wait signal is always reliable, ignore user vrate_min */
- if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
- vrate_min = VRATE_MIN;
-
- /*
- * If vrate is out of bounds, apply clamp gradually as the
- * bounds can change abruptly. Otherwise, apply busy_level
- * based adjustment.
- */
- if (vrate < vrate_min) {
- vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
- 100);
- vrate = min(vrate, vrate_min);
- } else if (vrate > vrate_max) {
- vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
- 100);
- vrate = max(vrate, vrate_max);
- } else {
- int idx = min_t(int, abs(ioc->busy_level),
- ARRAY_SIZE(vrate_adj_pct) - 1);
- u32 adj_pct = vrate_adj_pct[idx];
-
- if (ioc->busy_level > 0)
- adj_pct = 100 - adj_pct;
- else
- adj_pct = 100 + adj_pct;
-
- vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
- vrate_min, vrate_max);
- }
-
- trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
- nr_lagging, nr_shortages);
-
- ioc->vtime_base_rate = vrate;
- ioc_refresh_margins(ioc);
- } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
- trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
- missed_ppm, rq_wait_pct, nr_lagging,
- nr_shortages);
- }
+ ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
+ prev_busy_level, missed_ppm);
ioc_refresh_params(ioc, false);
@@ -2400,7 +2423,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
return cost;
/*
- * We only increase inuse during period and do so iff the margin has
+ * We only increase inuse during period and do so if the margin has
* deteriorated since the previous adjustment.
*/
if (margin >= iocg->saved_margin || margin >= margins->low ||
@@ -3120,23 +3143,23 @@ static const match_table_t qos_tokens = {
static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
size_t nbytes, loff_t off)
{
- struct gendisk *disk;
+ struct block_device *bdev;
struct ioc *ioc;
u32 qos[NR_QOS_PARAMS];
bool enable, user;
char *p;
int ret;
- disk = blkcg_conf_get_disk(&input);
- if (IS_ERR(disk))
- return PTR_ERR(disk);
+ bdev = blkcg_conf_open_bdev(&input);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
- ioc = q_to_ioc(disk->queue);
+ ioc = q_to_ioc(bdev->bd_disk->queue);
if (!ioc) {
- ret = blk_iocost_init(disk->queue);
+ ret = blk_iocost_init(bdev->bd_disk->queue);
if (ret)
goto err;
- ioc = q_to_ioc(disk->queue);
+ ioc = q_to_ioc(bdev->bd_disk->queue);
}
spin_lock_irq(&ioc->lock);
@@ -3231,12 +3254,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
ioc_refresh_params(ioc, true);
spin_unlock_irq(&ioc->lock);
- put_disk_and_module(disk);
+ blkdev_put_no_open(bdev);
return nbytes;
einval:
ret = -EINVAL;
err:
- put_disk_and_module(disk);
+ blkdev_put_no_open(bdev);
return ret;
}
@@ -3287,23 +3310,23 @@ static const match_table_t i_lcoef_tokens = {
static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
size_t nbytes, loff_t off)
{
- struct gendisk *disk;
+ struct block_device *bdev;
struct ioc *ioc;
u64 u[NR_I_LCOEFS];
bool user;
char *p;
int ret;
- disk = blkcg_conf_get_disk(&input);
- if (IS_ERR(disk))
- return PTR_ERR(disk);
+ bdev = blkcg_conf_open_bdev(&input);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
- ioc = q_to_ioc(disk->queue);
+ ioc = q_to_ioc(bdev->bd_disk->queue);
if (!ioc) {
- ret = blk_iocost_init(disk->queue);
+ ret = blk_iocost_init(bdev->bd_disk->queue);
if (ret)
goto err;
- ioc = q_to_ioc(disk->queue);
+ ioc = q_to_ioc(bdev->bd_disk->queue);
}
spin_lock_irq(&ioc->lock);
@@ -3356,13 +3379,13 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ioc_refresh_params(ioc, true);
spin_unlock_irq(&ioc->lock);
- put_disk_and_module(disk);
+ blkdev_put_no_open(bdev);
return nbytes;
einval:
ret = -EINVAL;
err:
- put_disk_and_module(disk);
+ blkdev_put_no_open(bdev);
return ret;
}