summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/bfq-cgroup.c116
-rw-r--r--block/bfq-iosched.c2
-rw-r--r--block/bfq-iosched.h23
-rw-r--r--block/bio-integrity.c3
-rw-r--r--block/blk-mq.c25
-rw-r--r--block/blk-sysfs.c34
-rw-r--r--block/blk-throttle.c22
7 files changed, 165 insertions, 60 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index c8a32fb345cf..78b2e0db4fb2 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -52,7 +52,7 @@ BFQG_FLAG_FNS(idling)
BFQG_FLAG_FNS(empty)
#undef BFQG_FLAG_FNS
-/* This should be called with the queue_lock held. */
+/* This should be called with the scheduler lock held. */
static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
{
unsigned long long now;
@@ -67,7 +67,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
bfqg_stats_clear_waiting(stats);
}
-/* This should be called with the queue_lock held. */
+/* This should be called with the scheduler lock held. */
static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
struct bfq_group *curr_bfqg)
{
@@ -81,7 +81,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
bfqg_stats_mark_waiting(stats);
}
-/* This should be called with the queue_lock held. */
+/* This should be called with the scheduler lock held. */
static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
{
unsigned long long now;
@@ -203,12 +203,30 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
static void bfqg_get(struct bfq_group *bfqg)
{
- return blkg_get(bfqg_to_blkg(bfqg));
+ bfqg->ref++;
}
void bfqg_put(struct bfq_group *bfqg)
{
- return blkg_put(bfqg_to_blkg(bfqg));
+ bfqg->ref--;
+
+ if (bfqg->ref == 0)
+ kfree(bfqg);
+}
+
+static void bfqg_and_blkg_get(struct bfq_group *bfqg)
+{
+ /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */
+ bfqg_get(bfqg);
+
+ blkg_get(bfqg_to_blkg(bfqg));
+}
+
+void bfqg_and_blkg_put(struct bfq_group *bfqg)
+{
+ bfqg_put(bfqg);
+
+ blkg_put(bfqg_to_blkg(bfqg));
}
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
@@ -312,7 +330,11 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
if (bfqq) {
bfqq->ioprio = bfqq->new_ioprio;
bfqq->ioprio_class = bfqq->new_ioprio_class;
- bfqg_get(bfqg);
+ /*
+ * Make sure that bfqg and its associated blkg do not
+ * disappear before entity.
+ */
+ bfqg_and_blkg_get(bfqg);
}
entity->parent = bfqg->my_entity; /* NULL for root group */
entity->sched_data = &bfqg->sched_data;
@@ -399,6 +421,8 @@ struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
return NULL;
}
+ /* see comments in bfq_bic_update_cgroup for why refcounting */
+ bfqg_get(bfqg);
return &bfqg->pd;
}
@@ -426,7 +450,7 @@ void bfq_pd_free(struct blkg_policy_data *pd)
struct bfq_group *bfqg = pd_to_bfqg(pd);
bfqg_stats_exit(&bfqg->stats);
- return kfree(bfqg);
+ bfqg_put(bfqg);
}
void bfq_pd_reset_stats(struct blkg_policy_data *pd)
@@ -496,9 +520,10 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
* Move @bfqq to @bfqg, deactivating it from its old group and reactivating
* it on the new one. Avoid putting the entity on the old group idle tree.
*
- * Must be called under the queue lock; the cgroup owning @bfqg must
- * not disappear (by now this just means that we are called under
- * rcu_read_lock()).
+ * Must be called under the scheduler lock, to make sure that the blkg
+ * owning @bfqg does not disappear (see comments in
+ * bfq_bic_update_cgroup on guaranteeing the consistency of blkg
+ * objects).
*/
void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_group *bfqg)
@@ -519,16 +544,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_deactivate_bfqq(bfqd, bfqq, false, false);
else if (entity->on_st)
bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
- bfqg_put(bfqq_group(bfqq));
+ bfqg_and_blkg_put(bfqq_group(bfqq));
- /*
- * Here we use a reference to bfqg. We don't need a refcounter
- * as the cgroup reference will not be dropped, so that its
- * destroy() callback will not be invoked.
- */
entity->parent = bfqg->my_entity;
entity->sched_data = &bfqg->sched_data;
- bfqg_get(bfqg);
+ /* pin down bfqg and its associated blkg */
+ bfqg_and_blkg_get(bfqg);
if (bfq_bfqq_busy(bfqq)) {
bfq_pos_tree_add_move(bfqd, bfqq);
@@ -545,8 +566,9 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* @bic: the bic to move.
* @blkcg: the blk-cgroup to move to.
*
- * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
- * has to make sure that the reference to cgroup is valid across the call.
+ * Move bic to blkcg, assuming that bfqd->lock is held, which guarantees
+ * that the reference to cgroup is valid across the call (see
+ * comments in bfq_bic_update_cgroup on this issue).
*
* NOTE: an alternative approach might have been to store the current
* cgroup in bfqq and getting a reference to it, reducing the lookup
@@ -604,6 +626,57 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
goto out;
bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+ /*
+ * Update blkg_path for bfq_log_* functions. We cache this
+ * path, and update it here, for the following
+ * reasons. Operations on blkg objects in blk-cgroup are
+ * protected with the request_queue lock, and not with the
+ * lock that protects the instances of this scheduler
+ * (bfqd->lock). This exposes BFQ to the following sort of
+ * race.
+ *
+ * The blkg_lookup performed in bfq_get_queue, protected
+ * through rcu, may happen to return the address of a copy of
+ * the original blkg. If this is the case, then the
+ * bfqg_and_blkg_get performed in bfq_get_queue, to pin down
+ * the blkg, is useless: it does not prevent blk-cgroup code
+ * from destroying both the original blkg and all objects
+ * directly or indirectly referred by the copy of the
+ * blkg.
+ *
+ * On the bright side, destroy operations on a blkg invoke, as
+ * a first step, hooks of the scheduler associated with the
+ * blkg. And these hooks are executed with bfqd->lock held for
+ * BFQ. As a consequence, for any blkg associated with the
+ * request queue this instance of the scheduler is attached
+ * to, we are guaranteed that such a blkg is not destroyed, and
+ * that all the pointers it contains are consistent, while we
+ * are holding bfqd->lock. A blkg_lookup performed with
+ * bfqd->lock held then returns a fully consistent blkg, which
+ * remains consistent for as long as this lock is held.
+ *
+ * Thanks to the last fact, and to the fact that: (1) bfqg has
+ * been obtained through a blkg_lookup in the above
+ * assignment, and (2) bfqd->lock is being held, here we can
+ * safely use the policy data for the involved blkg (i.e., the
+ * field bfqg->pd) to get to the blkg associated with bfqg,
+ * and then we can safely use any field of blkg. After we
+ * release bfqd->lock, even just getting blkg through this
+ * bfqg may cause dangling references to be traversed, as
+ * bfqg->pd may not exist any more.
+ *
+ * In view of the above facts, here we cache, in the bfqg, any
+ * blkg data we may need for this bic, and for its associated
+ * bfq_queue. As of now, we need to cache only the path of the
+ * blkg, which is used in the bfq_log_* functions.
+ *
+ * Finally, note that bfqg itself needs to be protected from
+ * destruction on the blkg_free of the original blkg (which
+ * invokes bfq_pd_free). We use an additional private
+ * refcounter for bfqg, to let it disappear only after no
+ * bfq_queue refers to it any longer.
+ */
+ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path));
bic->blkcg_serial_nr = serial_nr;
out:
rcu_read_unlock();
@@ -640,8 +713,6 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
* @bfqd: the device data structure with the root group.
* @bfqg: the group to move from.
* @st: the service tree with the entities.
- *
- * Needs queue_lock to be taken and reference to be valid over the call.
*/
static void bfq_reparent_active_entities(struct bfq_data *bfqd,
struct bfq_group *bfqg,
@@ -692,8 +763,7 @@ void bfq_pd_offline(struct blkg_policy_data *pd)
/*
* The idle tree may still contain bfq_queues belonging
* to exited task because they never migrated to a different
- * cgroup from the one being destroyed now. No one else
- * can access them so it's safe to act without any lock.
+ * cgroup from the one being destroyed now.
*/
bfq_flush_idle_tree(st);
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 08ce45096350..ed93da2462ab 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -3665,7 +3665,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
- bfqg_put(bfqg);
+ bfqg_and_blkg_put(bfqg);
#endif
}
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index ae783c06dfd9..5c3bf9861492 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -759,6 +759,12 @@ struct bfq_group {
/* must be the first member */
struct blkg_policy_data pd;
+ /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */
+ char blkg_path[128];
+
+ /* reference counter (see comments in bfq_bic_update_cgroup) */
+ int ref;
+
struct bfq_entity entity;
struct bfq_sched_data sched_data;
@@ -838,7 +844,7 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
-void bfqg_put(struct bfq_group *bfqg);
+void bfqg_and_blkg_put(struct bfq_group *bfqg);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
extern struct cftype bfq_blkcg_legacy_files[];
@@ -910,20 +916,13 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
- char __pbuf[128]; \
- \
- blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
- blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid,\
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
- __pbuf, ##args); \
+ bfqq_group(bfqq)->blkg_path, ##args); \
} while (0)
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
- char __pbuf[128]; \
- \
- blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
- blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
-} while (0)
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) \
+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, (bfqg)->blkg_path, ##args)
#else /* CONFIG_BFQ_GROUP_IOSCHED */
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5384713d48bc..b5009a896a7f 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -175,6 +175,9 @@ bool bio_integrity_enabled(struct bio *bio)
if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE)
return false;
+ if (!bio_sectors(bio))
+ return false;
+
/* Already protected? */
if (bio_integrity(bio))
return false;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1bcccedcc74f..bb66c96850b1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1461,22 +1461,28 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
}
-static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
- bool may_sleep)
+static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+ struct request *rq,
+ blk_qc_t *cookie, bool may_sleep)
{
struct request_queue *q = rq->q;
struct blk_mq_queue_data bd = {
.rq = rq,
.last = true,
};
- struct blk_mq_hw_ctx *hctx;
blk_qc_t new_cookie;
int ret;
+ bool run_queue = true;
+
+ if (blk_mq_hctx_stopped(hctx)) {
+ run_queue = false;
+ goto insert;
+ }
if (q->elevator)
goto insert;
- if (!blk_mq_get_driver_tag(rq, &hctx, false))
+ if (!blk_mq_get_driver_tag(rq, NULL, false))
goto insert;
new_cookie = request_to_qc_t(hctx, rq);
@@ -1500,7 +1506,7 @@ static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
__blk_mq_requeue_request(rq);
insert:
- blk_mq_sched_insert_request(rq, false, true, false, may_sleep);
+ blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
}
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
@@ -1508,7 +1514,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
{
if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
rcu_read_lock();
- __blk_mq_try_issue_directly(rq, cookie, false);
+ __blk_mq_try_issue_directly(hctx, rq, cookie, false);
rcu_read_unlock();
} else {
unsigned int srcu_idx;
@@ -1516,7 +1522,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
might_sleep();
srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
- __blk_mq_try_issue_directly(rq, cookie, true);
+ __blk_mq_try_issue_directly(hctx, rq, cookie, true);
srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
}
}
@@ -1619,9 +1625,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_put_ctx(data.ctx);
- if (same_queue_rq)
+ if (same_queue_rq) {
+ data.hctx = blk_mq_map_queue(q,
+ same_queue_rq->mq_ctx->cpu);
blk_mq_try_issue_directly(data.hctx, same_queue_rq,
&cookie);
+ }
} else if (q->nr_hw_queues > 1 && is_sync) {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 283da7fbe034..27aceab1cc31 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -777,24 +777,25 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
}
/**
- * blk_release_queue: - release a &struct request_queue when it is no longer needed
- * @kobj: the kobj belonging to the request queue to be released
+ * __blk_release_queue - release a request queue when it is no longer needed
+ * @work: pointer to the release_work member of the request queue to be released
*
* Description:
- * blk_release_queue is the pair to blk_init_queue() or
- * blk_queue_make_request(). It should be called when a request queue is
- * being released; typically when a block device is being de-registered.
- * Currently, its primary task it to free all the &struct request
- * structures that were allocated to the queue and the queue itself.
+ * blk_release_queue is the counterpart of blk_init_queue(). It should be
+ * called when a request queue is being released; typically when a block
+ * device is being de-registered. Its primary task is to free the queue
+ * itself.
*
- * Note:
+ * Notes:
* The low level driver must have finished any outstanding requests first
* via blk_cleanup_queue().
- **/
-static void blk_release_queue(struct kobject *kobj)
+ *
+ * Although blk_release_queue() may be called with preemption disabled,
+ * __blk_release_queue() may sleep.
+ */
+static void __blk_release_queue(struct work_struct *work)
{
- struct request_queue *q =
- container_of(kobj, struct request_queue, kobj);
+ struct request_queue *q = container_of(work, typeof(*q), release_work);
if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
blk_stat_remove_callback(q, q->poll_cb);
@@ -834,6 +835,15 @@ static void blk_release_queue(struct kobject *kobj)
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
+static void blk_release_queue(struct kobject *kobj)
+{
+ struct request_queue *q =
+ container_of(kobj, struct request_queue, kobj);
+
+ INIT_WORK(&q->release_work, __blk_release_queue);
+ schedule_work(&q->release_work);
+}
+
static const struct sysfs_ops queue_sysfs_ops = {
.show = queue_attr_show,
.store = queue_attr_store,
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index fc13dd0c6e39..a7285bf2831c 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -27,6 +27,13 @@ static int throtl_quantum = 32;
#define MIN_THROTL_IOPS (10)
#define DFL_LATENCY_TARGET (-1L)
#define DFL_IDLE_THRESHOLD (0)
+#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
+#define LATENCY_FILTERED_SSD (0)
+/*
+ * For HD, very small latency comes from sequential IO. Such IO does not help
+ * determine whether IO is being impacted by others, hence we ignore it.
+ */
+#define LATENCY_FILTERED_HD (1000L) /* 1ms */
#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
@@ -212,6 +219,7 @@ struct throtl_data
struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
struct latency_bucket __percpu *latency_buckets;
unsigned long last_calculate_time;
+ unsigned long filtered_latency;
bool track_bio_latency;
};
@@ -698,7 +706,7 @@ static void throtl_dequeue_tg(struct throtl_grp *tg)
static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
unsigned long expires)
{
- unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice;
+ unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice;
/*
* Since we are adjusting the throttle limit dynamically, the sleep
@@ -2281,7 +2289,7 @@ void blk_throtl_bio_endio(struct bio *bio)
throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
bio_op(bio), lat);
- if (tg->latency_target) {
+ if (tg->latency_target && lat >= tg->td->filtered_latency) {
int bucket;
unsigned int threshold;
@@ -2417,14 +2425,20 @@ void blk_throtl_exit(struct request_queue *q)
void blk_throtl_register_queue(struct request_queue *q)
{
struct throtl_data *td;
+ int i;
td = q->td;
BUG_ON(!td);
- if (blk_queue_nonrot(q))
+ if (blk_queue_nonrot(q)) {
td->throtl_slice = DFL_THROTL_SLICE_SSD;
- else
+ td->filtered_latency = LATENCY_FILTERED_SSD;
+ } else {
td->throtl_slice = DFL_THROTL_SLICE_HD;
+ td->filtered_latency = LATENCY_FILTERED_HD;
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
+ td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
+ }
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
/* if no low limit, use previous default */
td->throtl_slice = DFL_THROTL_SLICE_HD;