diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 643 |
1 files changed, 409 insertions, 234 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2efdb0d67460..2e38cfac5b1d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -58,11 +58,13 @@ #include <linux/sched/signal.h> #include <trace/events/block.h> +#include <linux/list_sort.h> #include "md.h" #include "raid5.h" #include "raid0.h" #include "bitmap.h" +#include "raid5-log.h" #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) @@ -156,17 +158,6 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh, return slot; } -static void return_io(struct bio_list *return_bi) -{ - struct bio *bi; - while ((bi = bio_list_pop(return_bi)) != NULL) { - bi->bi_iter.bi_size = 0; - trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), - bi, 0); - bio_endio(bi); - } -} - static void print_raid5_conf (struct r5conf *conf); static int stripe_operations_active(struct stripe_head *sh) @@ -176,6 +167,13 @@ static int stripe_operations_active(struct stripe_head *sh) test_bit(STRIPE_COMPUTE_RUN, &sh->state); } +static bool stripe_is_lowprio(struct stripe_head *sh) +{ + return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) || + test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) && + !test_bit(STRIPE_R5C_CACHING, &sh->state); +} + static void raid5_wakeup_stripe_thread(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; @@ -191,7 +189,10 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh) if (list_empty(&sh->lru)) { struct r5worker_group *group; group = conf->worker_groups + cpu_to_group(cpu); - list_add_tail(&sh->lru, &group->handle_list); + if (stripe_is_lowprio(sh)) + list_add_tail(&sh->lru, &group->loprio_list); + else + list_add_tail(&sh->lru, &group->handle_list); group->stripes_cnt++; sh->group = group; } @@ -254,7 +255,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, clear_bit(STRIPE_DELAYED, &sh->state); clear_bit(STRIPE_BIT_DELAY, &sh->state); if (conf->worker_cnt_per_group == 0) { - list_add_tail(&sh->lru, &conf->handle_list); + if (stripe_is_lowprio(sh)) + list_add_tail(&sh->lru, + &conf->loprio_list); + else + list_add_tail(&sh->lru, + &conf->handle_list); } else { raid5_wakeup_stripe_thread(sh); return; @@ -481,6 +487,7 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp) sh->dev[i].page = page; sh->dev[i].orig_page = page; } + return 0; } @@ -729,7 +736,7 @@ static bool stripe_can_batch(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; - if (conf->log) + if (conf->log || raid5_has_ppl(conf)) return false; return test_bit(STRIPE_BATCH_READY, &sh->state) && !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && @@ -863,41 +870,107 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) return 1; } -static void flush_deferred_bios(struct r5conf *conf) +static void dispatch_bio_list(struct bio_list *tmp) { - struct bio_list tmp; struct bio *bio; - if (!conf->batch_bio_dispatch || !conf->group_cnt) + while ((bio = bio_list_pop(tmp))) + generic_make_request(bio); +} + +static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b) +{ + const struct r5pending_data *da = list_entry(a, + struct r5pending_data, sibling); + const struct r5pending_data *db = list_entry(b, + struct r5pending_data, sibling); + if (da->sector > db->sector) + return 1; + if (da->sector < db->sector) + return -1; + return 0; +} + +static void dispatch_defer_bios(struct r5conf *conf, int target, + struct bio_list *list) +{ + struct r5pending_data *data; + struct list_head *first, *next = NULL; + int cnt = 0; + + if (conf->pending_data_cnt == 0) + return; + + list_sort(NULL, &conf->pending_list, cmp_stripe); + + first = conf->pending_list.next; + + /* temporarily move the head */ + if (conf->next_pending_data) + list_move_tail(&conf->pending_list, + &conf->next_pending_data->sibling); + + while (!list_empty(&conf->pending_list)) { + data = list_first_entry(&conf->pending_list, + struct r5pending_data, sibling); + if (&data->sibling == first) + first = data->sibling.next; + next = data->sibling.next; + + bio_list_merge(list, &data->bios); + list_move(&data->sibling, &conf->free_list); + cnt++; + if (cnt >= target) + break; + } + conf->pending_data_cnt -= cnt; + BUG_ON(conf->pending_data_cnt < 0 || cnt < target); + + if (next != &conf->pending_list) + conf->next_pending_data = list_entry(next, + struct r5pending_data, sibling); + else + conf->next_pending_data = NULL; + /* list isn't empty */ + if (first != &conf->pending_list) + list_move_tail(&conf->pending_list, first); +} + +static void flush_deferred_bios(struct r5conf *conf) +{ + struct bio_list tmp = BIO_EMPTY_LIST; + + if (conf->pending_data_cnt == 0) return; - bio_list_init(&tmp); spin_lock(&conf->pending_bios_lock); - bio_list_merge(&tmp, &conf->pending_bios); - bio_list_init(&conf->pending_bios); + dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp); + BUG_ON(conf->pending_data_cnt != 0); spin_unlock(&conf->pending_bios_lock); - while ((bio = bio_list_pop(&tmp))) - generic_make_request(bio); + dispatch_bio_list(&tmp); } -static void defer_bio_issue(struct r5conf *conf, struct bio *bio) +static void defer_issue_bios(struct r5conf *conf, sector_t sector, + struct bio_list *bios) { - /* - * change group_cnt will drain all bios, so this is safe - * - * A read generally means a read-modify-write, which usually means a - * randwrite, so we don't delay it - */ - if (!conf->batch_bio_dispatch || !conf->group_cnt || - bio_op(bio) == REQ_OP_READ) { - generic_make_request(bio); - return; - } + struct bio_list tmp = BIO_EMPTY_LIST; + struct r5pending_data *ent; + spin_lock(&conf->pending_bios_lock); - bio_list_add(&conf->pending_bios, bio); + ent = list_first_entry(&conf->free_list, struct r5pending_data, + sibling); + list_move_tail(&ent->sibling, &conf->pending_list); + ent->sector = sector; + bio_list_init(&ent->bios); + bio_list_merge(&ent->bios, bios); + conf->pending_data_cnt++; + if (conf->pending_data_cnt >= PENDING_IO_MAX) + dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp); + spin_unlock(&conf->pending_bios_lock); - md_wakeup_thread(conf->mddev->thread); + + dispatch_bio_list(&tmp); } static void @@ -910,21 +983,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) struct r5conf *conf = sh->raid_conf; int i, disks = sh->disks; struct stripe_head *head_sh = sh; + struct bio_list pending_bios = BIO_EMPTY_LIST; + bool should_defer; might_sleep(); - if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { - /* writing out phase */ - if (s->waiting_extra_page) - return; - if (r5l_write_stripe(conf->log, sh) == 0) - return; - } else { /* caching phase */ - if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) { - r5c_cache_data(conf->log, sh, s); - return; - } - } + if (log_stripe(sh, s) == 0) + return; + + should_defer = conf->batch_bio_dispatch && conf->group_cnt; for (i = disks; i--; ) { int op, op_flags = 0; @@ -1080,7 +1147,10 @@ again: trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), bi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - defer_bio_issue(conf, bi); + if (should_defer && op_is_write(op)) + bio_list_add(&pending_bios, bi); + else + generic_make_request(bi); } if (rrdev) { if (s->syncing || s->expanding || s->expanded @@ -1125,7 +1195,10 @@ again: trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), rbi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - defer_bio_issue(conf, rbi); + if (should_defer && op_is_write(op)) + bio_list_add(&pending_bios, rbi); + else + generic_make_request(rbi); } if (!rdev && !rrdev) { if (op_is_write(op)) @@ -1143,6 +1216,9 @@ again: if (sh != head_sh) goto again; } + + if (should_defer && !bio_list_empty(&pending_bios)) + defer_issue_bios(conf, head_sh->sector, &pending_bios); } static struct dma_async_tx_descriptor * @@ -1212,7 +1288,6 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - struct bio_list return_bi = BIO_EMPTY_LIST; int i; pr_debug("%s: stripe %llu\n", __func__, @@ -1236,16 +1311,13 @@ static void ops_complete_biofill(void *stripe_head_ref) while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - if (!raid5_dec_bi_active_stripes(rbi)) - bio_list_add(&return_bi, rbi); + bio_endio(rbi); rbi = rbi2; } } } clear_bit(STRIPE_BIOFILL_RUN, &sh->state); - return_io(&return_bi); - set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); } @@ -2014,6 +2086,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) tx = ops_run_prexor6(sh, percpu, tx); } + if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request)) + tx = ops_run_partial_parity(sh, percpu, tx); + if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { tx = ops_run_biodrain(sh, tx); overlap_clear++; @@ -2046,8 +2121,15 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } +static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) +{ + if (sh->ppl_page) + __free_page(sh->ppl_page); + kmem_cache_free(sc, sh); +} + static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, - int disks) + int disks, struct r5conf *conf) { struct stripe_head *sh; int i; @@ -2061,6 +2143,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, INIT_LIST_HEAD(&sh->r5c); INIT_LIST_HEAD(&sh->log_list); atomic_set(&sh->count, 1); + sh->raid_conf = conf; sh->log_start = MaxSector; for (i = 0; i < disks; i++) { struct r5dev *dev = &sh->dev[i]; @@ -2068,6 +2151,14 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, bio_init(&dev->req, &dev->vec, 1); bio_init(&dev->rreq, &dev->rvec, 1); } + + if (raid5_has_ppl(conf)) { + sh->ppl_page = alloc_page(gfp); + if (!sh->ppl_page) { + free_stripe(sc, sh); + sh = NULL; + } + } } return sh; } @@ -2075,15 +2166,13 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) { struct stripe_head *sh; - sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size); + sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf); if (!sh) return 0; - sh->raid_conf = conf; - if (grow_buffers(sh, gfp)) { shrink_buffers(sh); - kmem_cache_free(conf->slab_cache, sh); + free_stripe(conf->slab_cache, sh); return 0; } sh->hash_lock_index = @@ -2210,7 +2299,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) * pages have been transferred over, and the old kmem_cache is * freed when all stripes are done. * 3/ reallocate conf->disks to be suitable bigger. If this fails, - * we simple return a failre status - no need to clean anything up. + * we simple return a failure status - no need to clean anything up. * 4/ allocate new pages for the new slots in the new stripe_heads. * If this fails, we don't bother trying the shrink the * stripe_heads down again, we just leave them as they are. @@ -2228,9 +2317,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) int i; int hash, cnt; - if (newsize <= conf->pool_size) - return 0; /* never bother to shrink */ - err = md_allow_write(conf->mddev); if (err) return err; @@ -2246,11 +2332,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) mutex_lock(&conf->cache_size_mutex); for (i = conf->max_nr_stripes; i; i--) { - nsh = alloc_stripe(sc, GFP_KERNEL, newsize); + nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf); if (!nsh) break; - nsh->raid_conf = conf; list_add(&nsh->lru, &newstripes); } if (i) { @@ -2258,7 +2343,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) while (!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); list_del(&nsh->lru); - kmem_cache_free(sc, nsh); + free_stripe(sc, nsh); } kmem_cache_destroy(sc); mutex_unlock(&conf->cache_size_mutex); @@ -2284,7 +2369,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) nsh->dev[i].orig_page = osh->dev[i].page; } nsh->hash_lock_index = hash; - kmem_cache_free(conf->slab_cache, osh); + free_stripe(conf->slab_cache, osh); cnt++; if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS + !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) { @@ -2323,6 +2408,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) err = -ENOMEM; mutex_unlock(&conf->cache_size_mutex); + + conf->slab_cache = sc; + conf->active_name = 1-conf->active_name; + /* Step 4, return new stripes to service */ while(!list_empty(&newstripes)) { nsh = list_entry(newstripes.next, struct stripe_head, lru); @@ -2340,8 +2429,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) } /* critical section pass, GFP_NOIO no longer needed */ - conf->slab_cache = sc; - conf->active_name = 1-conf->active_name; if (!err) conf->pool_size = newsize; return err; @@ -2359,7 +2446,7 @@ static int drop_one_stripe(struct r5conf *conf) return 0; BUG_ON(atomic_read(&sh->count)); shrink_buffers(sh); - kmem_cache_free(conf->slab_cache, sh); + free_stripe(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); conf->max_nr_stripes--; return 1; @@ -3082,6 +3169,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked++; } + if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page && + test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) && + !test_bit(STRIPE_FULL_WRITE, &sh->state) && + test_bit(R5_Insync, &sh->dev[pd_idx].flags)) + set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request); + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", __func__, (unsigned long long)sh->sector, s->locked, s->ops_request); @@ -3103,14 +3196,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, (unsigned long long)bi->bi_iter.bi_sector, (unsigned long long)sh->sector); - /* - * If several bio share a stripe. The bio bi_phys_segments acts as a - * reference count to avoid race. The reference count should already be - * increased before this function is called (for example, in - * raid5_make_request()), so other bio sharing this stripe will not free the - * stripe. If a stripe is owned by one stripe, the stripe lock will - * protect it. - */ spin_lock_irq(&sh->stripe_lock); /* Don't allow new IO added to stripes in batch list */ if (sh->batch_head) @@ -3129,6 +3214,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) goto overlap; + if (forwrite && raid5_has_ppl(conf)) { + /* + * With PPL only writes to consecutive data chunks within a + * stripe are allowed because for a single stripe_head we can + * only have one PPL entry at a time, which describes one data + * range. Not really an overlap, but wait_for_overlap can be + * used to handle this. + */ + sector_t sector; + sector_t first = 0; + sector_t last = 0; + int count = 0; + int i; + + for (i = 0; i < sh->disks; i++) { + if (i != sh->pd_idx && + (i == dd_idx || sh->dev[i].towrite)) { + sector = sh->dev[i].sector; + if (count == 0 || sector < first) + first = sector; + if (sector > last) + last = sector; + count++; + } + } + + if (first + conf->chunk_sectors * (count - 1) != last) + goto overlap; + } + if (!forwrite || previous) clear_bit(STRIPE_BATCH_READY, &sh->state); @@ -3136,7 +3251,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, if (*bip) bi->bi_next = *bip; *bip = bi; - raid5_inc_bi_active_stripes(bi); + bio_inc_remaining(bi); + md_write_inc(conf->mddev, bi); if (forwrite) { /* check if page is covered */ @@ -3213,8 +3329,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, static void handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s, int disks, - struct bio_list *return_bi) + struct stripe_head_state *s, int disks) { int i; BUG_ON(sh->batch_head); @@ -3250,7 +3365,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bi) bitmap_end = 1; - r5l_stripe_write_finished(sh); + log_stripe_write_finished(sh); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -3260,10 +3375,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); bi->bi_error = -EIO; - if (!raid5_dec_bi_active_stripes(bi)) { - md_write_end(conf->mddev); - bio_list_add(return_bi, bi); - } + md_write_end(conf->mddev); + bio_endio(bi); bi = nextbi; } if (bitmap_end) @@ -3284,10 +3397,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); bi->bi_error = -EIO; - if (!raid5_dec_bi_active_stripes(bi)) { - md_write_end(conf->mddev); - bio_list_add(return_bi, bi); - } + md_write_end(conf->mddev); + bio_endio(bi); bi = bi2; } @@ -3312,8 +3423,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, r5_next_bio(bi, sh->dev[i].sector); bi->bi_error = -EIO; - if (!raid5_dec_bi_active_stripes(bi)) - bio_list_add(return_bi, bi); + bio_endio(bi); bi = nextbi; } } @@ -3449,7 +3559,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) /* Pre-reads at not permitted until after short delay * to gather multiple requests. However if this - * device is no Insync, the block could only be be computed + * device is no Insync, the block could only be computed * and there is no need to delay that. */ return 0; @@ -3468,7 +3578,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, /* If we are forced to do a reconstruct-write, either because * the current RAID6 implementation only supports that, or - * or because parity cannot be trusted and we are currently + * because parity cannot be trusted and we are currently * recovering it, there is extra need to be careful. * If one of the devices that we would need to read, because * it is not being overwritten (and maybe not written at all) @@ -3508,9 +3618,20 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); BUG_ON(test_bit(R5_Wantread, &dev->flags)); BUG_ON(sh->batch_head); + + /* + * In the raid6 case if the only non-uptodate disk is P + * then we already trusted P to compute the other failed + * drives. It is safe to compute rather than re-read P. + * In other cases we only compute blocks from failed + * devices, otherwise check/repair might fail to detect + * a real inconsistency. + */ + if ((s->uptodate == disks - 1) && + ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) || (s->failed && (disk_idx == s->failed_num[0] || - disk_idx == s->failed_num[1]))) { + disk_idx == s->failed_num[1])))) { /* have disk failed, and we're requested to fetch it; * do compute it */ @@ -3612,7 +3733,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, * never LOCKED, so we don't need to test 'failed' directly. */ static void handle_stripe_clean_event(struct r5conf *conf, - struct stripe_head *sh, int disks, struct bio_list *return_bi) + struct stripe_head *sh, int disks) { int i; struct r5dev *dev; @@ -3644,10 +3765,8 @@ returnbi: while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); - if (!raid5_dec_bi_active_stripes(wbi)) { - md_write_end(conf->mddev); - bio_list_add(return_bi, wbi); - } + md_write_end(conf->mddev); + bio_endio(wbi); wbi = wbi2; } bitmap_endwrite(conf->mddev->bitmap, sh->sector, @@ -3669,7 +3788,7 @@ returnbi: discard_pending = 1; } - r5l_stripe_write_finished(sh); + log_stripe_write_finished(sh); if (!discard_pending && test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { @@ -4556,7 +4675,8 @@ static void handle_stripe(struct stripe_head *sh) if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) goto finish; - if (s.handle_bad_blocks) { + if (s.handle_bad_blocks || + test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { set_bit(STRIPE_HANDLE, &sh->state); goto finish; } @@ -4589,7 +4709,7 @@ static void handle_stripe(struct stripe_head *sh) sh->reconstruct_state = 0; break_stripe_batch_list(sh, 0); if (s.to_read+s.to_write+s.written) - handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); + handle_failed_stripe(conf, sh, &s, disks); if (s.syncing + s.replacing) handle_failed_sync(conf, sh, &s); } @@ -4655,11 +4775,11 @@ static void handle_stripe(struct stripe_head *sh) && !test_bit(R5_LOCKED, &qdev->flags) && (test_bit(R5_UPTODATE, &qdev->flags) || test_bit(R5_Discard, &qdev->flags)))))) - handle_stripe_clean_event(conf, sh, disks, &s.return_bi); + handle_stripe_clean_event(conf, sh, disks); if (s.just_cached) - r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi); - r5l_stripe_write_finished(sh); + r5c_handle_cached_data_endio(conf, sh, disks); + log_stripe_write_finished(sh); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests @@ -4886,16 +5006,6 @@ finish: md_wakeup_thread(conf->mddev->thread); } - if (!bio_list_empty(&s.return_bi)) { - if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) { - spin_lock_irq(&conf->device_lock); - bio_list_merge(&conf->return_bi, &s.return_bi); - spin_unlock_irq(&conf->device_lock); - md_wakeup_thread(conf->mddev->thread); - } else - return_io(&s.return_bi); - } - clear_bit_unlock(STRIPE_ACTIVE, &sh->state); } @@ -4984,12 +5094,14 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf) md_wakeup_thread(conf->mddev->thread); } -static struct bio *remove_bio_from_retry(struct r5conf *conf) +static struct bio *remove_bio_from_retry(struct r5conf *conf, + unsigned int *offset) { struct bio *bi; bi = conf->retry_read_aligned; if (bi) { + *offset = conf->retry_read_offset; conf->retry_read_aligned = NULL; return bi; } @@ -4997,11 +5109,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) if(bi) { conf->retry_read_aligned_list = bi->bi_next; bi->bi_next = NULL; - /* - * this sets the active strip count to 1 and the processed - * strip count to zero (upper 8 bits) - */ - raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ + *offset = 0; } return bi; @@ -5136,24 +5244,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) { struct bio *split; + sector_t sector = raid_bio->bi_iter.bi_sector; + unsigned chunk_sects = mddev->chunk_sectors; + unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); - do { - sector_t sector = raid_bio->bi_iter.bi_sector; - unsigned chunk_sects = mddev->chunk_sectors; - unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); - - if (sectors < bio_sectors(raid_bio)) { - split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set); - bio_chain(split, raid_bio); - } else - split = raid_bio; + if (sectors < bio_sectors(raid_bio)) { + struct r5conf *conf = mddev->private; + split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split); + bio_chain(split, raid_bio); + generic_make_request(raid_bio); + raid_bio = split; + } - if (!raid5_read_one_chunk(mddev, split)) { - if (split != raid_bio) - generic_make_request(raid_bio); - return split; - } - } while (split != raid_bio); + if (!raid5_read_one_chunk(mddev, raid_bio)) + return raid_bio; return NULL; } @@ -5170,19 +5274,27 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) */ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) { - struct stripe_head *sh = NULL, *tmp; + struct stripe_head *sh, *tmp; struct list_head *handle_list = NULL; - struct r5worker_group *wg = NULL; + struct r5worker_group *wg; + bool second_try = !r5c_is_writeback(conf->log); + bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state); +again: + wg = NULL; + sh = NULL; if (conf->worker_cnt_per_group == 0) { - handle_list = &conf->handle_list; + handle_list = try_loprio ? &conf->loprio_list : + &conf->handle_list; } else if (group != ANY_GROUP) { - handle_list = &conf->worker_groups[group].handle_list; + handle_list = try_loprio ? &conf->worker_groups[group].loprio_list : + &conf->worker_groups[group].handle_list; wg = &conf->worker_groups[group]; } else { int i; for (i = 0; i < conf->group_cnt; i++) { - handle_list = &conf->worker_groups[i].handle_list; + handle_list = try_loprio ? &conf->worker_groups[i].loprio_list : + &conf->worker_groups[i].handle_list; wg = &conf->worker_groups[i]; if (!list_empty(handle_list)) break; @@ -5233,8 +5345,13 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) wg = NULL; } - if (!sh) - return NULL; + if (!sh) { + if (second_try) + return NULL; + second_try = true; + try_loprio = !try_loprio; + goto again; + } if (wg) { wg->stripes_cnt--; @@ -5323,7 +5440,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) struct r5conf *conf = mddev->private; sector_t logical_sector, last_sector; struct stripe_head *sh; - int remaining; int stripe_sectors; if (mddev->reshape_position != MaxSector) @@ -5334,7 +5450,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); bi->bi_next = NULL; - bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + md_write_start(mddev, bi); stripe_sectors = conf->chunk_sectors * (conf->raid_disks - conf->max_degraded); @@ -5380,7 +5496,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) continue; sh->dev[d].towrite = bi; set_bit(R5_OVERWRITE, &sh->dev[d].flags); - raid5_inc_bi_active_stripes(bi); + bio_inc_remaining(bi); + md_write_inc(mddev, bi); sh->overwrite_disks++; } spin_unlock_irq(&sh->stripe_lock); @@ -5403,11 +5520,8 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) release_stripe_plug(mddev, sh); } - remaining = raid5_dec_bi_active_stripes(bi); - if (remaining == 0) { - md_write_end(mddev); - bio_endio(bi); - } + md_write_end(mddev); + bio_endio(bi); } static void raid5_make_request(struct mddev *mddev, struct bio * bi) @@ -5418,7 +5532,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) sector_t logical_sector, last_sector; struct stripe_head *sh; const int rw = bio_data_dir(bi); - int remaining; DEFINE_WAIT(w); bool do_prepare; bool do_flush = false; @@ -5440,8 +5553,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) do_flush = bi->bi_opf & REQ_PREFLUSH; } - md_write_start(mddev, bi); - /* * If array is degraded, better not do chunk aligned read because * later we might have to read it again in order to reconstruct @@ -5462,7 +5573,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; - bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + md_write_start(mddev, bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { @@ -5597,16 +5708,9 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) } finish_wait(&conf->wait_for_overlap, &w); - remaining = raid5_dec_bi_active_stripes(bi); - if (remaining == 0) { - - if ( rw == WRITE ) - md_write_end(mddev); - - trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), - bi, 0); - bio_endio(bi); - } + if (rw == WRITE) + md_write_end(mddev); + bio_endio(bi); } static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); @@ -5955,7 +6059,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n return STRIPE_SECTORS; } -static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) +static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, + unsigned int offset) { /* We may not be able to submit a whole bio at once as there * may not be enough stripe_heads available. @@ -5971,7 +6076,6 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) int dd_idx; sector_t sector, logical_sector, last_sector; int scnt = 0; - int remaining; int handled = 0; logical_sector = raid_bio->bi_iter.bi_sector & @@ -5985,7 +6089,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) sector += STRIPE_SECTORS, scnt++) { - if (scnt < raid5_bi_processed_stripes(raid_bio)) + if (scnt < offset) /* already done this stripe */ continue; @@ -5993,15 +6097,15 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) if (!sh) { /* failed to get a stripe - must wait */ - raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; + conf->retry_read_offset = scnt; return handled; } if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { raid5_release_stripe(sh); - raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; + conf->retry_read_offset = scnt; return handled; } @@ -6010,12 +6114,9 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) raid5_release_stripe(sh); handled++; } - remaining = raid5_dec_bi_active_stripes(raid_bio); - if (remaining == 0) { - trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), - raid_bio, 0); - bio_endio(raid_bio); - } + + bio_endio(raid_bio); + if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); return handled; @@ -6058,7 +6159,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, for (i = 0; i < batch_size; i++) handle_stripe(batch[i]); - r5l_write_stripe_run(conf->log); + log_write_stripe_run(conf); cond_resched(); @@ -6075,6 +6176,7 @@ static void raid5_do_work(struct work_struct *work) struct r5worker *worker = container_of(work, struct r5worker, work); struct r5worker_group *group = worker->group; struct r5conf *conf = group->conf; + struct mddev *mddev = conf->mddev; int group_id = group - conf->worker_groups; int handled; struct blk_plug plug; @@ -6095,6 +6197,9 @@ static void raid5_do_work(struct work_struct *work) if (!batch_size && !released) break; handled += batch_size; + wait_event_lock_irq(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), + conf->device_lock); } pr_debug("%d stripes handled\n", handled); @@ -6122,24 +6227,13 @@ static void raid5d(struct md_thread *thread) md_check_recovery(mddev); - if (!bio_list_empty(&conf->return_bi) && - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { - struct bio_list tmp = BIO_EMPTY_LIST; - spin_lock_irq(&conf->device_lock); - if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { - bio_list_merge(&tmp, &conf->return_bi); - bio_list_init(&conf->return_bi); - } - spin_unlock_irq(&conf->device_lock); - return_io(&tmp); - } - blk_start_plug(&plug); handled = 0; spin_lock_irq(&conf->device_lock); while (1) { struct bio *bio; int batch_size, released; + unsigned int offset; released = release_stripe_list(conf, conf->temp_inactive_list); if (released) @@ -6157,10 +6251,10 @@ static void raid5d(struct md_thread *thread) } raid5_activate_delayed(conf); - while ((bio = remove_bio_from_retry(conf))) { + while ((bio = remove_bio_from_retry(conf, &offset))) { int ok; spin_unlock_irq(&conf->device_lock); - ok = retry_aligned_read(conf, bio); + ok = retry_aligned_read(conf, bio, offset); spin_lock_irq(&conf->device_lock); if (!ok) break; @@ -6544,6 +6638,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt, group = &(*worker_groups)[i]; INIT_LIST_HEAD(&group->handle_list); + INIT_LIST_HEAD(&group->loprio_list); group->conf = conf; group->workers = workers + i * cnt; @@ -6634,8 +6729,8 @@ static void free_conf(struct r5conf *conf) { int i; - if (conf->log) - r5l_exit_log(conf->log); + log_exit(conf); + if (conf->shrinker.nr_deferred) unregister_shrinker(&conf->shrinker); @@ -6646,7 +6741,10 @@ static void free_conf(struct r5conf *conf) if (conf->disks[i].extra_page) put_page(conf->disks[i].extra_page); kfree(conf->disks); + if (conf->bio_split) + bioset_free(conf->bio_split); kfree(conf->stripe_hashtbl); + kfree(conf->pending_data); kfree(conf); } @@ -6756,6 +6854,14 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); if (conf == NULL) goto abort; + INIT_LIST_HEAD(&conf->free_list); + INIT_LIST_HEAD(&conf->pending_list); + conf->pending_data = kzalloc(sizeof(struct r5pending_data) * + PENDING_IO_MAX, GFP_KERNEL); + if (!conf->pending_data) + goto abort; + for (i = 0; i < PENDING_IO_MAX; i++) + list_add(&conf->pending_data[i].sibling, &conf->free_list); /* Don't enable multi-threading by default*/ if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group, &new_group)) { @@ -6771,15 +6877,14 @@ static struct r5conf *setup_conf(struct mddev *mddev) init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); + INIT_LIST_HEAD(&conf->loprio_list); INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); - bio_list_init(&conf->return_bi); init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); atomic_set(&conf->active_aligned_reads, 0); - bio_list_init(&conf->pending_bios); spin_lock_init(&conf->pending_bios_lock); conf->batch_bio_dispatch = true; rdev_for_each(rdev, mddev) { @@ -6813,6 +6918,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) goto abort; } + conf->bio_split = bioset_create(BIO_POOL_SIZE, 0); + if (!conf->bio_split) + goto abort; conf->mddev = mddev; if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) @@ -7097,6 +7205,13 @@ static int raid5_run(struct mddev *mddev) BUG_ON(mddev->delta_disks != 0); } + if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && + test_bit(MD_HAS_PPL, &mddev->flags)) { + pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n", + mdname(mddev)); + clear_bit(MD_HAS_PPL, &mddev->flags); + } + if (mddev->private == NULL) conf = setup_conf(mddev); else @@ -7188,7 +7303,10 @@ static int raid5_run(struct mddev *mddev) if (mddev->degraded > dirty_parity_disks && mddev->recovery_cp != MaxSector) { - if (mddev->ok_start_degraded) + if (test_bit(MD_HAS_PPL, &mddev->flags)) + pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", + mdname(mddev)); + else if (mddev->ok_start_degraded) pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n", mdname(mddev)); else { @@ -7254,14 +7372,6 @@ static int raid5_run(struct mddev *mddev) mddev->queue->limits.discard_alignment = stripe; mddev->queue->limits.discard_granularity = stripe; - /* - * We use 16-bit counter of active stripes in bi_phys_segments - * (minus one for over-loaded initialization) - */ - blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); - blk_queue_max_discard_sectors(mddev->queue, - 0xfffe * STRIPE_SECTORS); - blk_queue_max_write_same_sectors(mddev->queue, 0); blk_queue_max_write_zeroes_sectors(mddev->queue, 0); @@ -7299,14 +7409,8 @@ static int raid5_run(struct mddev *mddev) blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); } - if (journal_dev) { - char b[BDEVNAME_SIZE]; - - pr_debug("md/raid:%s: using device %s as journal\n", - mdname(mddev), bdevname(journal_dev->bdev, b)); - if (r5l_init_log(conf, journal_dev)) - goto abort; - } + if (log_init(conf, journal_dev, raid5_has_ppl(conf))) + goto abort; return 0; abort: @@ -7420,17 +7524,16 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) print_raid5_conf(conf); if (test_bit(Journal, &rdev->flags) && conf->log) { - struct r5l_log *log; /* * we can't wait pending write here, as this is called in * raid5d, wait will deadlock. + * neilb: there is no locking about new writes here, + * so this cannot be safe. */ - if (atomic_read(&mddev->writes_pending)) + if (atomic_read(&conf->active_stripes)) { return -EBUSY; - log = conf->log; - conf->log = NULL; - synchronize_rcu(); - r5l_exit_log(log); + } + log_exit(conf); return 0; } if (rdev == p->rdev) @@ -7469,6 +7572,11 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) *rdevp = rdev; } } + if (!err) { + err = log_modify(conf, rdev, false); + if (err) + goto abort; + } if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; @@ -7477,12 +7585,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * but will never see neither - if they are careful */ p->replacement = NULL; - clear_bit(WantReplacement, &rdev->flags); - } else - /* We might have just removed the Replacement as faulty- - * clear the bit just in case - */ - clear_bit(WantReplacement, &rdev->flags); + + if (!err) + err = log_modify(conf, p->rdev, true); + } + + clear_bit(WantReplacement, &rdev->flags); abort: print_raid5_conf(conf); @@ -7499,7 +7607,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int last = conf->raid_disks - 1; if (test_bit(Journal, &rdev->flags)) { - char b[BDEVNAME_SIZE]; if (conf->log) return -EBUSY; @@ -7508,9 +7615,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) * The array is in readonly mode if journal is missing, so no * write requests running. We should be safe */ - r5l_init_log(conf, rdev); - pr_debug("md/raid:%s: using device %s as journal\n", - mdname(mddev), bdevname(rdev->bdev, b)); + log_init(conf, rdev, false); return 0; } if (mddev->recovery_disabled == conf->recovery_disabled) @@ -7537,10 +7642,12 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (p->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; - err = 0; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); + + err = log_modify(conf, rdev, true); + goto out; } } @@ -7574,7 +7681,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) sector_t newsize; struct r5conf *conf = mddev->private; - if (conf->log) + if (conf->log || raid5_has_ppl(conf)) return -EINVAL; sectors &= ~((sector_t)conf->chunk_sectors - 1); newsize = raid5_size(mddev, sectors, mddev->raid_disks); @@ -7625,7 +7732,7 @@ static int check_reshape(struct mddev *mddev) { struct r5conf *conf = mddev->private; - if (conf->log) + if (conf->log || raid5_has_ppl(conf)) return -EINVAL; if (mddev->delta_disks == 0 && mddev->new_layout == mddev->layout && @@ -7658,6 +7765,9 @@ static int check_reshape(struct mddev *mddev) mddev->chunk_sectors) ) < 0) return -ENOMEM; + + if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size) + return 0; /* never bother to shrink */ return resize_stripes(conf, (conf->previous_raid_disks + mddev->delta_disks)); } @@ -8148,6 +8258,68 @@ static void *raid6_takeover(struct mddev *mddev) return setup_conf(mddev); } +static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) +{ + struct r5conf *conf; + int err; + + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) { + mddev_unlock(mddev); + return -ENODEV; + } + + if (strncmp(buf, "ppl", 3) == 0) { + /* ppl only works with RAID 5 */ + if (!raid5_has_ppl(conf) && conf->level == 5) { + err = log_init(conf, NULL, true); + if (!err) { + err = resize_stripes(conf, conf->pool_size); + if (err) + log_exit(conf); + } + } else + err = -EINVAL; + } else if (strncmp(buf, "resync", 6) == 0) { + if (raid5_has_ppl(conf)) { + mddev_suspend(mddev); + log_exit(conf); + mddev_resume(mddev); + err = resize_stripes(conf, conf->pool_size); + } else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) && + r5l_log_disk_error(conf)) { + bool journal_dev_exists = false; + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + if (test_bit(Journal, &rdev->flags)) { + journal_dev_exists = true; + break; + } + + if (!journal_dev_exists) { + mddev_suspend(mddev); + clear_bit(MD_HAS_JOURNAL, &mddev->flags); + mddev_resume(mddev); + } else /* need remove journal device first */ + err = -EBUSY; + } else + err = -EINVAL; + } else { + err = -EINVAL; + } + + if (!err) + md_update_sb(mddev, 1); + + mddev_unlock(mddev); + + return err; +} + static struct md_personality raid6_personality = { .name = "raid6", @@ -8170,6 +8342,7 @@ static struct md_personality raid6_personality = .quiesce = raid5_quiesce, .takeover = raid6_takeover, .congested = raid5_congested, + .change_consistency_policy = raid5_change_consistency_policy, }; static struct md_personality raid5_personality = { @@ -8193,6 +8366,7 @@ static struct md_personality raid5_personality = .quiesce = raid5_quiesce, .takeover = raid5_takeover, .congested = raid5_congested, + .change_consistency_policy = raid5_change_consistency_policy, }; static struct md_personality raid4_personality = @@ -8217,6 +8391,7 @@ static struct md_personality raid4_personality = .quiesce = raid5_quiesce, .takeover = raid4_takeover, .congested = raid5_congested, + .change_consistency_policy = raid5_change_consistency_policy, }; static int __init raid5_init(void) |