From ed30be077e705e0dff53bfc51d23feb8aeeab78f Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Wed, 31 Oct 2012 11:42:30 +1100 Subject: MD RAID10: Fix oops when creating RAID10 arrays via dm-raid.c Commit 2863b9eb didn't take into account the changes to add TRIM support to RAID10 (commit 532a2a3fb). That is, when using dm-raid.c to create the RAID10 arrays, there is no mddev->gendisk or mddev->queue. The code added to support TRIM simply assumes that mddev->queue is available without checking. The result is an oops any time dm-raid.c attempts to create a RAID10 device. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/raid10.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 906ccbd0f7dc..d1295aff4173 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1783,7 +1783,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); - if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); @@ -3613,11 +3613,14 @@ static int run(struct mddev *mddev) discard_supported = true; } - if (discard_supported) - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); - else - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); - + if (mddev->queue) { + if (discard_supported) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + } /* need to check that every block has at least one working mirror */ if (!enough(conf, -1)) { printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", -- cgit v1.2.3 From e7c0c3fa29280d62aa5e11101a674bb3064bd791 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 22 Nov 2012 14:42:49 +1100 Subject: md/raid10: close race that lose writes lost when replacement completes. When a replacement operation completes there is a small window when the original device is marked 'faulty' and the replacement still looks like a replacement. The faulty should be removed and the replacement moved in place very quickly, bit it isn't instant. So the code write out to the array must handle the possibility that the only working device for some slot in the replacement - but it doesn't. If the primary device is faulty it just gives up. This can lead to corruption. So make the code more robust: if either the primary or the replacement is present and working, write to them. Only when neither are present do we give up. This bug has been present since replacement was introduced in 3.3, so it is suitable for any -stable kernel since then. Reported-by: "George Spelvin" Cc: stable@vger.kernel.org Signed-off-by: NeilBrown --- drivers/md/raid10.c | 129 +++++++++++++++++++++++++++------------------------- 1 file changed, 68 insertions(+), 61 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d1295aff4173..ad032518fdc0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1334,18 +1334,21 @@ retry_write: blocked_rdev = rrdev; break; } + if (rdev && (test_bit(Faulty, &rdev->flags) + || test_bit(Unmerged, &rdev->flags))) + rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags) || test_bit(Unmerged, &rrdev->flags))) rrdev = NULL; r10_bio->devs[i].bio = NULL; r10_bio->devs[i].repl_bio = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags) || - test_bit(Unmerged, &rdev->flags)) { + + if (!rdev && !rrdev) { set_bit(R10BIO_Degraded, &r10_bio->state); continue; } - if (test_bit(WriteErrorSeen, &rdev->flags)) { + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; sector_t dev_sector = r10_bio->devs[i].addr; int bad_sectors; @@ -1387,8 +1390,10 @@ retry_write: max_sectors = good_sectors; } } - r10_bio->devs[i].bio = bio; - atomic_inc(&rdev->nr_pending); + if (rdev) { + r10_bio->devs[i].bio = bio; + atomic_inc(&rdev->nr_pending); + } if (rrdev) { r10_bio->devs[i].repl_bio = bio; atomic_inc(&rrdev->nr_pending); @@ -1444,69 +1449,71 @@ retry_write: for (i = 0; i < conf->copies; i++) { struct bio *mbio; int d = r10_bio->devs[i].devnum; - if (!r10_bio->devs[i].bio) - continue; + if (r10_bio->devs[i].bio) { + struct md_rdev *rdev = conf->mirrors[d].rdev; + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[i].bio = mbio; + + mbio->bi_sector = (r10_bio->devs[i].addr+ + choose_data_offset(r10_bio, + rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_private = r10_bio; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[i].bio = mbio; + atomic_inc(&r10_bio->remaining); - mbio->bi_sector = (r10_bio->devs[i].addr+ - choose_data_offset(r10_bio, - conf->mirrors[d].rdev)); - mbio->bi_bdev = conf->mirrors[d].rdev->bdev; - mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; - mbio->bi_private = r10_bio; + cb = blk_check_plugged(raid10_unplug, mddev, + sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, + cb); + else + plug = NULL; + spin_lock_irqsave(&conf->device_lock, flags); + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!plug) + md_wakeup_thread(mddev->thread); + } - atomic_inc(&r10_bio->remaining); + if (r10_bio->devs[i].repl_bio) { + struct md_rdev *rdev = conf->mirrors[d].replacement; + if (rdev == NULL) { + /* Replacement just got moved to main 'rdev' */ + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[i].repl_bio = mbio; + + mbio->bi_sector = (r10_bio->devs[i].addr + + choose_data_offset( + r10_bio, rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_private = r10_bio; - cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid10_plug_cb, cb); - else - plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); - if (plug) { - bio_list_add(&plug->pending, mbio); - plug->pending_cnt++; - } else { + atomic_inc(&r10_bio->remaining); + spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) - md_wakeup_thread(mddev->thread); - - if (!r10_bio->devs[i].repl_bio) - continue; - - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[i].repl_bio = mbio; - - /* We are actively writing to the original device - * so it cannot disappear, so the replacement cannot - * become NULL here - */ - mbio->bi_sector = (r10_bio->devs[i].addr + - choose_data_offset( - r10_bio, - conf->mirrors[d].replacement)); - mbio->bi_bdev = conf->mirrors[d].replacement->bdev; - mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; - mbio->bi_private = r10_bio; - - atomic_inc(&r10_bio->remaining); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) - md_wakeup_thread(mddev->thread); } /* Don't remove the bias on 'remaining' (one_write_done) until -- cgit v1.2.3 From 884162df2aadd7414bef4935e1a54976fd4e3988 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 22 Nov 2012 15:12:09 +1100 Subject: md/raid10: decrement correct pending counter when writing to replacement. When a write to a replacement device completes, we carefully and correctly found the rdev that the write actually went to and the blithely called rdev_dec_pending on the primary rdev, even if this write was to the replacement. This means that any writes to an array while a replacement was ongoing would cause the nr_pending count for the primary device to go negative, so it could never be removed. This bug has been present since replacement was introduced in 3.3, so it is suitable for any -stable kernel since then. Reported-by: "George Spelvin" Cc: stable@vger.kernel.org Signed-off-by: NeilBrown --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ad032518fdc0..0d5d0ff2c0f7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -499,7 +499,7 @@ static void raid10_end_write_request(struct bio *bio, int error) */ one_write_done(r10_bio); if (dec_rdev) - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + rdev_dec_pending(rdev, conf->mddev); } /* -- cgit v1.2.3 From 874807a83139abc094f939e93623c5623573d543 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 27 Nov 2012 12:14:40 +1100 Subject: md/raid1{,0}: fix deadlock in bitmap_unplug. If the raid1 or raid10 unplug function gets called from a make_request function (which is very possible) when there are bios on the current->bio_list list, then it will not be able to successfully call bitmap_unplug() and it could need to submit more bios and wait for them to complete. But they won't complete while current->bio_list is non-empty. So detect that case and handle the unplugging off to another thread just like we already do when called from within the scheduler. RAID1 version of bug was introduced in 3.6, so that part of fix is suitable for 3.6.y. RAID10 part won't apply. Cc: stable@vger.kernel.org Reported-by: Torsten Kaiser Reported-by: Peter Maloney Signed-off-by: NeilBrown --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0d5d0ff2c0f7..c9acbd717131 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1069,7 +1069,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) struct r10conf *conf = mddev->private; struct bio *bio; - if (from_schedule) { + if (from_schedule || current->bio_list) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); conf->pending_count += plug->pending_cnt; -- cgit v1.2.3