Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

md/raid10: ensure device failure recorded before write request returns.

When a write to one of the legs of a RAID10 fails, the failure is
recorded in the metadata of the other legs so that after a restart
the data on the failed drive won't be trusted even if that drive seems
to be working again (maybe a cable was unplugged).

Currently there is no interlock between the write request completing
and the metadata update. So it is possible that the write will
complete, the app will confirm success in some way, and then the
machine will crash before the metadata update completes.

This is an extremely small hole for a race to fit in, but it is
theoretically possible and so should be closed.

So:
- set MD_CHANGE_PENDING when requesting a metadata update for a
failed device, so we can know with certainty when it completes
- queue requests that experienced an error on a new queue which
is only processed after the metadata update completes
- call raid_end_bio_io() on bios in that queue when the time comes.

Signed-off-by: NeilBrown <neilb@suse.com>

NeilBrown 95af587e 55ce74d4

+34 -1
+28 -1
drivers/md/raid10.c
··· 1681 1681 set_bit(Blocked, &rdev->flags); 1682 1682 set_bit(Faulty, &rdev->flags); 1683 1683 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1684 + set_bit(MD_CHANGE_PENDING, &mddev->flags); 1684 1685 spin_unlock_irqrestore(&conf->device_lock, flags); 1685 1686 printk(KERN_ALERT 1686 1687 "md/raid10:%s: Disk failure on %s, disabling device.\n" ··· 2739 2738 } 2740 2739 put_buf(r10_bio); 2741 2740 } else { 2741 + bool fail = false; 2742 2742 for (m = 0; m < conf->copies; m++) { 2743 2743 int dev = r10_bio->devs[m].devnum; 2744 2744 struct bio *bio = r10_bio->devs[m].bio; ··· 2752 2750 rdev_dec_pending(rdev, conf->mddev); 2753 2751 } else if (bio != NULL && 2754 2752 !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2753 + fail = true; 2755 2754 if (!narrow_write_error(r10_bio, m)) { 2756 2755 md_error(conf->mddev, rdev); 2757 2756 set_bit(R10BIO_Degraded, ··· 2773 2770 if (test_bit(R10BIO_WriteError, 2774 2771 &r10_bio->state)) 2775 2772 close_write(r10_bio); 2776 - raid_end_bio_io(r10_bio); 2773 + if (fail) { 2774 + spin_lock_irq(&conf->device_lock); 2775 + list_add(&r10_bio->retry_list, &conf->bio_end_io_list); 2776 + spin_unlock_irq(&conf->device_lock); 2777 + md_wakeup_thread(conf->mddev->thread); 2778 + } else 2779 + raid_end_bio_io(r10_bio); 2777 2780 } 2778 2781 } 2779 2782 ··· 2793 2784 struct blk_plug plug; 2794 2785 2795 2786 md_check_recovery(mddev); 2787 + 2788 + if (!list_empty_careful(&conf->bio_end_io_list) && 2789 + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { 2790 + LIST_HEAD(tmp); 2791 + spin_lock_irqsave(&conf->device_lock, flags); 2792 + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { 2793 + list_add(&tmp, &conf->bio_end_io_list); 2794 + list_del_init(&conf->bio_end_io_list); 2795 + } 2796 + spin_unlock_irqrestore(&conf->device_lock, flags); 2797 + while (!list_empty(&tmp)) { 2798 + r10_bio = list_first_entry(&conf->bio_end_io_list, 2799 + struct r10bio, retry_list); 2800 + list_del(&r10_bio->retry_list); 2801 + raid_end_bio_io(r10_bio); 2802 + } 2803 
+ } 2796 2804 2797 2805 blk_start_plug(&plug); 2798 2806 for (;;) { ··· 3585 3559 conf->reshape_safe = conf->reshape_progress; 3586 3560 spin_lock_init(&conf->device_lock); 3587 3561 INIT_LIST_HEAD(&conf->retry_list); 3562 + INIT_LIST_HEAD(&conf->bio_end_io_list); 3588 3563 3589 3564 spin_lock_init(&conf->resync_lock); 3590 3565 init_waitqueue_head(&conf->wait_barrier);
+6
drivers/md/raid10.h
··· 53 53 sector_t offset_diff; 54 54 55 55 struct list_head retry_list; 56 + /* A separate list of r1bio which just need raid_end_bio_io called. 57 + * This mustn't happen for writes which had any errors if the superblock 58 + * needs to be written. 59 + */ 60 + struct list_head bio_end_io_list; 61 + 56 62 /* queue pending writes and submit them on unplug */ 57 63 struct bio_list pending_bio_list; 58 64 int pending_count;