
md: support blocking writes to an array on device failure

Allows a userspace metadata handler to take action upon detecting a device
failure.

Based on an original patch by Neil Brown.

Changes:
- added blocked_wait waitqueue to rdev
- don't qualify Blocked with Faulty; always let userspace block writes
- added md_wait_for_blocked_rdev to wait for the Blocked flag to clear; if
  userspace misses the notification, another one is sent every 5 seconds
  (see the userspace sketch after this list)
- set MD_RECOVERY_NEEDED after clearing "blocked"
- kill DoBlock flag, just test mddev->external
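
For illustration only (not part of the patch), a minimal sketch of the
userspace side of this interface. The "blocked" token and the "-blocked"
command come from the state_show()/state_store() changes below, and
sysfs_notify() on the "state" attribute wakes a poll() for POLLPRI; the
device path and the metadata-update step are assumptions.

/* Sketch of a userspace metadata handler (hypothetical path).
 * sysfs_notify() on the rdev "state" attribute wakes poll(); the
 * handler records the failure in its external metadata, then writes
 * "-blocked" so pending writes may proceed.
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define STATE "/sys/block/md0/md/dev-sda1/state"  /* hypothetical */

int main(void)
{
    char buf[128];
    ssize_t n;
    int fd = open(STATE, O_RDWR);

    if (fd < 0)
        return 1;
    for (;;) {
        struct pollfd pfd = { .fd = fd, .events = POLLPRI | POLLERR };

        pread(fd, buf, sizeof(buf) - 1, 0);  /* arm the notification */
        poll(&pfd, 1, -1);                   /* re-woken every 5s while blocked */
        n = pread(fd, buf, sizeof(buf) - 1, 0);
        if (n <= 0)
            continue;
        buf[n] = '\0';
        if (strstr(buf, "blocked")) {
            /* update external metadata here, then: */
            write(fd, "-blocked", 8);
            puts("failure recorded; writes unblocked");
        }
    }
}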

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Dan Williams, committed by Linus Torvalds
6bfe0b49 11e2ede0

+120 -7
+32 -1
drivers/md/md.c
···
         len += sprintf(page+len, "%swrite_mostly",sep);
         sep = ",";
     }
+    if (test_bit(Blocked, &rdev->flags)) {
+        len += sprintf(page+len, "%sblocked", sep);
+        sep = ",";
+    }
     if (!test_bit(Faulty, &rdev->flags) &&
         !test_bit(In_sync, &rdev->flags)) {
         len += sprintf(page+len, "%sspare", sep);
···
      *  remove  - disconnects the device
      *  writemostly - sets write_mostly
      *  -writemostly - clears write_mostly
+     *  blocked - sets the Blocked flag
+     *  -blocked - clears the Blocked flag
      */
     int err = -EINVAL;
     if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
···
         err = 0;
     } else if (cmd_match(buf, "-writemostly")) {
         clear_bit(WriteMostly, &rdev->flags);
+        err = 0;
+    } else if (cmd_match(buf, "blocked")) {
+        set_bit(Blocked, &rdev->flags);
+        err = 0;
+    } else if (cmd_match(buf, "-blocked")) {
+        clear_bit(Blocked, &rdev->flags);
+        wake_up(&rdev->blocked_wait);
+        set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+        md_wakeup_thread(rdev->mddev->thread);
+
         err = 0;
     }
     return err ? err : len;
···
             goto abort_free;
         }
     }
+
     INIT_LIST_HEAD(&rdev->same_set);
+    init_waitqueue_head(&rdev->blocked_wait);

     return rdev;
···

     if (!rdev || test_bit(Faulty, &rdev->flags))
         return;
+
+    if (mddev->external)
+        set_bit(Blocked, &rdev->flags);
 /*
     dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
         mdname(mddev),
···

     rdev_for_each(rdev, rtmp, mddev)
         if (rdev->raid_disk >= 0 &&
-            !mddev->external &&
+            !test_bit(Blocked, &rdev->flags) &&
             (test_bit(Faulty, &rdev->flags) ||
              ! test_bit(In_sync, &rdev->flags)) &&
             atomic_read(&rdev->nr_pending)==0) {
···
             mddev_unlock(mddev);
         }
     }
+
+void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
+{
+    sysfs_notify(&rdev->kobj, NULL, "state");
+    wait_event_timeout(rdev->blocked_wait,
+                       !test_bit(Blocked, &rdev->flags),
+                       msecs_to_jiffies(5000));
+    rdev_dec_pending(rdev, mddev);
+}
+EXPORT_SYMBOL(md_wait_for_blocked_rdev);

 static int md_notify_reboot(struct notifier_block *this,
                             unsigned long code, void *x)
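
The 5-second renotification is not a timer inside md_wait_for_blocked_rdev()
itself: each caller retries in a loop, and every retry issues a fresh
sysfs_notify() followed by a bounded wait_event_timeout(). A rough userspace
model of that shape, with invented names and a pthread condition variable
standing in for the waitqueue:

/* Model (invented names) of the notify-then-bounded-wait loop formed
 * by callers of md_wait_for_blocked_rdev(): notify, wait up to five
 * seconds for Blocked to clear, repeat until it does.
 * Build with: cc -pthread model.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t blocked_wait = PTHREAD_COND_INITIALIZER;
static bool blocked = true;

static void wait_for_unblocked(void)
{
    pthread_mutex_lock(&lock);
    while (blocked) {
        struct timespec deadline;

        puts("notify userspace");            /* sysfs_notify() stand-in */
        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec += 5;                /* msecs_to_jiffies(5000) analogue */
        pthread_cond_timedwait(&blocked_wait, &lock, &deadline);
    }
    pthread_mutex_unlock(&lock);
}

static void *handler(void *arg)
{
    (void)arg;
    sleep(2);                                /* userspace reacts eventually */
    pthread_mutex_lock(&lock);
    blocked = false;                         /* "-blocked" written to "state" */
    pthread_cond_broadcast(&blocked_wait);   /* wake_up(&rdev->blocked_wait) */
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, handler, NULL);
    wait_for_unblocked();
    puts("unblocked; writes may proceed");
    return pthread_join(t, NULL);
}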
+24 -3
drivers/md/raid1.c
···
     r1bio_t *r1_bio;
     struct bio *read_bio;
     int i, targets = 0, disks;
-    mdk_rdev_t *rdev;
     struct bitmap *bitmap = mddev->bitmap;
     unsigned long flags;
     struct bio_list bl;
···
     const int rw = bio_data_dir(bio);
     const int do_sync = bio_sync(bio);
     int do_barriers;
+    mdk_rdev_t *blocked_rdev;

     /*
      * Register the new request and wait if the reconstruction
···
         first = 0;
     }
 #endif
+ retry_write:
+    blocked_rdev = NULL;
     rcu_read_lock();
     for (i = 0;  i < disks; i++) {
-        if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL &&
-            !test_bit(Faulty, &rdev->flags)) {
+        mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+        if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+            atomic_inc(&rdev->nr_pending);
+            blocked_rdev = rdev;
+            break;
+        }
+        if (rdev && !test_bit(Faulty, &rdev->flags)) {
             atomic_inc(&rdev->nr_pending);
             if (test_bit(Faulty, &rdev->flags)) {
                 rdev_dec_pending(rdev, mddev);
···
             r1_bio->bios[i] = NULL;
     }
     rcu_read_unlock();
+
+    if (unlikely(blocked_rdev)) {
+        /* Wait for this device to become unblocked */
+        int j;
+
+        for (j = 0; j < i; j++)
+            if (r1_bio->bios[j])
+                rdev_dec_pending(conf->mirrors[j].rdev, mddev);
+
+        allow_barrier(conf);
+        md_wait_for_blocked_rdev(blocked_rdev, mddev);
+        wait_barrier(conf);
+        goto retry_write;
+    }

     BUG_ON(targets == 0); /* we never fail the last device */
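
The write path above is a take-references-then-back-out pattern: one
reference per target, and on finding a Blocked device the pass is abandoned,
the references taken for earlier targets are dropped, and the whole pass is
retried once the device clears. The blocked device keeps its reference until
md_wait_for_blocked_rdev() releases it through rdev_dec_pending(); the
raid10 path below follows the same pattern. A condensed standalone model,
with invented names and a busy-wait standing in for the waitqueue sleep:

/* Standalone model (invented names) of the retry_write pattern:
 * accumulate a reference per target, and on finding a blocked device
 * release the references taken so far and retry the whole pass.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct dev {
    atomic_int nr_pending;
    atomic_bool blocked;
};

/* Stands in for md_wait_for_blocked_rdev(): waits, then drops the
 * reference that was taken on the blocked device. */
static void wait_for_blocked_dev(struct dev *d)
{
    while (atomic_load(&d->blocked))
        ;   /* the kernel sleeps on a waitqueue instead of spinning */
    atomic_fetch_sub(&d->nr_pending, 1);
}

static void write_to_all(struct dev *devs, int ndevs)
{
    int i;

retry_write:
    for (i = 0; i < ndevs; i++) {
        atomic_fetch_add(&devs[i].nr_pending, 1);
        if (atomic_load(&devs[i].blocked)) {
            /* back out references 0..i-1; device i keeps its
             * reference until the wait helper drops it */
            for (int j = 0; j < i; j++)
                atomic_fetch_sub(&devs[j].nr_pending, 1);
            wait_for_blocked_dev(&devs[i]);
            goto retry_write;
        }
    }
    puts("all targets referenced; submit writes");
}

int main(void)
{
    struct dev devs[2] = { { 0 } };

    write_to_all(devs, 2);
    return 0;
}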
+26 -3
drivers/md/raid10.c
···
     const int do_sync = bio_sync(bio);
     struct bio_list bl;
     unsigned long flags;
+    mdk_rdev_t *blocked_rdev;

     if (unlikely(bio_barrier(bio))) {
         bio_endio(bio, -EOPNOTSUPP);
···
     /*
      * WRITE:
      */
-    /* first select target devices under spinlock and
+    /* first select target devices under rcu_lock and
      * inc refcount on their rdev.  Record them by setting
      * bios[x] to bio
      */
     raid10_find_phys(conf, r10_bio);
+ retry_write:
+    blocked_rdev = 0;
     rcu_read_lock();
     for (i = 0;  i < conf->copies; i++) {
         int d = r10_bio->devs[i].devnum;
         mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
-        if (rdev &&
-            !test_bit(Faulty, &rdev->flags)) {
+        if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+            atomic_inc(&rdev->nr_pending);
+            blocked_rdev = rdev;
+            break;
+        }
+        if (rdev && !test_bit(Faulty, &rdev->flags)) {
             atomic_inc(&rdev->nr_pending);
             r10_bio->devs[i].bio = bio;
         } else {
···
         }
     }
     rcu_read_unlock();
+
+    if (unlikely(blocked_rdev)) {
+        /* Have to wait for this device to get unblocked, then retry */
+        int j;
+        int d;
+
+        for (j = 0; j < i; j++)
+            if (r10_bio->devs[j].bio) {
+                d = r10_bio->devs[j].devnum;
+                rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+            }
+        allow_barrier(conf);
+        md_wait_for_blocked_rdev(blocked_rdev, mddev);
+        wait_barrier(conf);
+        goto retry_write;
+    }

     atomic_set(&r10_bio->remaining, 0);
+33
drivers/md/raid5.c
···
         }
     }

+
 /*
  * handle_stripe - do things to a stripe.
  *
···
     struct stripe_head_state s;
     struct r5dev *dev;
     unsigned long pending = 0;
+    mdk_rdev_t *blocked_rdev = NULL;

     memset(&s, 0, sizeof(s));
     pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
···
         if (dev->written)
             s.written++;
         rdev = rcu_dereference(conf->disks[i].rdev);
+        if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+            blocked_rdev = rdev;
+            atomic_inc(&rdev->nr_pending);
+            break;
+        }
         if (!rdev || !test_bit(In_sync, &rdev->flags)) {
             /* The ReadError flag will just be confusing now */
             clear_bit(R5_ReadError, &dev->flags);
···
             set_bit(R5_Insync, &dev->flags);
     }
     rcu_read_unlock();
+
+    if (unlikely(blocked_rdev)) {
+        set_bit(STRIPE_HANDLE, &sh->state);
+        goto unlock;
+    }

     if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
         sh->ops.count++;
···
     if (sh->ops.count)
         pending = get_stripe_work(sh);

+ unlock:
     spin_unlock(&sh->lock);
+
+    /* wait for this device to become unblocked */
+    if (unlikely(blocked_rdev))
+        md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);

     if (pending)
         raid5_run_ops(sh, pending);
···
     struct stripe_head_state s;
     struct r6_state r6s;
     struct r5dev *dev, *pdev, *qdev;
+    mdk_rdev_t *blocked_rdev = NULL;

     r6s.qd_idx = raid6_next_disk(pd_idx, disks);
     pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
···
         if (dev->written)
             s.written++;
         rdev = rcu_dereference(conf->disks[i].rdev);
+        if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+            blocked_rdev = rdev;
+            atomic_inc(&rdev->nr_pending);
+            break;
+        }
         if (!rdev || !test_bit(In_sync, &rdev->flags)) {
             /* The ReadError flag will just be confusing now */
             clear_bit(R5_ReadError, &dev->flags);
···
             set_bit(R5_Insync, &dev->flags);
     }
     rcu_read_unlock();
+
+    if (unlikely(blocked_rdev)) {
+        set_bit(STRIPE_HANDLE, &sh->state);
+        goto unlock;
+    }
     pr_debug("locked=%d uptodate=%d to_read=%d"
         " to_write=%d failed=%d failed_num=%d,%d\n",
            s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
···
         !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
         handle_stripe_expansion(conf, sh, &r6s);

+ unlock:
     spin_unlock(&sh->lock);
+
+    /* wait for this device to become unblocked */
+    if (unlikely(blocked_rdev))
+        md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);

     return_io(return_bi);
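
handle_stripe() cannot back out and retry the way the raid1/raid10 write
paths do, so it defers instead: the stripe is flagged STRIPE_HANDLE for
another pass, the current pass is cut short, and the wait runs only after
the stripe lock is dropped. A standalone schematic of that shape, with
invented names:

/* Standalone schematic (invented names) of the raid5 deferral: note
 * the blocked device, skip the rest of the pass, and wait only after
 * the per-stripe lock is released. Build with: cc -pthread sched.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct stripe {
    pthread_mutex_t lock;
    bool handle_again;          /* STRIPE_HANDLE analogue */
};

static bool device_blocked;     /* set elsewhere on failure */

static void wait_for_blocked_dev(void)
{
    /* md_wait_for_blocked_rdev() analogue; trivial in this model */
}

static void handle_stripe(struct stripe *sh)
{
    bool blocked = false;

    pthread_mutex_lock(&sh->lock);
    if (device_blocked) {
        blocked = true;
        sh->handle_again = true;    /* re-handle once unblocked */
        goto unlock;                /* skip the rest of the pass */
    }
    puts("process stripe");
unlock:
    pthread_mutex_unlock(&sh->lock);
    if (blocked)
        wait_for_blocked_dev();     /* sleep outside the lock */
}

int main(void)
{
    struct stripe sh = { PTHREAD_MUTEX_INITIALIZER, false };

    handle_stripe(&sh);
    return 0;
}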
+1
include/linux/raid/md.h
···
 extern void md_do_sync(mddev_t *mddev);
 extern void md_new_event(mddev_t *mddev);
 extern void md_allow_write(mddev_t *mddev);
+extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);

 #endif /* CONFIG_MD */
 #endif
+4
include/linux/raid/md_k.h
··· 84 84 #define AllReserved 6 /* If whole device is reserved for 85 85 * one array */ 86 86 #define AutoDetected 7 /* added by auto-detect */ 87 + #define Blocked 8 /* An error occured on an externally 88 + * managed array, don't allow writes 89 + * until it is cleared */ 90 + wait_queue_head_t blocked_wait; 87 91 88 92 int desc_nr; /* descriptor index in the superblock */ 89 93 int raid_disk; /* role of device in array */