Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

dm: enhance internal suspend and resume interface

Rename dm_internal_{suspend,resume} to dm_internal_{suspend,resume}_fast
-- dm-stats will continue using these (now renamed) methods to avoid all
the extra suspend/resume logic that is not needed in order to quickly
flush IO.

Introduce a dm_internal_suspend_noflush() variant that actually calls the
mapped_device's target callbacks -- otherwise target-specific hooks are
avoided (e.g. dm-thin's thin_presuspend and thin_postsuspend). Common
code between dm_internal_{suspend_noflush,resume} and
dm_{suspend,resume} was factored out as __dm_{suspend,resume}.

Update dm_internal_{suspend_noflush,resume} to always take and release
the mapped_device's suspend_lock. Also update dm_{suspend,resume} to be
aware of potential for DM_INTERNAL_SUSPEND_FLAG to be set and respond
accordingly by interruptibly waiting for the DM_INTERNAL_SUSPEND_FLAG to
be cleared. Add lockdep annotation to dm_suspend() and dm_resume().

The existing DM_SUSPEND_FLAG remains unchanged.
DM_INTERNAL_SUSPEND_FLAG is set by dm_internal_suspend_noflush() and
cleared by dm_internal_resume().

Both DM_SUSPEND_FLAG and DM_INTERNAL_SUSPEND_FLAG may be set if a device
was already suspended when dm_internal_suspend_noflush() was called --
this can be thought of as a "nested suspend". A "nested suspend" can
occur with legacy userspace dm-thin code that might suspend all active
thin volumes before suspending the pool for resize.

But otherwise, in the normal dm-thin-pool suspend case moving forward:
the thin-pool will have DM_SUSPEND_FLAG set and all active thins from
that thin-pool will have DM_INTERNAL_SUSPEND_FLAG set.

Also add DM_INTERNAL_SUSPEND_FLAG to the status report. This new
DM_INTERNAL_SUSPEND_FLAG state is being reported to assist with
debugging (e.g. 'dmsetup info' will report an internally suspended
device accordingly).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>

+193 -59
+4 -1
drivers/md/dm-ioctl.c
··· 684 684 int srcu_idx; 685 685 686 686 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | 687 - DM_ACTIVE_PRESENT_FLAG); 687 + DM_ACTIVE_PRESENT_FLAG | DM_INTERNAL_SUSPEND_FLAG); 688 688 689 689 if (dm_suspended_md(md)) 690 690 param->flags |= DM_SUSPEND_FLAG; 691 + 692 + if (dm_suspended_internally_md(md)) 693 + param->flags |= DM_INTERNAL_SUSPEND_FLAG; 691 694 692 695 if (dm_test_deferred_remove_flag(md)) 693 696 param->flags |= DM_DEFERRED_REMOVE;
+1 -1
drivers/md/dm-stats.c
··· 824 824 return 1; 825 825 826 826 id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data, 827 - dm_internal_suspend, dm_internal_resume, md); 827 + dm_internal_suspend_fast, dm_internal_resume_fast, md); 828 828 if (id < 0) 829 829 return id; 830 830
+174 -57
drivers/md/dm.c
··· 19 19 #include <linux/idr.h> 20 20 #include <linux/hdreg.h> 21 21 #include <linux/delay.h> 22 + #include <linux/wait.h> 22 23 23 24 #include <trace/events/block.h> 24 25 ··· 118 117 #define DMF_NOFLUSH_SUSPENDING 5 119 118 #define DMF_MERGE_IS_OPTIONAL 6 120 119 #define DMF_DEFERRED_REMOVE 7 120 + #define DMF_SUSPENDED_INTERNALLY 8 121 121 122 122 /* 123 123 * A dummy definition to make RCU happy. ··· 2720 2718 } 2721 2719 2722 2720 /* 2723 - * We need to be able to change a mapping table under a mounted 2724 - * filesystem. For example we might want to move some data in 2725 - * the background. Before the table can be swapped with 2726 - * dm_bind_table, dm_suspend must be called to flush any in 2727 - * flight bios and ensure that any further io gets deferred. 2728 - */ 2729 - /* 2730 - * Suspend mechanism in request-based dm. 2721 + * If __dm_suspend returns 0, the device is completely quiescent 2722 + * now. There is no request-processing activity. All new requests 2723 + * are being added to md->deferred list. 2731 2724 * 2732 - * 1. Flush all I/Os by lock_fs() if needed. 2733 - * 2. Stop dispatching any I/O by stopping the request_queue. 2734 - * 3. Wait for all in-flight I/Os to be completed or requeued. 2735 - * 2736 - * To abort suspend, start the request_queue. 2725 + * Caller must hold md->suspend_lock 2737 2726 */ 2738 - int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2727 + static int __dm_suspend(struct mapped_device *md, struct dm_table *map, 2728 + unsigned suspend_flags, int interruptible) 2739 2729 { 2740 - struct dm_table *map = NULL; 2741 - int r = 0; 2742 - int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; 2743 - int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 
1 : 0; 2744 - 2745 - mutex_lock(&md->suspend_lock); 2746 - 2747 - if (dm_suspended_md(md)) { 2748 - r = -EINVAL; 2749 - goto out_unlock; 2750 - } 2751 - 2752 - map = rcu_dereference(md->map); 2730 + bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; 2731 + bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; 2732 + int r; 2753 2733 2754 2734 /* 2755 2735 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. ··· 2756 2772 r = lock_fs(md); 2757 2773 if (r) { 2758 2774 dm_table_presuspend_undo_targets(map); 2759 - goto out_unlock; 2775 + return r; 2760 2776 } 2761 2777 } 2762 2778 ··· 2790 2806 * We call dm_wait_for_completion to wait for all existing requests 2791 2807 * to finish. 2792 2808 */ 2793 - r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); 2809 + r = dm_wait_for_completion(md, interruptible); 2794 2810 2795 2811 if (noflush) 2796 2812 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); ··· 2806 2822 2807 2823 unlock_fs(md); 2808 2824 dm_table_presuspend_undo_targets(map); 2809 - goto out_unlock; /* pushback list is already flushed, so skip flush */ 2825 + /* pushback list is already flushed, so skip flush */ 2810 2826 } 2811 2827 2812 - /* 2813 - * If dm_wait_for_completion returned 0, the device is completely 2814 - * quiescent now. There is no request-processing activity. All new 2815 - * requests are being added to md->deferred list. 2816 - */ 2828 + return r; 2829 + } 2830 + 2831 + /* 2832 + * We need to be able to change a mapping table under a mounted 2833 + * filesystem. For example we might want to move some data in 2834 + * the background. Before the table can be swapped with 2835 + * dm_bind_table, dm_suspend must be called to flush any in 2836 + * flight bios and ensure that any further io gets deferred. 2837 + */ 2838 + /* 2839 + * Suspend mechanism in request-based dm. 2840 + * 2841 + * 1. Flush all I/Os by lock_fs() if needed. 2842 + * 2. Stop dispatching any I/O by stopping the request_queue. 2843 + * 3. 
Wait for all in-flight I/Os to be completed or requeued. 2844 + * 2845 + * To abort suspend, start the request_queue. 2846 + */ 2847 + int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2848 + { 2849 + struct dm_table *map = NULL; 2850 + int r = 0; 2851 + 2852 + retry: 2853 + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2854 + 2855 + if (dm_suspended_md(md)) { 2856 + r = -EINVAL; 2857 + goto out_unlock; 2858 + } 2859 + 2860 + if (dm_suspended_internally_md(md)) { 2861 + /* already internally suspended, wait for internal resume */ 2862 + mutex_unlock(&md->suspend_lock); 2863 + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2864 + if (r) 2865 + return r; 2866 + goto retry; 2867 + } 2868 + 2869 + map = rcu_dereference(md->map); 2870 + 2871 + r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE); 2872 + if (r) 2873 + goto out_unlock; 2817 2874 2818 2875 set_bit(DMF_SUSPENDED, &md->flags); 2819 2876 ··· 2865 2840 return r; 2866 2841 } 2867 2842 2868 - int dm_resume(struct mapped_device *md) 2843 + static int __dm_resume(struct mapped_device *md, struct dm_table *map) 2869 2844 { 2870 - int r = -EINVAL; 2871 - struct dm_table *map = NULL; 2872 - 2873 - mutex_lock(&md->suspend_lock); 2874 - if (!dm_suspended_md(md)) 2875 - goto out; 2876 - 2877 - map = rcu_dereference(md->map); 2878 - if (!map || !dm_table_get_size(map)) 2879 - goto out; 2880 - 2881 - r = dm_table_resume_targets(map); 2882 - if (r) 2883 - goto out; 2845 + if (map) { 2846 + int r = dm_table_resume_targets(map); 2847 + if (r) 2848 + return r; 2849 + } 2884 2850 2885 2851 dm_queue_flush(md); 2886 2852 ··· 2884 2868 start_queue(md->queue); 2885 2869 2886 2870 unlock_fs(md); 2871 + 2872 + return 0; 2873 + } 2874 + 2875 + int dm_resume(struct mapped_device *md) 2876 + { 2877 + int r = -EINVAL; 2878 + struct dm_table *map = NULL; 2879 + 2880 + retry: 2881 + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); 2882 + 2883 + if 
(!dm_suspended_md(md)) 2884 + goto out; 2885 + 2886 + if (dm_suspended_internally_md(md)) { 2887 + /* already internally suspended, wait for internal resume */ 2888 + mutex_unlock(&md->suspend_lock); 2889 + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); 2890 + if (r) 2891 + return r; 2892 + goto retry; 2893 + } 2894 + 2895 + map = rcu_dereference(md->map); 2896 + if (!map || !dm_table_get_size(map)) 2897 + goto out; 2898 + 2899 + r = __dm_resume(md, map); 2900 + if (r) 2901 + goto out; 2887 2902 2888 2903 clear_bit(DMF_SUSPENDED, &md->flags); 2889 2904 ··· 2929 2882 * Internal suspend/resume works like userspace-driven suspend. It waits 2930 2883 * until all bios finish and prevents issuing new bios to the target drivers. 2931 2884 * It may be used only from the kernel. 2932 - * 2933 - * Internal suspend holds md->suspend_lock, which prevents interaction with 2934 - * userspace-driven suspend. 2935 2885 */ 2936 2886 2937 - void dm_internal_suspend(struct mapped_device *md) 2887 + static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) 2888 + { 2889 + struct dm_table *map = NULL; 2890 + 2891 + if (dm_suspended_internally_md(md)) 2892 + return; /* nested internal suspend */ 2893 + 2894 + if (dm_suspended_md(md)) { 2895 + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2896 + return; /* nest suspend */ 2897 + } 2898 + 2899 + map = rcu_dereference(md->map); 2900 + 2901 + /* 2902 + * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is 2903 + * supported. Properly supporting a TASK_INTERRUPTIBLE internal suspend 2904 + * would require changing .presuspend to return an error -- avoid this 2905 + * until there is a need for more elaborate variants of internal suspend. 
2906 + */ 2907 + (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE); 2908 + 2909 + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2910 + 2911 + dm_table_postsuspend_targets(map); 2912 + } 2913 + 2914 + static void __dm_internal_resume(struct mapped_device *md) 2915 + { 2916 + if (!dm_suspended_internally_md(md)) 2917 + return; /* resume from nested internal suspend */ 2918 + 2919 + if (dm_suspended_md(md)) 2920 + goto done; /* resume from nested suspend */ 2921 + 2922 + /* 2923 + * NOTE: existing callers don't need to call dm_table_resume_targets 2924 + * (which may fail -- so best to avoid it for now by passing NULL map) 2925 + */ 2926 + (void) __dm_resume(md, NULL); 2927 + 2928 + done: 2929 + clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 2930 + smp_mb__after_atomic(); 2931 + wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); 2932 + } 2933 + 2934 + void dm_internal_suspend_noflush(struct mapped_device *md) 2938 2935 { 2939 2936 mutex_lock(&md->suspend_lock); 2940 - if (dm_suspended_md(md)) 2937 + __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); 2938 + mutex_unlock(&md->suspend_lock); 2939 + } 2940 + EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); 2941 + 2942 + void dm_internal_resume(struct mapped_device *md) 2943 + { 2944 + mutex_lock(&md->suspend_lock); 2945 + __dm_internal_resume(md); 2946 + mutex_unlock(&md->suspend_lock); 2947 + } 2948 + EXPORT_SYMBOL_GPL(dm_internal_resume); 2949 + 2950 + /* 2951 + * Fast variants of internal suspend/resume hold md->suspend_lock, 2952 + * which prevents interaction with userspace-driven suspend. 
2953 + */ 2954 + 2955 + void dm_internal_suspend_fast(struct mapped_device *md) 2956 + { 2957 + mutex_lock(&md->suspend_lock); 2958 + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 2941 2959 return; 2942 2960 2943 2961 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); ··· 3011 2899 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 3012 2900 } 3013 2901 3014 - void dm_internal_resume(struct mapped_device *md) 2902 + void dm_internal_resume_fast(struct mapped_device *md) 3015 2903 { 3016 - if (dm_suspended_md(md)) 2904 + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) 3017 2905 goto done; 3018 2906 3019 2907 dm_queue_flush(md); ··· 3097 2985 int dm_suspended_md(struct mapped_device *md) 3098 2986 { 3099 2987 return test_bit(DMF_SUSPENDED, &md->flags); 2988 + } 2989 + 2990 + int dm_suspended_internally_md(struct mapped_device *md) 2991 + { 2992 + return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); 3100 2993 } 3101 2994 3102 2995 int dm_test_deferred_remove_flag(struct mapped_device *md)
+9
drivers/md/dm.h
··· 130 130 int dm_suspended_md(struct mapped_device *md); 131 131 132 132 /* 133 + * Internal suspend and resume methods. 134 + */ 135 + int dm_suspended_internally_md(struct mapped_device *md); 136 + void dm_internal_suspend_fast(struct mapped_device *md); 137 + void dm_internal_resume_fast(struct mapped_device *md); 138 + void dm_internal_suspend_noflush(struct mapped_device *md); 139 + void dm_internal_resume(struct mapped_device *md); 140 + 141 + /* 133 142 * Test if the device is scheduled for deferred remove. 134 143 */ 135 144 int dm_test_deferred_remove_flag(struct mapped_device *md);
+5
include/uapi/linux/dm-ioctl.h
··· 352 352 */ 353 353 #define DM_DEFERRED_REMOVE (1 << 17) /* In/Out */ 354 354 355 + /* 356 + * If set, the device is suspended internally. 357 + */ 358 + #define DM_INTERNAL_SUSPEND_FLAG (1 << 18) /* Out */ 359 + 355 360 #endif /* _LINUX_DM_IOCTL_H */