Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

md: set MD_CHANGE_PENDING in a atomic region

Some code waits for a metadata update by:

1. flagging that it is needed (MD_CHANGE_DEVS or MD_CHANGE_CLEAN)
2. setting MD_CHANGE_PENDING and waking the management thread
3. waiting for MD_CHANGE_PENDING to be cleared

If the first two are done without locking, the code in md_update_sb()
which checks if it needs to repeat might test if an update is needed
before step 1, then clear MD_CHANGE_PENDING after step 2, resulting
in the wait returning early.

So make sure all places that set MD_CHANGE_PENDING are atomicial, and
bit_clear_unless (suggested by Neil) is introduced for the purpose.

Cc: Martin Kepplinger <martink@posteo.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: <linux-kernel@vger.kernel.org>
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

authored by

Guoqing Jiang and committed by
Shaohua Li
85ad1d13 fe67d19a

+40 -23
+14 -13
drivers/md/md.c
··· 2295 2295 if (mddev_is_clustered(mddev)) { 2296 2296 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 2297 2297 force_change = 1; 2298 + if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2299 + nospares = 1; 2298 2300 ret = md_cluster_ops->metadata_update_start(mddev); 2299 2301 /* Has someone else has updated the sb */ 2300 2302 if (!does_sb_need_changing(mddev)) { 2301 2303 if (ret == 0) 2302 2304 md_cluster_ops->metadata_update_cancel(mddev); 2303 - clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2305 + bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), 2306 + BIT(MD_CHANGE_DEVS) | 2307 + BIT(MD_CHANGE_CLEAN)); 2304 2308 return; 2305 2309 } 2306 2310 } ··· 2438 2434 if (mddev_is_clustered(mddev) && ret == 0) 2439 2435 md_cluster_ops->metadata_update_finish(mddev); 2440 2436 2441 - spin_lock(&mddev->lock); 2442 2437 if (mddev->in_sync != sync_req || 2443 - test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2438 + !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING), 2439 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN))) 2444 2440 /* have to write it out again */ 2445 - spin_unlock(&mddev->lock); 2446 2441 goto repeat; 2447 - } 2448 - clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2449 - spin_unlock(&mddev->lock); 2450 2442 wake_up(&mddev->sb_wait); 2451 2443 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2452 2444 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); ··· 8147 8147 } 8148 8148 } 8149 8149 skip: 8150 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 8151 - 8152 8150 if (mddev_is_clustered(mddev) && 8153 8151 ret == 0) { 8154 8152 /* set CHANGE_PENDING here since maybe another 8155 8153 * update is needed, so other nodes are informed */ 8156 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 8154 + set_mask_bits(&mddev->flags, 0, 8155 + BIT(MD_CHANGE_PENDING) | BIT(MD_CHANGE_DEVS)); 8157 8156 md_wakeup_thread(mddev->thread); 8158 8157 wait_event(mddev->sb_wait, 8159 8158 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 8160 8159 md_cluster_ops->resync_finish(mddev); 8161 - } 8160 + } else 8161 + set_bit(MD_CHANGE_DEVS, &mddev->flags); 8162 8162 8163 8163 spin_lock(&mddev->lock); 8164 8164 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { ··· 8550 8550 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8551 8551 int is_new) 8552 8552 { 8553 + struct mddev *mddev = rdev->mddev; 8553 8554 int rv; 8554 8555 if (is_new) 8555 8556 s += rdev->new_data_offset; ··· 8560 8559 if (rv == 0) { 8561 8560 /* Make sure they get written out promptly */ 8562 8561 sysfs_notify_dirent_safe(rdev->sysfs_state); 8563 - set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); 8564 - set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); 8562 + set_mask_bits(&mddev->flags, 0, 8563 + BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING)); 8565 8564 md_wakeup_thread(rdev->mddev->thread); 8566 8565 return 1; 8567 8566 } else
+2 -2
drivers/md/raid1.c
··· 1474 1474 * if recovery is running, make sure it aborts. 1475 1475 */ 1476 1476 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1477 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 1478 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 1477 + set_mask_bits(&mddev->flags, 0, 1478 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1479 1479 printk(KERN_ALERT 1480 1480 "md/raid1:%s: Disk failure on %s, disabling device.\n" 1481 1481 "md/raid1:%s: Operation continuing on %d devices.\n",
+4 -4
drivers/md/raid10.c
··· 1102 1102 bio->bi_iter.bi_sector < conf->reshape_progress))) { 1103 1103 /* Need to update reshape_position in metadata */ 1104 1104 mddev->reshape_position = conf->reshape_progress; 1105 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 1106 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 1105 + set_mask_bits(&mddev->flags, 0, 1106 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1107 1107 md_wakeup_thread(mddev->thread); 1108 1108 wait_event(mddev->sb_wait, 1109 1109 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); ··· 1591 1591 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1592 1592 set_bit(Blocked, &rdev->flags); 1593 1593 set_bit(Faulty, &rdev->flags); 1594 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 1595 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 1594 + set_mask_bits(&mddev->flags, 0, 1595 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 1596 1596 spin_unlock_irqrestore(&conf->device_lock, flags); 1597 1597 printk(KERN_ALERT 1598 1598 "md/raid10:%s: Disk failure on %s, disabling device.\n"
+2 -2
drivers/md/raid5-cache.c
··· 712 712 * in_teardown check workaround this issue. 713 713 */ 714 714 if (!log->in_teardown) { 715 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 716 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 715 + set_mask_bits(&mddev->flags, 0, 716 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 717 717 md_wakeup_thread(mddev->thread); 718 718 wait_event(mddev->sb_wait, 719 719 !test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
+2 -2
drivers/md/raid5.c
··· 2514 2514 2515 2515 set_bit(Blocked, &rdev->flags); 2516 2516 set_bit(Faulty, &rdev->flags); 2517 - set_bit(MD_CHANGE_DEVS, &mddev->flags); 2518 - set_bit(MD_CHANGE_PENDING, &mddev->flags); 2517 + set_mask_bits(&mddev->flags, 0, 2518 + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); 2519 2519 printk(KERN_ALERT 2520 2520 "md/raid:%s: Disk failure on %s, disabling device.\n" 2521 2521 "md/raid:%s: Operation continuing on %d devices.\n",
+16
include/linux/bitops.h
··· 227 227 }) 228 228 #endif 229 229 230 + #ifndef bit_clear_unless 231 + #define bit_clear_unless(ptr, _clear, _test) \ 232 + ({ \ 233 + const typeof(*ptr) clear = (_clear), test = (_test); \ 234 + typeof(*ptr) old, new; \ 235 + \ 236 + do { \ 237 + old = ACCESS_ONCE(*ptr); \ 238 + new = old & ~clear; \ 239 + } while (!(old & test) && \ 240 + cmpxchg(ptr, old, new) != old); \ 241 + \ 242 + !(old & test); \ 243 + }) 244 + #endif 245 + 230 246 #ifndef find_last_bit 231 247 /** 232 248 * find_last_bit - find the last set bit in a memory region