Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

md: fix deadlock between mddev_suspend() and md_write_start()

If mddev_suspend() races with md_write_start() we can deadlock:
mddev_suspend() waits for the request currently in
md_write_start() to complete its ->make_request() call, while
md_write_start() waits for the metadata to be updated to mark
the array as 'dirty'.
As the metadata updates done by md_check_recovery() only happen
when mddev_lock() can be claimed, and as mddev_suspend() is often
called with that lock held, the two threads wait indefinitely for
each other.
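
The cycle is easier to see in a tiny userspace model (an illustration
only: the names mirror the kernel's, but the locks are pthreads and the
program is deliberately arranged to deadlock; build with cc -pthread):

/*
 * Userspace model of the deadlock -- a sketch, not kernel code.
 * "suspender" plays mddev_suspend() called with mddev_lock held;
 * "writer" plays md_write_start() running inside ->make_request().
 * Each thread waits for a condition only the other can satisfy.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t mddev_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t state_cv = PTHREAD_COND_INITIALIZER;
static int active_io;    /* requests inside ->make_request() */
static int sb_dirty = 1; /* 'dirty' metadata update still pending */

static void *suspender(void *unused)
{
	pthread_mutex_lock(&mddev_lock);   /* caller holds mddev_lock */
	pthread_mutex_lock(&state_lock);
	while (active_io != 0)             /* mddev_suspend(): drain I/O */
		pthread_cond_wait(&state_cv, &state_lock);
	pthread_mutex_unlock(&state_lock);
	pthread_mutex_unlock(&mddev_lock);
	return NULL;
}

static void *writer(void *unused)
{
	pthread_mutex_lock(&state_lock);
	active_io++;                       /* entered ->make_request() */
	while (sb_dirty)                   /* md_write_start(): wait for */
		pthread_cond_wait(&state_cv, &state_lock); /* sb update */
	pthread_mutex_unlock(&state_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&b, NULL, writer, NULL);
	sleep(1);
	pthread_create(&a, NULL, suspender, NULL);
	/*
	 * Clearing sb_dirty (md_check_recovery()'s metadata write) would
	 * need mddev_lock, which suspender holds: neither thread advances.
	 */
	sleep(2);
	printf("deadlocked: active_io=%d sb_dirty=%d\n", active_io, sb_dirty);
	return 0;
}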

We fix this by having md_write_start() abort if mddev_suspend()
is happening, and by having ->make_request() abort if
md_write_start() aborted.
md_make_request() detects this abort, decreases the ->active_io
count, and waits for the suspend to complete before retrying the
request.
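
In outline, the new contract for a personality's ->make_request()
looks like this (a simplified sketch with a hypothetical
example_make_request(); the real per-personality changes are in the
hunks below):

/* Hypothetical personality, sketching the new calling convention. */
static bool example_make_request(struct mddev *mddev, struct bio *bio)
{
	/* A false return from md_write_start() means a suspend is in
	 * progress and the write was not recorded; give up and let
	 * md_make_request() drop ->active_io and retry once the
	 * suspend completes.
	 */
	if (bio_data_dir(bio) == WRITE && !md_write_start(mddev, bio))
		return false;

	/* ... map and submit the bio as before ... */
	return true;	/* bio was accepted (or completed with error) */
}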

Reported-by: Nix <nix@esperi.org.uk>
Fixes: 68866e425be2 ("MD: no sync IO while suspended")
Cc: stable@vger.kernel.org
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>

Authored by NeilBrown, committed by Shaohua Li (commit cc27b0c7, parent 63f700aa)

9 files changed: +56 -35
drivers/md/faulty.c (+3 -2)

···
 	conf->nfaults = n+1;
 }

-static void faulty_make_request(struct mddev *mddev, struct bio *bio)
+static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct faulty_conf *conf = mddev->private;
 	int failit = 0;
···
 			 * just fail immediately
 			 */
 			bio_io_error(bio);
-			return;
+			return true;
 		}

 		if (check_sector(conf, bio->bi_iter.bi_sector,
···
 	bio->bi_bdev = conf->rdev->bdev;

 	generic_make_request(bio);
+	return true;
 }

 static void faulty_status(struct seq_file *seq, struct mddev *mddev)
drivers/md/linear.c (+4 -3)

···
 	kfree(conf);
 }

-static void linear_make_request(struct mddev *mddev, struct bio *bio)
+static bool linear_make_request(struct mddev *mddev, struct bio *bio)
 {
 	char b[BDEVNAME_SIZE];
 	struct dev_info *tmp_dev;
···
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
-		return;
+		return true;
 	}

 	tmp_dev = which_dev(mddev, bio_sector);
···
 		mddev_check_write_zeroes(mddev, bio);
 		generic_make_request(bio);
 	}
-	return;
+	return true;

 out_of_bounds:
 	pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n",
···
 	       (unsigned long long)tmp_dev->rdev->sectors,
 	       (unsigned long long)start_sector);
 	bio_io_error(bio);
+	return true;
 }

 static void linear_status (struct seq_file *seq, struct mddev *mddev)
drivers/md/md.c (+17 -5)

···
 		bio_endio(bio);
 		return BLK_QC_T_NONE;
 	}
-	smp_rmb(); /* Ensure implications of 'active' are visible */
+check_suspended:
 	rcu_read_lock();
 	if (mddev->suspended) {
 		DEFINE_WAIT(__wait);
···
 	sectors = bio_sectors(bio);
 	/* bio could be mergeable after passing to underlayer */
 	bio->bi_opf &= ~REQ_NOMERGE;
-	mddev->pers->make_request(mddev, bio);
+	if (!mddev->pers->make_request(mddev, bio)) {
+		atomic_dec(&mddev->active_io);
+		wake_up(&mddev->sb_wait);
+		goto check_suspended;
+	}

 	cpu = part_stat_lock();
 	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
···
 	if (mddev->suspended++)
 		return;
 	synchronize_rcu();
+	wake_up(&mddev->sb_wait);
 	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
 	mddev->pers->quiesce(mddev, 1);
···
  * If we need to update some array metadata (e.g. 'active' flag
  * in superblock) before writing, schedule a superblock update
  * and wait for it to complete.
+ * A return value of 'false' means that the write wasn't recorded
+ * and cannot proceed as the array is being suspend.
  */
-void md_write_start(struct mddev *mddev, struct bio *bi)
+bool md_write_start(struct mddev *mddev, struct bio *bi)
 {
 	int did_change = 0;
 	if (bio_data_dir(bi) != WRITE)
-		return;
+		return true;

 	BUG_ON(mddev->ro == 1);
 	if (mddev->ro == 2) {
···
 	if (did_change)
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 	wait_event(mddev->sb_wait,
-		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended);
+	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
+		percpu_ref_put(&mddev->writes_pending);
+		return false;
+	}
+	return true;
 }
 EXPORT_SYMBOL(md_write_start);
drivers/md/md.h (+2 -2)

···
 	int level;
 	struct list_head list;
 	struct module *owner;
-	void (*make_request)(struct mddev *mddev, struct bio *bio);
+	bool (*make_request)(struct mddev *mddev, struct bio *bio);
 	int (*run)(struct mddev *mddev);
 	void (*free)(struct mddev *mddev, void *priv);
 	void (*status)(struct seq_file *seq, struct mddev *mddev);
···
 extern void md_check_recovery(struct mddev *mddev);
 extern void md_reap_sync_thread(struct mddev *mddev);
 extern int mddev_init_writes_pending(struct mddev *mddev);
-extern void md_write_start(struct mddev *mddev, struct bio *bi);
+extern bool md_write_start(struct mddev *mddev, struct bio *bi);
 extern void md_write_inc(struct mddev *mddev, struct bio *bi);
 extern void md_write_end(struct mddev *mddev);
 extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
drivers/md/multipath.c (+4 -4)

···
 	rdev_dec_pending(rdev, conf->mddev);
 }

-static void multipath_make_request(struct mddev *mddev, struct bio * bio)
+static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct mpconf *conf = mddev->private;
 	struct multipath_bh * mp_bh;
···
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
-		return;
+		return true;
 	}

 	mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
···
 	if (mp_bh->path < 0) {
 		bio_io_error(bio);
 		mempool_free(mp_bh, conf->pool);
-		return;
+		return true;
 	}
 	multipath = conf->multipaths + mp_bh->path;

···
 	mddev_check_writesame(mddev, &mp_bh->bio);
 	mddev_check_write_zeroes(mddev, &mp_bh->bio);
 	generic_make_request(&mp_bh->bio);
-	return;
+	return true;
 }

 static void multipath_status(struct seq_file *seq, struct mddev *mddev)
drivers/md/raid0.c (+4 -3)

···
 	bio_endio(bio);
 }

-static void raid0_make_request(struct mddev *mddev, struct bio *bio)
+static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct strip_zone *zone;
 	struct md_rdev *tmp_dev;
···

 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
-		return;
+		return true;
 	}

 	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
 		raid0_handle_discard(mddev, bio);
-		return;
+		return true;
 	}

 	bio_sector = bio->bi_iter.bi_sector;
···
 	mddev_check_writesame(mddev, bio);
 	mddev_check_write_zeroes(mddev, bio);
 	generic_make_request(bio);
+	return true;
 }

 static void raid0_status(struct seq_file *seq, struct mddev *mddev)
drivers/md/raid1.c (+7 -4)

···
 	 * Continue immediately if no resync is active currently.
 	 */

-	md_write_start(mddev, bio); /* wait on superblock update early */

 	if ((bio_end_sector(bio) > mddev->suspend_lo &&
 	     bio->bi_iter.bi_sector < mddev->suspend_hi) ||
···
 	wake_up(&conf->wait_barrier);
 }

-static void raid1_make_request(struct mddev *mddev, struct bio *bio)
+static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
 {
 	sector_t sectors;

 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
-		return;
+		return true;
 	}

 	/*
···

 	if (bio_data_dir(bio) == READ)
 		raid1_read_request(mddev, bio, sectors, NULL);
-	else
+	else {
+		if (!md_write_start(mddev,bio))
+			return false;
 		raid1_write_request(mddev, bio, sectors);
+	}
+	return true;
 }

 static void raid1_status(struct seq_file *seq, struct mddev *mddev)
drivers/md/raid10.c (+6 -4)

···
 	sector_t sectors;
 	int max_sectors;

-	md_write_start(mddev, bio);
-
 	/*
 	 * Register the new request and wait if the reconstruction
 	 * thread has put up a bar for new requests.
···
 		raid10_write_request(mddev, bio, r10_bio);
 }

-static void raid10_make_request(struct mddev *mddev, struct bio *bio)
+static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r10conf *conf = mddev->private;
 	sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
···

 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
 		md_flush_request(mddev, bio);
-		return;
+		return true;
 	}
+
+	if (!md_write_start(mddev, bio))
+		return false;

 	/*
 	 * If this request crosses a chunk boundary, we need to split
···

 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
+	return true;
 }

 static void raid10_status(struct seq_file *seq, struct mddev *mddev)
drivers/md/raid5.c (+9 -8)

···
 	last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);

 	bi->bi_next = NULL;
-	md_write_start(mddev, bi);

 	stripe_sectors = conf->chunk_sectors *
 		(conf->raid_disks - conf->max_degraded);
···
 		release_stripe_plug(mddev, sh);
 	}

-	md_write_end(mddev);
 	bio_endio(bi);
 }

-static void raid5_make_request(struct mddev *mddev, struct bio * bi)
+static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 {
 	struct r5conf *conf = mddev->private;
 	int dd_idx;
···
 		int ret = r5l_handle_flush_request(conf->log, bi);

 		if (ret == 0)
-			return;
+			return true;
 		if (ret == -ENODEV) {
 			md_flush_request(mddev, bi);
-			return;
+			return true;
 		}
 		/* ret == -EAGAIN, fallback */
 		/*
···
 		do_flush = bi->bi_opf & REQ_PREFLUSH;
 	}

+	if (!md_write_start(mddev, bi))
+		return false;
 	/*
 	 * If array is degraded, better not do chunk aligned read because
 	 * later we might have to read it again in order to reconstruct
···
 	    mddev->reshape_position == MaxSector) {
 		bi = chunk_aligned_read(mddev, bi);
 		if (!bi)
-			return;
+			return true;
 	}

 	if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
 		make_discard_request(mddev, bi);
-		return;
+		md_write_end(mddev);
+		return true;
 	}

 	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
-	md_write_start(mddev, bi);

 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
···
 	if (rw == WRITE)
 		md_write_end(mddev);
 	bio_endio(bi);
+	return true;
 }

 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);