Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD updates from Shaohua Li:
"A few fixes of MD for this merge window. Mostly bug fixes:

- raid5 stripe batch fix from Amy

- Read error handling for raid1 FailFast device from Gioh

- raid10 recovery NULL pointer dereference fix from Guoqing

- Support write hint for raid5 stripe cache from Mariusz

- Fixes for device hot add/remove from Neil and Yufen

- Improve flush bio scalability from Xiao"

* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
MD: fix lock contention for flush bios
md/raid5: Assigning NULL to sh->batch_head before testing bit R5_Overlap of a stripe
md/raid1: add error handling of read error from FailFast device
md: fix NULL dereference of mddev->pers in remove_and_add_spares()
raid5: copy write hint from origin bio to stripe
md: fix two problems with setting the "re-add" device state.
raid10: check bio in r10buf_pool_free to avoid NULL pointer dereference
md: fix an error code format and remove unused bio_sector

+149 -71
+116 -55
drivers/md/md.c
··· 132 132 mddev->sync_speed_max : sysctl_speed_limit_max; 133 133 } 134 134 135 + static void * flush_info_alloc(gfp_t gfp_flags, void *data) 136 + { 137 + return kzalloc(sizeof(struct flush_info), gfp_flags); 138 + } 139 + static void flush_info_free(void *flush_info, void *data) 140 + { 141 + kfree(flush_info); 142 + } 143 + 144 + static void * flush_bio_alloc(gfp_t gfp_flags, void *data) 145 + { 146 + return kzalloc(sizeof(struct flush_bio), gfp_flags); 147 + } 148 + static void flush_bio_free(void *flush_bio, void *data) 149 + { 150 + kfree(flush_bio); 151 + } 152 + 135 153 static struct ctl_table_header *raid_table_header; 136 154 137 155 static struct ctl_table raid_table[] = { ··· 432 414 /* 433 415 * Generic flush handling for md 434 416 */ 435 - 436 - static void md_end_flush(struct bio *bio) 417 + static void submit_flushes(struct work_struct *ws) 437 418 { 438 - struct md_rdev *rdev = bio->bi_private; 439 - struct mddev *mddev = rdev->mddev; 419 + struct flush_info *fi = container_of(ws, struct flush_info, flush_work); 420 + struct mddev *mddev = fi->mddev; 421 + struct bio *bio = fi->bio; 422 + 423 + bio->bi_opf &= ~REQ_PREFLUSH; 424 + md_handle_request(mddev, bio); 425 + 426 + mempool_free(fi, mddev->flush_pool); 427 + } 428 + 429 + static void md_end_flush(struct bio *fbio) 430 + { 431 + struct flush_bio *fb = fbio->bi_private; 432 + struct md_rdev *rdev = fb->rdev; 433 + struct flush_info *fi = fb->fi; 434 + struct bio *bio = fi->bio; 435 + struct mddev *mddev = fi->mddev; 440 436 441 437 rdev_dec_pending(rdev, mddev); 442 438 443 - if (atomic_dec_and_test(&mddev->flush_pending)) { 444 - /* The pre-request flush has finished */ 445 - queue_work(md_wq, &mddev->flush_work); 439 + if (atomic_dec_and_test(&fi->flush_pending)) { 440 + if (bio->bi_iter.bi_size == 0) 441 + /* an empty barrier - all done */ 442 + bio_endio(bio); 443 + else { 444 + INIT_WORK(&fi->flush_work, submit_flushes); 445 + queue_work(md_wq, &fi->flush_work); 446 + } 446 447 } 447 - 
bio_put(bio); 448 + 449 + mempool_free(fb, mddev->flush_bio_pool); 450 + bio_put(fbio); 448 451 } 449 452 450 - static void md_submit_flush_data(struct work_struct *ws); 451 - 452 - static void submit_flushes(struct work_struct *ws) 453 + void md_flush_request(struct mddev *mddev, struct bio *bio) 453 454 { 454 - struct mddev *mddev = container_of(ws, struct mddev, flush_work); 455 455 struct md_rdev *rdev; 456 + struct flush_info *fi; 456 457 457 - INIT_WORK(&mddev->flush_work, md_submit_flush_data); 458 - atomic_set(&mddev->flush_pending, 1); 458 + fi = mempool_alloc(mddev->flush_pool, GFP_NOIO); 459 + 460 + fi->bio = bio; 461 + fi->mddev = mddev; 462 + atomic_set(&fi->flush_pending, 1); 463 + 459 464 rcu_read_lock(); 460 465 rdev_for_each_rcu(rdev, mddev) 461 466 if (rdev->raid_disk >= 0 && ··· 488 447 * we reclaim rcu_read_lock 489 448 */ 490 449 struct bio *bi; 450 + struct flush_bio *fb; 491 451 atomic_inc(&rdev->nr_pending); 492 452 atomic_inc(&rdev->nr_pending); 493 453 rcu_read_unlock(); 454 + 455 + fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO); 456 + fb->fi = fi; 457 + fb->rdev = rdev; 458 + 494 459 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); 495 - bi->bi_end_io = md_end_flush; 496 - bi->bi_private = rdev; 497 460 bio_set_dev(bi, rdev->bdev); 461 + bi->bi_end_io = md_end_flush; 462 + bi->bi_private = fb; 498 463 bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 499 - atomic_inc(&mddev->flush_pending); 464 + 465 + atomic_inc(&fi->flush_pending); 500 466 submit_bio(bi); 467 + 501 468 rcu_read_lock(); 502 469 rdev_dec_pending(rdev, mddev); 503 470 } 504 471 rcu_read_unlock(); 505 - if (atomic_dec_and_test(&mddev->flush_pending)) 506 - queue_work(md_wq, &mddev->flush_work); 507 - } 508 472 509 - static void md_submit_flush_data(struct work_struct *ws) 510 - { 511 - struct mddev *mddev = container_of(ws, struct mddev, flush_work); 512 - struct bio *bio = mddev->flush_bio; 513 - 514 - /* 515 - * must reset flush_bio before calling into md_handle_request to avoid a 
516 - * deadlock, because other bios passed md_handle_request suspend check 517 - * could wait for this and below md_handle_request could wait for those 518 - * bios because of suspend check 519 - */ 520 - mddev->flush_bio = NULL; 521 - wake_up(&mddev->sb_wait); 522 - 523 - if (bio->bi_iter.bi_size == 0) 524 - /* an empty barrier - all done */ 525 - bio_endio(bio); 526 - else { 527 - bio->bi_opf &= ~REQ_PREFLUSH; 528 - md_handle_request(mddev, bio); 473 + if (atomic_dec_and_test(&fi->flush_pending)) { 474 + if (bio->bi_iter.bi_size == 0) 475 + /* an empty barrier - all done */ 476 + bio_endio(bio); 477 + else { 478 + INIT_WORK(&fi->flush_work, submit_flushes); 479 + queue_work(md_wq, &fi->flush_work); 480 + } 529 481 } 530 - } 531 - 532 - void md_flush_request(struct mddev *mddev, struct bio *bio) 533 - { 534 - spin_lock_irq(&mddev->lock); 535 - wait_event_lock_irq(mddev->sb_wait, 536 - !mddev->flush_bio, 537 - mddev->lock); 538 - mddev->flush_bio = bio; 539 - spin_unlock_irq(&mddev->lock); 540 - 541 - INIT_WORK(&mddev->flush_work, submit_flushes); 542 - queue_work(md_wq, &mddev->flush_work); 543 482 } 544 483 EXPORT_SYMBOL(md_flush_request); 545 484 ··· 567 546 atomic_set(&mddev->openers, 0); 568 547 atomic_set(&mddev->active_io, 0); 569 548 spin_lock_init(&mddev->lock); 570 - atomic_set(&mddev->flush_pending, 0); 571 549 init_waitqueue_head(&mddev->sb_wait); 572 550 init_waitqueue_head(&mddev->recovery_wait); 573 551 mddev->reshape_position = MaxSector; ··· 2864 2844 err = 0; 2865 2845 } 2866 2846 } else if (cmd_match(buf, "re-add")) { 2867 - if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) { 2847 + if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) && 2848 + rdev->saved_raid_disk >= 0) { 2868 2849 /* clear_bit is performed _after_ all the devices 2869 2850 * have their local Faulty bit cleared. 
If any writes 2870 2851 * happen in the meantime in the local node, they ··· 5520 5499 if (err) 5521 5500 return err; 5522 5501 } 5502 + if (mddev->flush_pool == NULL) { 5503 + mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc, 5504 + flush_info_free, mddev); 5505 + if (!mddev->flush_pool) { 5506 + err = -ENOMEM; 5507 + goto abort; 5508 + } 5509 + } 5510 + if (mddev->flush_bio_pool == NULL) { 5511 + mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc, 5512 + flush_bio_free, mddev); 5513 + if (!mddev->flush_bio_pool) { 5514 + err = -ENOMEM; 5515 + goto abort; 5516 + } 5517 + } 5523 5518 5524 5519 spin_lock(&pers_lock); 5525 5520 pers = find_pers(mddev->level, mddev->clevel); ··· 5691 5654 sysfs_notify_dirent_safe(mddev->sysfs_action); 5692 5655 sysfs_notify(&mddev->kobj, NULL, "degraded"); 5693 5656 return 0; 5657 + 5658 + abort: 5659 + if (mddev->flush_bio_pool) { 5660 + mempool_destroy(mddev->flush_bio_pool); 5661 + mddev->flush_bio_pool = NULL; 5662 + } 5663 + if (mddev->flush_pool){ 5664 + mempool_destroy(mddev->flush_pool); 5665 + mddev->flush_pool = NULL; 5666 + } 5667 + 5668 + return err; 5694 5669 } 5695 5670 EXPORT_SYMBOL_GPL(md_run); 5696 5671 ··· 5913 5864 * This is called from dm-raid 5914 5865 */ 5915 5866 __md_stop(mddev); 5867 + if (mddev->flush_bio_pool) { 5868 + mempool_destroy(mddev->flush_bio_pool); 5869 + mddev->flush_bio_pool = NULL; 5870 + } 5871 + if (mddev->flush_pool) { 5872 + mempool_destroy(mddev->flush_pool); 5873 + mddev->flush_pool = NULL; 5874 + } 5916 5875 bioset_exit(&mddev->bio_set); 5917 5876 bioset_exit(&mddev->sync_set); 5918 5877 } ··· 6550 6493 { 6551 6494 char b[BDEVNAME_SIZE]; 6552 6495 struct md_rdev *rdev; 6496 + 6497 + if (!mddev->pers) 6498 + return -ENODEV; 6553 6499 6554 6500 rdev = find_rdev(mddev, dev); 6555 6501 if (!rdev) ··· 8671 8611 if (mddev->pers->hot_remove_disk( 8672 8612 mddev, rdev) == 0) { 8673 8613 sysfs_unlink_rdev(mddev, rdev); 8614 + rdev->saved_raid_disk = 
rdev->raid_disk; 8674 8615 rdev->raid_disk = -1; 8675 8616 removed++; 8676 8617 }
+15 -7
drivers/md/md.h
··· 252 252 MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ 253 253 }; 254 254 255 + #define NR_FLUSH_INFOS 8 256 + #define NR_FLUSH_BIOS 64 257 + struct flush_info { 258 + struct bio *bio; 259 + struct mddev *mddev; 260 + struct work_struct flush_work; 261 + atomic_t flush_pending; 262 + }; 263 + struct flush_bio { 264 + struct flush_info *fi; 265 + struct md_rdev *rdev; 266 + }; 267 + 255 268 struct mddev { 256 269 void *private; 257 270 struct md_personality *pers; ··· 470 457 * metadata and bitmap writes 471 458 */ 472 459 473 - /* Generic flush handling. 474 - * The last to finish preflush schedules a worker to submit 475 - * the rest of the request (without the REQ_PREFLUSH flag). 476 - */ 477 - struct bio *flush_bio; 478 - atomic_t flush_pending; 479 - struct work_struct flush_work; 460 + mempool_t *flush_pool; 461 + mempool_t *flush_bio_pool; 480 462 struct work_struct event_work; /* used by dm to report failure event */ 481 463 void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); 482 464 struct md_cluster_info *cluster_info;
+2 -2
drivers/md/raid1.c
··· 2449 2449 struct mddev *mddev = conf->mddev; 2450 2450 struct bio *bio; 2451 2451 struct md_rdev *rdev; 2452 - sector_t bio_sector; 2453 2452 2454 2453 clear_bit(R1BIO_ReadError, &r1_bio->state); 2455 2454 /* we got a read error. Maybe the drive is bad. Maybe just ··· 2461 2462 */ 2462 2463 2463 2464 bio = r1_bio->bios[r1_bio->read_disk]; 2464 - bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector; 2465 2465 bio_put(bio); 2466 2466 r1_bio->bios[r1_bio->read_disk] = NULL; 2467 2467 ··· 2471 2473 fix_read_error(conf, r1_bio->read_disk, 2472 2474 r1_bio->sector, r1_bio->sectors); 2473 2475 unfreeze_array(conf); 2476 + } else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) { 2477 + md_error(mddev, rdev); 2474 2478 } else { 2475 2479 r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED; 2476 2480 }
+6 -4
drivers/md/raid10.c
··· 255 255 for (j = conf->copies; j--; ) { 256 256 struct bio *bio = r10bio->devs[j].bio; 257 257 258 - rp = get_resync_pages(bio); 259 - resync_free_pages(rp); 260 - bio_put(bio); 258 + if (bio) { 259 + rp = get_resync_pages(bio); 260 + resync_free_pages(rp); 261 + bio_put(bio); 262 + } 261 263 262 264 bio = r10bio->devs[j].repl_bio; 263 265 if (bio) ··· 2364 2362 { 2365 2363 int sect = 0; /* Offset from r10_bio->sector */ 2366 2364 int sectors = r10_bio->sectors; 2367 - struct md_rdev*rdev; 2365 + struct md_rdev *rdev; 2368 2366 int max_read_errors = atomic_read(&mddev->max_corr_read_errors); 2369 2367 int d = r10_bio->devs[r10_bio->read_slot].devnum; 2370 2368
+9 -3
drivers/md/raid5.c
··· 1139 1139 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1140 1140 bi->bi_io_vec[0].bv_offset = 0; 1141 1141 bi->bi_iter.bi_size = STRIPE_SIZE; 1142 + bi->bi_write_hint = sh->dev[i].write_hint; 1143 + if (!rrdev) 1144 + sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; 1142 1145 /* 1143 1146 * If this is discard request, set bi_vcnt 0. We don't 1144 1147 * want to confuse SCSI because SCSI will replace payload ··· 1193 1190 rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1194 1191 rbi->bi_io_vec[0].bv_offset = 0; 1195 1192 rbi->bi_iter.bi_size = STRIPE_SIZE; 1193 + rbi->bi_write_hint = sh->dev[i].write_hint; 1194 + sh->dev[i].write_hint = RWF_WRITE_LIFE_NOT_SET; 1196 1195 /* 1197 1196 * If this is discard request, set bi_vcnt 0. We don't 1198 1197 * want to confuse SCSI because SCSI will replace payload ··· 3209 3204 (unsigned long long)sh->sector); 3210 3205 3211 3206 spin_lock_irq(&sh->stripe_lock); 3207 + sh->dev[dd_idx].write_hint = bi->bi_write_hint; 3212 3208 /* Don't allow new IO added to stripes in batch list */ 3213 3209 if (sh->batch_head) 3214 3210 goto overlap; ··· 4620 4614 4621 4615 sh->check_state = head_sh->check_state; 4622 4616 sh->reconstruct_state = head_sh->reconstruct_state; 4617 + spin_lock_irq(&sh->stripe_lock); 4618 + sh->batch_head = NULL; 4619 + spin_unlock_irq(&sh->stripe_lock); 4623 4620 for (i = 0; i < sh->disks; i++) { 4624 4621 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4625 4622 do_wakeup = 1; 4626 4623 sh->dev[i].flags = head_sh->dev[i].flags & 4627 4624 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4628 4625 } 4629 - spin_lock_irq(&sh->stripe_lock); 4630 - sh->batch_head = NULL; 4631 - spin_unlock_irq(&sh->stripe_lock); 4632 4626 if (handle_flags == 0 || 4633 4627 sh->state & handle_flags) 4634 4628 set_bit(STRIPE_HANDLE, &sh->state);
+1
drivers/md/raid5.h
··· 257 257 sector_t sector; /* sector of this page */ 258 258 unsigned long flags; 259 259 u32 log_checksum; 260 + unsigned short write_hint; 260 261 } dev[1]; /* allocated with extra space depending of RAID geometry */ 261 262 }; 262 263