Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-next' of git://neil.brown.name/md

Pull md updates from NeilBrown.

* 'for-next' of git://neil.brown.name/md:
DM RAID: Add support for MD RAID10
md/RAID1: Add missing case for attempting to repair known bad blocks.
md/raid5: For odirect-write performance, do not set STRIPE_PREREAD_ACTIVE.
md/raid1: don't abort a resync on the first badblock.
md: remove duplicated test on ->openers when calling do_md_stop()
raid5: Add R5_ReadNoMerge flag which prevent bio from merging at block layer
md/raid1: prevent merging too large request
md/raid1: read balance chooses idlest disk for SSD
md/raid1: make sequential read detection per disk based
MD RAID10: Export md_raid10_congested
MD: Move macros from raid1*.h to raid1*.c
MD RAID1: rename mirror_info structure
MD RAID10: rename mirror_info structure
MD RAID10: Fix compiler warning.
raid5: add a per-stripe lock
raid5: remove unnecessary bitmap write optimization
raid5: lockless access raid5 overrided bi_phys_segments
raid5: reduce chance release_stripe() taking device_lock

+426 -219
+26
Documentation/device-mapper/dm-raid.txt
··· 27 27 - rotating parity N (right-to-left) with data restart 28 28 raid6_nc RAID6 N continue 29 29 - rotating parity N (right-to-left) with data continuation 30 + raid10 Various RAID10 inspired algorithms chosen by additional params 31 + - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') 32 + - RAID1E: Integrated Adjacent Stripe Mirroring 33 + - and other similar RAID10 variants 30 34 31 35 Reference: Chapter 4 of 32 36 http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf ··· 62 58 The region_size multiplied by the number of regions is the 63 59 logical size of the array. The bitmap records the device 64 60 synchronisation state for each region. 61 + 62 + [raid10_copies <# copies>] 63 + [raid10_format near] 64 + These two options are used to alter the default layout of 65 + a RAID10 configuration. The number of copies can be 66 + specified, but the default is 2. There are other variations 67 + to how the copies are laid down - the default and only current 68 + option is "near". Near copies are what most people think of 69 + with respect to mirroring. If these options are left 70 + unspecified, or 'raid10_copies 2' and/or 'raid10_format near' 71 + are given, then the layouts for 2, 3 and 4 devices are: 72 + 2 drives 3 drives 4 drives 73 + -------- ---------- -------------- 74 + A1 A1 A1 A1 A2 A1 A1 A2 A2 75 + A2 A2 A2 A3 A3 A3 A3 A4 A4 76 + A3 A3 A4 A4 A5 A5 A5 A6 A6 77 + A4 A4 A5 A6 A6 A7 A7 A8 A8 78 + .. .. .. .. .. .. .. .. .. 79 + The 2-device layout is equivalent to 2-way RAID1. The 4-device 80 + layout is what a traditional RAID10 would look like. The 81 + 3-device layout is what might be called a 'RAID1E - Integrated 82 + Adjacent Stripe Mirroring'. 65 83 66 84 <#raid_devs>: The number of devices composing the array. 67 85 Each device consists of two entries. The first is the device
+90 -5
drivers/md/dm-raid.c
··· 11 11 #include "md.h" 12 12 #include "raid1.h" 13 13 #include "raid5.h" 14 + #include "raid10.h" 14 15 #include "bitmap.h" 15 16 16 17 #include <linux/device-mapper.h> ··· 53 52 #define DMPF_MAX_RECOVERY_RATE 0x20 54 53 #define DMPF_MAX_WRITE_BEHIND 0x40 55 54 #define DMPF_STRIPE_CACHE 0x80 56 - #define DMPF_REGION_SIZE 0X100 55 + #define DMPF_REGION_SIZE 0x100 56 + #define DMPF_RAID10_COPIES 0x200 57 + #define DMPF_RAID10_FORMAT 0x400 58 + 57 59 struct raid_set { 58 60 struct dm_target *ti; 59 61 ··· 80 76 const unsigned algorithm; /* RAID algorithm. */ 81 77 } raid_types[] = { 82 78 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, 79 + {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */}, 83 80 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, 84 81 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, 85 82 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, ··· 90 85 {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, 91 86 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} 92 87 }; 88 + 89 + static unsigned raid10_md_layout_to_copies(int layout) 90 + { 91 + return layout & 0xFF; 92 + } 93 + 94 + static int raid10_format_to_md_layout(char *format, unsigned copies) 95 + { 96 + /* 1 "far" copy, and 'copies' "near" copies */ 97 + return (1 << 8) | (copies & 0xFF); 98 + } 93 99 94 100 static struct raid_type *get_raid_type(char *name) 95 101 { ··· 355 339 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) 356 340 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs 357 341 * [region_size <sectors>] Defines granularity of bitmap 342 + * 343 + * RAID10-only options: 344 + * [raid10_copies <# copies>] Number of copies. (Default: 2) 345 + * [raid10_format <near>] Layout algorithm. 
(Default: near) 358 346 */ 359 347 static int parse_raid_params(struct raid_set *rs, char **argv, 360 348 unsigned num_raid_params) 361 349 { 350 + char *raid10_format = "near"; 351 + unsigned raid10_copies = 2; 362 352 unsigned i, rebuild_cnt = 0; 363 353 unsigned long value, region_size = 0; 364 354 sector_t sectors_per_dev = rs->ti->len; ··· 438 416 } 439 417 440 418 key = argv[i++]; 419 + 420 + /* Parameters that take a string value are checked here. */ 421 + if (!strcasecmp(key, "raid10_format")) { 422 + if (rs->raid_type->level != 10) { 423 + rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; 424 + return -EINVAL; 425 + } 426 + if (strcmp("near", argv[i])) { 427 + rs->ti->error = "Invalid 'raid10_format' value given"; 428 + return -EINVAL; 429 + } 430 + raid10_format = argv[i]; 431 + rs->print_flags |= DMPF_RAID10_FORMAT; 432 + continue; 433 + } 434 + 441 435 if (strict_strtoul(argv[i], 10, &value) < 0) { 442 436 rs->ti->error = "Bad numerical argument given in raid params"; 443 437 return -EINVAL; 444 438 } 445 439 440 + /* Parameters that take a numeric value are checked here */ 446 441 if (!strcasecmp(key, "rebuild")) { 447 442 rebuild_cnt++; 448 443 ··· 478 439 return -EINVAL; 479 440 } 480 441 break; 442 + case 10: 481 443 default: 482 444 DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); 483 445 rs->ti->error = "Rebuild not supported for this RAID type"; ··· 535 495 */ 536 496 value /= 2; 537 497 538 - if (rs->raid_type->level < 5) { 498 + if ((rs->raid_type->level != 5) && 499 + (rs->raid_type->level != 6)) { 539 500 rs->ti->error = "Inappropriate argument: stripe_cache"; 540 501 return -EINVAL; 541 502 } ··· 561 520 } else if (!strcasecmp(key, "region_size")) { 562 521 rs->print_flags |= DMPF_REGION_SIZE; 563 522 region_size = value; 523 + } else if (!strcasecmp(key, "raid10_copies") && 524 + (rs->raid_type->level == 10)) { 525 + if ((value < 2) || (value > 0xFF)) { 526 + rs->ti->error = "Bad value 
for 'raid10_copies'"; 527 + return -EINVAL; 528 + } 529 + rs->print_flags |= DMPF_RAID10_COPIES; 530 + raid10_copies = value; 564 531 } else { 565 532 DMERR("Unable to parse RAID parameter: %s", key); 566 533 rs->ti->error = "Unable to parse RAID parameters"; ··· 587 538 if (dm_set_target_max_io_len(rs->ti, max_io_len)) 588 539 return -EINVAL; 589 540 590 - if ((rs->raid_type->level > 1) && 591 - sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) { 541 + if (rs->raid_type->level == 10) { 542 + if (raid10_copies > rs->md.raid_disks) { 543 + rs->ti->error = "Not enough devices to satisfy specification"; 544 + return -EINVAL; 545 + } 546 + 547 + /* (Len * #mirrors) / #devices */ 548 + sectors_per_dev = rs->ti->len * raid10_copies; 549 + sector_div(sectors_per_dev, rs->md.raid_disks); 550 + 551 + rs->md.layout = raid10_format_to_md_layout(raid10_format, 552 + raid10_copies); 553 + rs->md.new_layout = rs->md.layout; 554 + } else if ((rs->raid_type->level > 1) && 555 + sector_div(sectors_per_dev, 556 + (rs->md.raid_disks - rs->raid_type->parity_devs))) { 592 557 rs->ti->error = "Target length not divisible by number of data devices"; 593 558 return -EINVAL; 594 559 } ··· 628 565 629 566 if (rs->raid_type->level == 1) 630 567 return md_raid1_congested(&rs->md, bits); 568 + 569 + if (rs->raid_type->level == 10) 570 + return md_raid10_congested(&rs->md, bits); 631 571 632 572 return md_raid5_congested(&rs->md, bits); 633 573 } ··· 950 884 case 6: 951 885 redundancy = rs->raid_type->parity_devs; 952 886 break; 887 + case 10: 888 + redundancy = raid10_md_layout_to_copies(mddev->layout) - 1; 889 + break; 953 890 default: 954 891 ti->error = "Unknown RAID type"; 955 892 return -EINVAL; ··· 1118 1049 goto bad; 1119 1050 } 1120 1051 1052 + if (ti->len != rs->md.array_sectors) { 1053 + ti->error = "Array size does not match requested target length"; 1054 + ret = -EINVAL; 1055 + goto size_mismatch; 1056 + } 1121 1057 rs->callbacks.congested_fn = 
raid_is_congested; 1122 1058 dm_table_add_target_callbacks(ti->table, &rs->callbacks); 1123 1059 1124 1060 mddev_suspend(&rs->md); 1125 1061 return 0; 1126 1062 1063 + size_mismatch: 1064 + md_stop(&rs->md); 1127 1065 bad: 1128 1066 context_free(rs); 1129 1067 ··· 1279 1203 DMEMIT(" region_size %lu", 1280 1204 rs->md.bitmap_info.chunksize >> 9); 1281 1205 1206 + if (rs->print_flags & DMPF_RAID10_COPIES) 1207 + DMEMIT(" raid10_copies %u", 1208 + raid10_md_layout_to_copies(rs->md.layout)); 1209 + 1210 + if (rs->print_flags & DMPF_RAID10_FORMAT) 1211 + DMEMIT(" raid10_format near"); 1212 + 1282 1213 DMEMIT(" %d", rs->md.raid_disks); 1283 1214 for (i = 0; i < rs->md.raid_disks; i++) { 1284 1215 if (rs->dev[i].meta_dev) ··· 1360 1277 1361 1278 static struct target_type raid_target = { 1362 1279 .name = "raid", 1363 - .version = {1, 2, 0}, 1280 + .version = {1, 3, 0}, 1364 1281 .module = THIS_MODULE, 1365 1282 .ctr = raid_ctr, 1366 1283 .dtr = raid_dtr, ··· 1387 1304 module_exit(dm_raid_exit); 1388 1305 1389 1306 MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); 1307 + MODULE_ALIAS("dm-raid1"); 1308 + MODULE_ALIAS("dm-raid10"); 1390 1309 MODULE_ALIAS("dm-raid4"); 1391 1310 MODULE_ALIAS("dm-raid5"); 1392 1311 MODULE_ALIAS("dm-raid6");
+2 -6
drivers/md/md.c
··· 3942 3942 break; 3943 3943 case clear: 3944 3944 /* stopping an active array */ 3945 - if (atomic_read(&mddev->openers) > 0) 3946 - return -EBUSY; 3947 3945 err = do_md_stop(mddev, 0, NULL); 3948 3946 break; 3949 3947 case inactive: 3950 3948 /* stopping an active array */ 3951 - if (mddev->pers) { 3952 - if (atomic_read(&mddev->openers) > 0) 3953 - return -EBUSY; 3949 + if (mddev->pers) 3954 3950 err = do_md_stop(mddev, 2, NULL); 3955 - } else 3951 + else 3956 3952 err = 0; /* already inactive */ 3957 3953 break; 3958 3954 case suspended:
+121 -43
drivers/md/raid1.c
··· 46 46 */ 47 47 #define NR_RAID1_BIOS 256 48 48 49 + /* when we get a read error on a read-only array, we redirect to another 50 + * device without failing the first device, or trying to over-write to 51 + * correct the read error. To keep track of bad blocks on a per-bio 52 + * level, we store IO_BLOCKED in the appropriate 'bios' pointer 53 + */ 54 + #define IO_BLOCKED ((struct bio *)1) 55 + /* When we successfully write to a known bad-block, we need to remove the 56 + * bad-block marking which must be done from process context. So we record 57 + * the success by setting devs[n].bio to IO_MADE_GOOD 58 + */ 59 + #define IO_MADE_GOOD ((struct bio *)2) 60 + 61 + #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) 62 + 49 63 /* When there are this many requests queue to be written by 50 64 * the raid1 thread, we become 'congested' to provide back-pressure 51 65 * for writeback. ··· 497 483 const sector_t this_sector = r1_bio->sector; 498 484 int sectors; 499 485 int best_good_sectors; 500 - int start_disk; 501 - int best_disk; 502 - int i; 486 + int best_disk, best_dist_disk, best_pending_disk; 487 + int has_nonrot_disk; 488 + int disk; 503 489 sector_t best_dist; 490 + unsigned int min_pending; 504 491 struct md_rdev *rdev; 505 492 int choose_first; 493 + int choose_next_idle; 506 494 507 495 rcu_read_lock(); 508 496 /* ··· 515 499 retry: 516 500 sectors = r1_bio->sectors; 517 501 best_disk = -1; 502 + best_dist_disk = -1; 518 503 best_dist = MaxSector; 504 + best_pending_disk = -1; 505 + min_pending = UINT_MAX; 519 506 best_good_sectors = 0; 507 + has_nonrot_disk = 0; 508 + choose_next_idle = 0; 520 509 521 510 if (conf->mddev->recovery_cp < MaxSector && 522 - (this_sector + sectors >= conf->next_resync)) { 511 + (this_sector + sectors >= conf->next_resync)) 523 512 choose_first = 1; 524 - start_disk = 0; 525 - } else { 513 + else 526 514 choose_first = 0; 527 - start_disk = conf->last_used; 528 - } 529 515 530 - for (i = 0 ; i < conf->raid_disks * 2 ; i++) { 516 
+ for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { 531 517 sector_t dist; 532 518 sector_t first_bad; 533 519 int bad_sectors; 534 - 535 - int disk = start_disk + i; 536 - if (disk >= conf->raid_disks * 2) 537 - disk -= conf->raid_disks * 2; 520 + unsigned int pending; 521 + bool nonrot; 538 522 539 523 rdev = rcu_dereference(conf->mirrors[disk].rdev); 540 524 if (r1_bio->bios[disk] == IO_BLOCKED ··· 593 577 } else 594 578 best_good_sectors = sectors; 595 579 580 + nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); 581 + has_nonrot_disk |= nonrot; 582 + pending = atomic_read(&rdev->nr_pending); 596 583 dist = abs(this_sector - conf->mirrors[disk].head_position); 597 - if (choose_first 598 - /* Don't change to another disk for sequential reads */ 599 - || conf->next_seq_sect == this_sector 600 - || dist == 0 601 - /* If device is idle, use it */ 602 - || atomic_read(&rdev->nr_pending) == 0) { 584 + if (choose_first) { 603 585 best_disk = disk; 604 586 break; 605 587 } 588 + /* Don't change to another disk for sequential reads */ 589 + if (conf->mirrors[disk].next_seq_sect == this_sector 590 + || dist == 0) { 591 + int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; 592 + struct raid1_info *mirror = &conf->mirrors[disk]; 593 + 594 + best_disk = disk; 595 + /* 596 + * If buffered sequential IO size exceeds optimal 597 + * iosize, check if there is idle disk. If yes, choose 598 + * the idle disk. read_balance could already choose an 599 + * idle disk before noticing it's a sequential IO in 600 + * this disk. This doesn't matter because this disk 601 + * will idle, next time it will be utilized after the 602 + * first disk has IO size exceeds optimal iosize. In 603 + * this way, iosize of the first disk will be optimal 604 + * iosize at least. iosize of the second disk might be 605 + * small, but not a big deal since when the second disk 606 + * starts IO, the first disk is likely still busy. 
607 + */ 608 + if (nonrot && opt_iosize > 0 && 609 + mirror->seq_start != MaxSector && 610 + mirror->next_seq_sect > opt_iosize && 611 + mirror->next_seq_sect - opt_iosize >= 612 + mirror->seq_start) { 613 + choose_next_idle = 1; 614 + continue; 615 + } 616 + break; 617 + } 618 + /* If device is idle, use it */ 619 + if (pending == 0) { 620 + best_disk = disk; 621 + break; 622 + } 623 + 624 + if (choose_next_idle) 625 + continue; 626 + 627 + if (min_pending > pending) { 628 + min_pending = pending; 629 + best_pending_disk = disk; 630 + } 631 + 606 632 if (dist < best_dist) { 607 633 best_dist = dist; 608 - best_disk = disk; 634 + best_dist_disk = disk; 609 635 } 636 + } 637 + 638 + /* 639 + * If all disks are rotational, choose the closest disk. If any disk is 640 + * non-rotational, choose the disk with less pending request even the 641 + * disk is rotational, which might/might not be optimal for raids with 642 + * mixed ratation/non-rotational disks depending on workload. 643 + */ 644 + if (best_disk == -1) { 645 + if (has_nonrot_disk) 646 + best_disk = best_pending_disk; 647 + else 648 + best_disk = best_dist_disk; 610 649 } 611 650 612 651 if (best_disk >= 0) { ··· 677 606 goto retry; 678 607 } 679 608 sectors = best_good_sectors; 680 - conf->next_seq_sect = this_sector + sectors; 681 - conf->last_used = best_disk; 609 + 610 + if (conf->mirrors[best_disk].next_seq_sect != this_sector) 611 + conf->mirrors[best_disk].seq_start = this_sector; 612 + 613 + conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; 682 614 } 683 615 rcu_read_unlock(); 684 616 *max_sectors = sectors; ··· 947 873 static void make_request(struct mddev *mddev, struct bio * bio) 948 874 { 949 875 struct r1conf *conf = mddev->private; 950 - struct mirror_info *mirror; 876 + struct raid1_info *mirror; 951 877 struct r1bio *r1_bio; 952 878 struct bio *read_bio; 953 879 int i, disks; ··· 1438 1364 struct r1conf *conf = mddev->private; 1439 1365 int err = -EEXIST; 1440 1366 int mirror = 
0; 1441 - struct mirror_info *p; 1367 + struct raid1_info *p; 1442 1368 int first = 0; 1443 1369 int last = conf->raid_disks - 1; 1444 1370 struct request_queue *q = bdev_get_queue(rdev->bdev); ··· 1507 1433 struct r1conf *conf = mddev->private; 1508 1434 int err = 0; 1509 1435 int number = rdev->raid_disk; 1510 - struct mirror_info *p = conf->mirrors+ number; 1436 + struct raid1_info *p = conf->mirrors + number; 1511 1437 1512 1438 if (rdev != p->rdev) 1513 1439 p = conf->mirrors + conf->raid_disks + number; ··· 2445 2371 bio->bi_rw = READ; 2446 2372 bio->bi_end_io = end_sync_read; 2447 2373 read_targets++; 2374 + } else if (!test_bit(WriteErrorSeen, &rdev->flags) && 2375 + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 2376 + !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 2377 + /* 2378 + * The device is suitable for reading (InSync), 2379 + * but has bad block(s) here. Let's try to correct them, 2380 + * if we are doing resync or repair. Otherwise, leave 2381 + * this device alone for this sync request. 
2382 + */ 2383 + bio->bi_rw = WRITE; 2384 + bio->bi_end_io = end_sync_write; 2385 + write_targets++; 2448 2386 } 2449 2387 } 2450 2388 if (bio->bi_end_io) { ··· 2514 2428 /* There is nowhere to write, so all non-sync 2515 2429 * drives must be failed - so we are finished 2516 2430 */ 2517 - sector_t rv = max_sector - sector_nr; 2431 + sector_t rv; 2432 + if (min_bad > 0) 2433 + max_sector = sector_nr + min_bad; 2434 + rv = max_sector - sector_nr; 2518 2435 *skipped = 1; 2519 2436 put_buf(r1_bio); 2520 2437 return rv; ··· 2610 2521 { 2611 2522 struct r1conf *conf; 2612 2523 int i; 2613 - struct mirror_info *disk; 2524 + struct raid1_info *disk; 2614 2525 struct md_rdev *rdev; 2615 2526 int err = -ENOMEM; 2616 2527 ··· 2618 2529 if (!conf) 2619 2530 goto abort; 2620 2531 2621 - conf->mirrors = kzalloc(sizeof(struct mirror_info) 2532 + conf->mirrors = kzalloc(sizeof(struct raid1_info) 2622 2533 * mddev->raid_disks * 2, 2623 2534 GFP_KERNEL); 2624 2535 if (!conf->mirrors) ··· 2661 2572 mddev->merge_check_needed = 1; 2662 2573 2663 2574 disk->head_position = 0; 2575 + disk->seq_start = MaxSector; 2664 2576 } 2665 2577 conf->raid_disks = mddev->raid_disks; 2666 2578 conf->mddev = mddev; ··· 2675 2585 conf->recovery_disabled = mddev->recovery_disabled - 1; 2676 2586 2677 2587 err = -EIO; 2678 - conf->last_used = -1; 2679 2588 for (i = 0; i < conf->raid_disks * 2; i++) { 2680 2589 2681 2590 disk = conf->mirrors + i; ··· 2700 2611 if (disk->rdev && 2701 2612 (disk->rdev->saved_raid_disk < 0)) 2702 2613 conf->fullsync = 1; 2703 - } else if (conf->last_used < 0) 2704 - /* 2705 - * The first working device is used as a 2706 - * starting point to read balancing. 
2707 - */ 2708 - conf->last_used = i; 2614 + } 2709 2615 } 2710 2616 2711 - if (conf->last_used < 0) { 2712 - printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", 2713 - mdname(mddev)); 2714 - goto abort; 2715 - } 2716 2617 err = -ENOMEM; 2717 2618 conf->thread = md_register_thread(raid1d, mddev, "raid1"); 2718 2619 if (!conf->thread) { ··· 2877 2798 */ 2878 2799 mempool_t *newpool, *oldpool; 2879 2800 struct pool_info *newpoolinfo; 2880 - struct mirror_info *newmirrors; 2801 + struct raid1_info *newmirrors; 2881 2802 struct r1conf *conf = mddev->private; 2882 2803 int cnt, raid_disks; 2883 2804 unsigned long flags; ··· 2920 2841 kfree(newpoolinfo); 2921 2842 return -ENOMEM; 2922 2843 } 2923 - newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2, 2844 + newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2, 2924 2845 GFP_KERNEL); 2925 2846 if (!newmirrors) { 2926 2847 kfree(newpoolinfo); ··· 2959 2880 conf->raid_disks = mddev->raid_disks = raid_disks; 2960 2881 mddev->delta_disks = 0; 2961 2882 2962 - conf->last_used = 0; /* just make sure it is in-range */ 2963 2883 lower_barrier(conf); 2964 2884 2965 2885 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+8 -22
drivers/md/raid1.h
··· 1 1 #ifndef _RAID1_H 2 2 #define _RAID1_H 3 3 4 - struct mirror_info { 4 + struct raid1_info { 5 5 struct md_rdev *rdev; 6 6 sector_t head_position; 7 + 8 + /* When choose the best device for a read (read_balance()) 9 + * we try to keep sequential reads one the same device 10 + */ 11 + sector_t next_seq_sect; 12 + sector_t seq_start; 7 13 }; 8 14 9 15 /* ··· 30 24 31 25 struct r1conf { 32 26 struct mddev *mddev; 33 - struct mirror_info *mirrors; /* twice 'raid_disks' to 27 + struct raid1_info *mirrors; /* twice 'raid_disks' to 34 28 * allow for replacements. 35 29 */ 36 30 int raid_disks; 37 31 38 - /* When choose the best device for a read (read_balance()) 39 - * we try to keep sequential reads one the same device 40 - * using 'last_used' and 'next_seq_sect' 41 - */ 42 - int last_used; 43 - sector_t next_seq_sect; 44 32 /* During resync, read_balancing is only allowed on the part 45 33 * of the array that has been resynced. 'next_resync' tells us 46 34 * where that is. ··· 134 134 struct bio *bios[0]; 135 135 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ 136 136 }; 137 - 138 - /* when we get a read error on a read-only array, we redirect to another 139 - * device without failing the first device, or trying to over-write to 140 - * correct the read error. To keep track of bad blocks on a per-bio 141 - * level, we store IO_BLOCKED in the appropriate 'bios' pointer 142 - */ 143 - #define IO_BLOCKED ((struct bio *)1) 144 - /* When we successfully write to a known bad-block, we need to remove the 145 - * bad-block marking which must be done from process context. So we record 146 - * the success by setting bios[n] to IO_MADE_GOOD 147 - */ 148 - #define IO_MADE_GOOD ((struct bio *)2) 149 - 150 - #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) 151 137 152 138 /* bits for r1bio.state */ 153 139 #define R1BIO_Uptodate 0
+59 -33
drivers/md/raid10.c
··· 60 60 */ 61 61 #define NR_RAID10_BIOS 256 62 62 63 - /* When there are this many requests queue to be written by 63 + /* when we get a read error on a read-only array, we redirect to another 64 + * device without failing the first device, or trying to over-write to 65 + * correct the read error. To keep track of bad blocks on a per-bio 66 + * level, we store IO_BLOCKED in the appropriate 'bios' pointer 67 + */ 68 + #define IO_BLOCKED ((struct bio *)1) 69 + /* When we successfully write to a known bad-block, we need to remove the 70 + * bad-block marking which must be done from process context. So we record 71 + * the success by setting devs[n].bio to IO_MADE_GOOD 72 + */ 73 + #define IO_MADE_GOOD ((struct bio *)2) 74 + 75 + #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) 76 + 77 + /* When there are this many requests queued to be written by 64 78 * the raid10 thread, we become 'congested' to provide back-pressure 65 79 * for writeback. 66 80 */ ··· 731 717 int sectors = r10_bio->sectors; 732 718 int best_good_sectors; 733 719 sector_t new_distance, best_dist; 734 - struct md_rdev *rdev, *best_rdev; 720 + struct md_rdev *best_rdev, *rdev = NULL; 735 721 int do_balance; 736 722 int best_slot; 737 723 struct geom *geo = &conf->geo; ··· 853 839 return rdev; 854 840 } 855 841 856 - static int raid10_congested(void *data, int bits) 842 + int md_raid10_congested(struct mddev *mddev, int bits) 857 843 { 858 - struct mddev *mddev = data; 859 844 struct r10conf *conf = mddev->private; 860 845 int i, ret = 0; 861 846 ··· 862 849 conf->pending_count >= max_queued_requests) 863 850 return 1; 864 851 865 - if (mddev_congested(mddev, bits)) 866 - return 1; 867 852 rcu_read_lock(); 868 853 for (i = 0; 869 854 (i < conf->geo.raid_disks || i < conf->prev.raid_disks) ··· 876 865 } 877 866 rcu_read_unlock(); 878 867 return ret; 868 + } 869 + EXPORT_SYMBOL_GPL(md_raid10_congested); 870 + 871 + static int raid10_congested(void *data, int bits) 872 + { 873 + struct mddev *mddev = 
data; 874 + 875 + return mddev_congested(mddev, bits) || 876 + md_raid10_congested(mddev, bits); 879 877 } 880 878 881 879 static void flush_pending_writes(struct r10conf *conf) ··· 1566 1546 static void print_conf(struct r10conf *conf) 1567 1547 { 1568 1548 int i; 1569 - struct mirror_info *tmp; 1549 + struct raid10_info *tmp; 1570 1550 1571 1551 printk(KERN_DEBUG "RAID10 conf printout:\n"); 1572 1552 if (!conf) { ··· 1600 1580 { 1601 1581 int i; 1602 1582 struct r10conf *conf = mddev->private; 1603 - struct mirror_info *tmp; 1583 + struct raid10_info *tmp; 1604 1584 int count = 0; 1605 1585 unsigned long flags; 1606 1586 ··· 1675 1655 else 1676 1656 mirror = first; 1677 1657 for ( ; mirror <= last ; mirror++) { 1678 - struct mirror_info *p = &conf->mirrors[mirror]; 1658 + struct raid10_info *p = &conf->mirrors[mirror]; 1679 1659 if (p->recovery_disabled == mddev->recovery_disabled) 1680 1660 continue; 1681 1661 if (p->rdev) { ··· 1729 1709 int err = 0; 1730 1710 int number = rdev->raid_disk; 1731 1711 struct md_rdev **rdevp; 1732 - struct mirror_info *p = conf->mirrors + number; 1712 + struct raid10_info *p = conf->mirrors + number; 1733 1713 1734 1714 print_conf(conf); 1735 1715 if (rdev == p->rdev) ··· 2896 2876 sector_t sect; 2897 2877 int must_sync; 2898 2878 int any_working; 2899 - struct mirror_info *mirror = &conf->mirrors[i]; 2879 + struct raid10_info *mirror = &conf->mirrors[i]; 2900 2880 2901 2881 if ((mirror->rdev == NULL || 2902 2882 test_bit(In_sync, &mirror->rdev->flags)) ··· 3408 3388 goto out; 3409 3389 3410 3390 /* FIXME calc properly */ 3411 - conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks + 3391 + conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks + 3412 3392 max(0,mddev->delta_disks)), 3413 3393 GFP_KERNEL); 3414 3394 if (!conf->mirrors) ··· 3472 3452 { 3473 3453 struct r10conf *conf; 3474 3454 int i, disk_idx, chunk_size; 3475 - struct mirror_info *disk; 3455 + struct raid10_info *disk; 3476 3456 
struct md_rdev *rdev; 3477 3457 sector_t size; 3478 3458 sector_t min_offset_diff = 0; ··· 3492 3472 conf->thread = NULL; 3493 3473 3494 3474 chunk_size = mddev->chunk_sectors << 9; 3495 - blk_queue_io_min(mddev->queue, chunk_size); 3496 - if (conf->geo.raid_disks % conf->geo.near_copies) 3497 - blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); 3498 - else 3499 - blk_queue_io_opt(mddev->queue, chunk_size * 3500 - (conf->geo.raid_disks / conf->geo.near_copies)); 3475 + if (mddev->queue) { 3476 + blk_queue_io_min(mddev->queue, chunk_size); 3477 + if (conf->geo.raid_disks % conf->geo.near_copies) 3478 + blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); 3479 + else 3480 + blk_queue_io_opt(mddev->queue, chunk_size * 3481 + (conf->geo.raid_disks / conf->geo.near_copies)); 3482 + } 3501 3483 3502 3484 rdev_for_each(rdev, mddev) { 3503 3485 long long diff; ··· 3533 3511 if (first || diff < min_offset_diff) 3534 3512 min_offset_diff = diff; 3535 3513 3536 - disk_stack_limits(mddev->gendisk, rdev->bdev, 3537 - rdev->data_offset << 9); 3514 + if (mddev->gendisk) 3515 + disk_stack_limits(mddev->gendisk, rdev->bdev, 3516 + rdev->data_offset << 9); 3538 3517 3539 3518 disk->head_position = 0; 3540 3519 } ··· 3598 3575 md_set_array_sectors(mddev, size); 3599 3576 mddev->resync_max_sectors = size; 3600 3577 3601 - mddev->queue->backing_dev_info.congested_fn = raid10_congested; 3602 - mddev->queue->backing_dev_info.congested_data = mddev; 3603 - 3604 - /* Calculate max read-ahead size. 3605 - * We need to readahead at least twice a whole stripe.... 3606 - * maybe... 3607 - */ 3608 - { 3578 + if (mddev->queue) { 3609 3579 int stripe = conf->geo.raid_disks * 3610 3580 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 3581 + mddev->queue->backing_dev_info.congested_fn = raid10_congested; 3582 + mddev->queue->backing_dev_info.congested_data = mddev; 3583 + 3584 + /* Calculate max read-ahead size. 3585 + * We need to readahead at least twice a whole stripe.... 
3586 + * maybe... 3587 + */ 3611 3588 stripe /= conf->geo.near_copies; 3612 3589 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 3613 3590 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 3591 + blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 3614 3592 } 3615 3593 3616 - blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 3617 3594 3618 3595 if (md_integrity_register(mddev)) 3619 3596 goto out_free_conf; ··· 3664 3641 lower_barrier(conf); 3665 3642 3666 3643 md_unregister_thread(&mddev->thread); 3667 - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 3644 + if (mddev->queue) 3645 + /* the unplug fn references 'conf'*/ 3646 + blk_sync_queue(mddev->queue); 3647 + 3668 3648 if (conf->r10bio_pool) 3669 3649 mempool_destroy(conf->r10bio_pool); 3670 3650 kfree(conf->mirrors); ··· 3831 3805 if (mddev->delta_disks > 0) { 3832 3806 /* allocate new 'mirrors' list */ 3833 3807 conf->mirrors_new = kzalloc( 3834 - sizeof(struct mirror_info) 3808 + sizeof(struct raid10_info) 3835 3809 *(mddev->raid_disks + 3836 3810 mddev->delta_disks), 3837 3811 GFP_KERNEL); ··· 3956 3930 spin_lock_irq(&conf->device_lock); 3957 3931 if (conf->mirrors_new) { 3958 3932 memcpy(conf->mirrors_new, conf->mirrors, 3959 - sizeof(struct mirror_info)*conf->prev.raid_disks); 3933 + sizeof(struct raid10_info)*conf->prev.raid_disks); 3960 3934 smp_mb(); 3961 3935 kfree(conf->mirrors_old); /* FIXME and elsewhere */ 3962 3936 conf->mirrors_old = conf->mirrors;
+6 -17
drivers/md/raid10.h
··· 1 1 #ifndef _RAID10_H 2 2 #define _RAID10_H 3 3 4 - struct mirror_info { 4 + struct raid10_info { 5 5 struct md_rdev *rdev, *replacement; 6 6 sector_t head_position; 7 7 int recovery_disabled; /* matches ··· 13 13 14 14 struct r10conf { 15 15 struct mddev *mddev; 16 - struct mirror_info *mirrors; 17 - struct mirror_info *mirrors_new, *mirrors_old; 16 + struct raid10_info *mirrors; 17 + struct raid10_info *mirrors_new, *mirrors_old; 18 18 spinlock_t device_lock; 19 19 20 20 /* geometry */ ··· 123 123 } devs[0]; 124 124 }; 125 125 126 - /* when we get a read error on a read-only array, we redirect to another 127 - * device without failing the first device, or trying to over-write to 128 - * correct the read error. To keep track of bad blocks on a per-bio 129 - * level, we store IO_BLOCKED in the appropriate 'bios' pointer 130 - */ 131 - #define IO_BLOCKED ((struct bio*)1) 132 - /* When we successfully write to a known bad-block, we need to remove the 133 - * bad-block marking which must be done from process context. So we record 134 - * the success by setting devs[n].bio to IO_MADE_GOOD 135 - */ 136 - #define IO_MADE_GOOD ((struct bio *)2) 137 - 138 - #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) 139 - 140 126 /* bits for r10bio.state */ 141 127 enum r10bio_state { 142 128 R10BIO_Uptodate, ··· 145 159 */ 146 160 R10BIO_Previous, 147 161 }; 162 + 163 + extern int md_raid10_congested(struct mddev *mddev, int bits); 164 + 148 165 #endif
+112 -93
drivers/md/raid5.c
··· 99 99 * We maintain a biased count of active stripes in the bottom 16 bits of 100 100 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 101 101 */ 102 - static inline int raid5_bi_phys_segments(struct bio *bio) 102 + static inline int raid5_bi_processed_stripes(struct bio *bio) 103 103 { 104 - return bio->bi_phys_segments & 0xffff; 104 + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 105 + return (atomic_read(segments) >> 16) & 0xffff; 105 106 } 106 107 107 - static inline int raid5_bi_hw_segments(struct bio *bio) 108 + static inline int raid5_dec_bi_active_stripes(struct bio *bio) 108 109 { 109 - return (bio->bi_phys_segments >> 16) & 0xffff; 110 + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 111 + return atomic_sub_return(1, segments) & 0xffff; 110 112 } 111 113 112 - static inline int raid5_dec_bi_phys_segments(struct bio *bio) 114 + static inline void raid5_inc_bi_active_stripes(struct bio *bio) 113 115 { 114 - --bio->bi_phys_segments; 115 - return raid5_bi_phys_segments(bio); 116 + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 117 + atomic_inc(segments); 116 118 } 117 119 118 - static inline int raid5_dec_bi_hw_segments(struct bio *bio) 120 + static inline void raid5_set_bi_processed_stripes(struct bio *bio, 121 + unsigned int cnt) 119 122 { 120 - unsigned short val = raid5_bi_hw_segments(bio); 123 + atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 124 + int old, new; 121 125 122 - --val; 123 - bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); 124 - return val; 126 + do { 127 + old = atomic_read(segments); 128 + new = (old & 0xffff) | (cnt << 16); 129 + } while (atomic_cmpxchg(segments, old, new) != old); 125 130 } 126 131 127 - static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) 132 + static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) 128 133 { 129 - bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); 134 + 
atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; 135 + atomic_set(segments, cnt); 130 136 } 131 137 132 138 /* Find first data disk in a raid6 stripe */ ··· 196 190 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 197 191 } 198 192 199 - static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 193 + static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) 200 194 { 201 - if (atomic_dec_and_test(&sh->count)) { 202 - BUG_ON(!list_empty(&sh->lru)); 203 - BUG_ON(atomic_read(&conf->active_stripes)==0); 204 - if (test_bit(STRIPE_HANDLE, &sh->state)) { 205 - if (test_bit(STRIPE_DELAYED, &sh->state) && 206 - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 207 - list_add_tail(&sh->lru, &conf->delayed_list); 208 - else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 209 - sh->bm_seq - conf->seq_write > 0) 210 - list_add_tail(&sh->lru, &conf->bitmap_list); 211 - else { 212 - clear_bit(STRIPE_DELAYED, &sh->state); 213 - clear_bit(STRIPE_BIT_DELAY, &sh->state); 214 - list_add_tail(&sh->lru, &conf->handle_list); 215 - } 216 - md_wakeup_thread(conf->mddev->thread); 217 - } else { 218 - BUG_ON(stripe_operations_active(sh)); 219 - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 220 - if (atomic_dec_return(&conf->preread_active_stripes) 221 - < IO_THRESHOLD) 222 - md_wakeup_thread(conf->mddev->thread); 223 - atomic_dec(&conf->active_stripes); 224 - if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 225 - list_add_tail(&sh->lru, &conf->inactive_list); 226 - wake_up(&conf->wait_for_stripe); 227 - if (conf->retry_read_aligned) 228 - md_wakeup_thread(conf->mddev->thread); 229 - } 195 + BUG_ON(!list_empty(&sh->lru)); 196 + BUG_ON(atomic_read(&conf->active_stripes)==0); 197 + if (test_bit(STRIPE_HANDLE, &sh->state)) { 198 + if (test_bit(STRIPE_DELAYED, &sh->state) && 199 + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 200 + list_add_tail(&sh->lru, &conf->delayed_list); 201 + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 202 + sh->bm_seq - 
conf->seq_write > 0) 203 + list_add_tail(&sh->lru, &conf->bitmap_list); 204 + else { 205 + clear_bit(STRIPE_DELAYED, &sh->state); 206 + clear_bit(STRIPE_BIT_DELAY, &sh->state); 207 + list_add_tail(&sh->lru, &conf->handle_list); 208 + } 209 + md_wakeup_thread(conf->mddev->thread); 210 + } else { 211 + BUG_ON(stripe_operations_active(sh)); 212 + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 213 + if (atomic_dec_return(&conf->preread_active_stripes) 214 + < IO_THRESHOLD) 215 + md_wakeup_thread(conf->mddev->thread); 216 + atomic_dec(&conf->active_stripes); 217 + if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 218 + list_add_tail(&sh->lru, &conf->inactive_list); 219 + wake_up(&conf->wait_for_stripe); 220 + if (conf->retry_read_aligned) 221 + md_wakeup_thread(conf->mddev->thread); 230 222 } 231 223 } 224 + } 225 + 226 + static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 227 + { 228 + if (atomic_dec_and_test(&sh->count)) 229 + do_release_stripe(conf, sh); 232 230 } 233 231 234 232 static void release_stripe(struct stripe_head *sh) ··· 240 230 struct r5conf *conf = sh->raid_conf; 241 231 unsigned long flags; 242 232 243 - spin_lock_irqsave(&conf->device_lock, flags); 244 - __release_stripe(conf, sh); 245 - spin_unlock_irqrestore(&conf->device_lock, flags); 233 + local_irq_save(flags); 234 + if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { 235 + do_release_stripe(conf, sh); 236 + spin_unlock(&conf->device_lock); 237 + } 238 + local_irq_restore(flags); 246 239 } 247 240 248 241 static inline void remove_hash(struct stripe_head *sh) ··· 653 640 else 654 641 bi->bi_sector = (sh->sector 655 642 + rdev->data_offset); 643 + if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 644 + bi->bi_rw |= REQ_FLUSH; 645 + 656 646 bi->bi_flags = 1 << BIO_UPTODATE; 657 647 bi->bi_idx = 0; 658 648 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; ··· 765 749 { 766 750 struct stripe_head *sh = stripe_head_ref; 767 751 struct bio *return_bi = NULL; 768 - struct 
r5conf *conf = sh->raid_conf; 769 752 int i; 770 753 771 754 pr_debug("%s: stripe %llu\n", __func__, 772 755 (unsigned long long)sh->sector); 773 756 774 757 /* clear completed biofills */ 775 - spin_lock_irq(&conf->device_lock); 776 758 for (i = sh->disks; i--; ) { 777 759 struct r5dev *dev = &sh->dev[i]; 778 760 ··· 788 774 while (rbi && rbi->bi_sector < 789 775 dev->sector + STRIPE_SECTORS) { 790 776 rbi2 = r5_next_bio(rbi, dev->sector); 791 - if (!raid5_dec_bi_phys_segments(rbi)) { 777 + if (!raid5_dec_bi_active_stripes(rbi)) { 792 778 rbi->bi_next = return_bi; 793 779 return_bi = rbi; 794 780 } ··· 796 782 } 797 783 } 798 784 } 799 - spin_unlock_irq(&conf->device_lock); 800 785 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 801 786 802 787 return_io(return_bi); ··· 807 794 static void ops_run_biofill(struct stripe_head *sh) 808 795 { 809 796 struct dma_async_tx_descriptor *tx = NULL; 810 - struct r5conf *conf = sh->raid_conf; 811 797 struct async_submit_ctl submit; 812 798 int i; 813 799 ··· 817 805 struct r5dev *dev = &sh->dev[i]; 818 806 if (test_bit(R5_Wantfill, &dev->flags)) { 819 807 struct bio *rbi; 820 - spin_lock_irq(&conf->device_lock); 808 + spin_lock_irq(&sh->stripe_lock); 821 809 dev->read = rbi = dev->toread; 822 810 dev->toread = NULL; 823 - spin_unlock_irq(&conf->device_lock); 811 + spin_unlock_irq(&sh->stripe_lock); 824 812 while (rbi && rbi->bi_sector < 825 813 dev->sector + STRIPE_SECTORS) { 826 814 tx = async_copy_data(0, rbi, dev->page, ··· 1156 1144 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1157 1145 struct bio *wbi; 1158 1146 1159 - spin_lock_irq(&sh->raid_conf->device_lock); 1147 + spin_lock_irq(&sh->stripe_lock); 1160 1148 chosen = dev->towrite; 1161 1149 dev->towrite = NULL; 1162 1150 BUG_ON(dev->written); 1163 1151 wbi = dev->written = chosen; 1164 - spin_unlock_irq(&sh->raid_conf->device_lock); 1152 + spin_unlock_irq(&sh->stripe_lock); 1165 1153 1166 1154 while (wbi && wbi->bi_sector < 1167 1155 dev->sector + STRIPE_SECTORS) { 
··· 1466 1454 init_waitqueue_head(&sh->ops.wait_for_ops); 1467 1455 #endif 1468 1456 1457 + spin_lock_init(&sh->stripe_lock); 1458 + 1469 1459 if (grow_buffers(sh)) { 1470 1460 shrink_buffers(sh); 1471 1461 kmem_cache_free(conf->slab_cache, sh); ··· 1753 1739 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1754 1740 clear_bit(R5_ReadError, &sh->dev[i].flags); 1755 1741 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1756 - } 1742 + } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 1743 + clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1744 + 1757 1745 if (atomic_read(&rdev->read_errors)) 1758 1746 atomic_set(&rdev->read_errors, 0); 1759 1747 } else { ··· 1800 1784 else 1801 1785 retry = 1; 1802 1786 if (retry) 1803 - set_bit(R5_ReadError, &sh->dev[i].flags); 1787 + if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { 1788 + set_bit(R5_ReadError, &sh->dev[i].flags); 1789 + clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1790 + } else 1791 + set_bit(R5_ReadNoMerge, &sh->dev[i].flags); 1804 1792 else { 1805 1793 clear_bit(R5_ReadError, &sh->dev[i].flags); 1806 1794 clear_bit(R5_ReWrite, &sh->dev[i].flags); ··· 2360 2340 (unsigned long long)bi->bi_sector, 2361 2341 (unsigned long long)sh->sector); 2362 2342 2363 - 2364 - spin_lock_irq(&conf->device_lock); 2343 + /* 2344 + * If several bio share a stripe. The bio bi_phys_segments acts as a 2345 + * reference count to avoid race. The reference count should already be 2346 + * increased before this function is called (for example, in 2347 + * make_request()), so other bio sharing this stripe will not free the 2348 + * stripe. If a stripe is owned by one stripe, the stripe lock will 2349 + * protect it. 
2350 + */ 2351 + spin_lock_irq(&sh->stripe_lock); 2365 2352 if (forwrite) { 2366 2353 bip = &sh->dev[dd_idx].towrite; 2367 - if (*bip == NULL && sh->dev[dd_idx].written == NULL) 2354 + if (*bip == NULL) 2368 2355 firstwrite = 1; 2369 2356 } else 2370 2357 bip = &sh->dev[dd_idx].toread; ··· 2387 2360 if (*bip) 2388 2361 bi->bi_next = *bip; 2389 2362 *bip = bi; 2390 - bi->bi_phys_segments++; 2363 + raid5_inc_bi_active_stripes(bi); 2391 2364 2392 2365 if (forwrite) { 2393 2366 /* check if page is covered */ ··· 2402 2375 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2403 2376 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2404 2377 } 2405 - spin_unlock_irq(&conf->device_lock); 2378 + spin_unlock_irq(&sh->stripe_lock); 2406 2379 2407 2380 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2408 2381 (unsigned long long)(*bip)->bi_sector, ··· 2418 2391 2419 2392 overlap: 2420 2393 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2421 - spin_unlock_irq(&conf->device_lock); 2394 + spin_unlock_irq(&sh->stripe_lock); 2422 2395 return 0; 2423 2396 } 2424 2397 ··· 2468 2441 rdev_dec_pending(rdev, conf->mddev); 2469 2442 } 2470 2443 } 2471 - spin_lock_irq(&conf->device_lock); 2444 + spin_lock_irq(&sh->stripe_lock); 2472 2445 /* fail all writes first */ 2473 2446 bi = sh->dev[i].towrite; 2474 2447 sh->dev[i].towrite = NULL; 2448 + spin_unlock_irq(&sh->stripe_lock); 2475 2449 if (bi) { 2476 2450 s->to_write--; 2477 2451 bitmap_end = 1; ··· 2485 2457 sh->dev[i].sector + STRIPE_SECTORS) { 2486 2458 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2487 2459 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2488 - if (!raid5_dec_bi_phys_segments(bi)) { 2460 + if (!raid5_dec_bi_active_stripes(bi)) { 2489 2461 md_write_end(conf->mddev); 2490 2462 bi->bi_next = *return_bi; 2491 2463 *return_bi = bi; 2492 2464 } 2493 2465 bi = nextbi; 2494 2466 } 2467 + if (bitmap_end) 2468 + bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2469 + STRIPE_SECTORS, 0, 0); 2470 + bitmap_end = 0; 
2495 2471 /* and fail all 'written' */ 2496 2472 bi = sh->dev[i].written; 2497 2473 sh->dev[i].written = NULL; ··· 2504 2472 sh->dev[i].sector + STRIPE_SECTORS) { 2505 2473 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2506 2474 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2507 - if (!raid5_dec_bi_phys_segments(bi)) { 2475 + if (!raid5_dec_bi_active_stripes(bi)) { 2508 2476 md_write_end(conf->mddev); 2509 2477 bi->bi_next = *return_bi; 2510 2478 *return_bi = bi; ··· 2528 2496 struct bio *nextbi = 2529 2497 r5_next_bio(bi, sh->dev[i].sector); 2530 2498 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2531 - if (!raid5_dec_bi_phys_segments(bi)) { 2499 + if (!raid5_dec_bi_active_stripes(bi)) { 2532 2500 bi->bi_next = *return_bi; 2533 2501 *return_bi = bi; 2534 2502 } 2535 2503 bi = nextbi; 2536 2504 } 2537 2505 } 2538 - spin_unlock_irq(&conf->device_lock); 2539 2506 if (bitmap_end) 2540 2507 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2541 2508 STRIPE_SECTORS, 0, 0); ··· 2738 2707 test_bit(R5_UPTODATE, &dev->flags)) { 2739 2708 /* We can return any write requests */ 2740 2709 struct bio *wbi, *wbi2; 2741 - int bitmap_end = 0; 2742 2710 pr_debug("Return write for disc %d\n", i); 2743 - spin_lock_irq(&conf->device_lock); 2744 2711 wbi = dev->written; 2745 2712 dev->written = NULL; 2746 2713 while (wbi && wbi->bi_sector < 2747 2714 dev->sector + STRIPE_SECTORS) { 2748 2715 wbi2 = r5_next_bio(wbi, dev->sector); 2749 - if (!raid5_dec_bi_phys_segments(wbi)) { 2716 + if (!raid5_dec_bi_active_stripes(wbi)) { 2750 2717 md_write_end(conf->mddev); 2751 2718 wbi->bi_next = *return_bi; 2752 2719 *return_bi = wbi; 2753 2720 } 2754 2721 wbi = wbi2; 2755 2722 } 2756 - if (dev->towrite == NULL) 2757 - bitmap_end = 1; 2758 - spin_unlock_irq(&conf->device_lock); 2759 - if (bitmap_end) 2760 - bitmap_endwrite(conf->mddev->bitmap, 2761 - sh->sector, 2762 - STRIPE_SECTORS, 2723 + bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2724 + STRIPE_SECTORS, 2763 2725 !test_bit(STRIPE_DEGRADED, 
&sh->state), 2764 - 0); 2726 + 0); 2765 2727 } 2766 2728 } 2767 2729 ··· 3206 3182 3207 3183 /* Now to look around and see what can be done */ 3208 3184 rcu_read_lock(); 3209 - spin_lock_irq(&conf->device_lock); 3210 3185 for (i=disks; i--; ) { 3211 3186 struct md_rdev *rdev; 3212 3187 sector_t first_bad; ··· 3351 3328 do_recovery = 1; 3352 3329 } 3353 3330 } 3354 - spin_unlock_irq(&conf->device_lock); 3355 3331 if (test_bit(STRIPE_SYNCING, &sh->state)) { 3356 3332 /* If there is a failed device being replaced, 3357 3333 * we must be recovering. ··· 3813 3791 * this sets the active strip count to 1 and the processed 3814 3792 * strip count to zero (upper 8 bits) 3815 3793 */ 3816 - bi->bi_phys_segments = 1; /* biased count of active stripes */ 3794 + raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ 3817 3795 } 3818 3796 3819 3797 return bi; ··· 4135 4113 finish_wait(&conf->wait_for_overlap, &w); 4136 4114 set_bit(STRIPE_HANDLE, &sh->state); 4137 4115 clear_bit(STRIPE_DELAYED, &sh->state); 4138 - if ((bi->bi_rw & REQ_SYNC) && 4116 + if ((bi->bi_rw & REQ_NOIDLE) && 4139 4117 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4140 4118 atomic_inc(&conf->preread_active_stripes); 4141 4119 mddev_check_plugged(mddev); ··· 4148 4126 } 4149 4127 } 4150 4128 4151 - spin_lock_irq(&conf->device_lock); 4152 - remaining = raid5_dec_bi_phys_segments(bi); 4153 - spin_unlock_irq(&conf->device_lock); 4129 + remaining = raid5_dec_bi_active_stripes(bi); 4154 4130 if (remaining == 0) { 4155 4131 4156 4132 if ( rw == WRITE ) ··· 4504 4484 sector += STRIPE_SECTORS, 4505 4485 scnt++) { 4506 4486 4507 - if (scnt < raid5_bi_hw_segments(raid_bio)) 4487 + if (scnt < raid5_bi_processed_stripes(raid_bio)) 4508 4488 /* already done this stripe */ 4509 4489 continue; 4510 4490 ··· 4512 4492 4513 4493 if (!sh) { 4514 4494 /* failed to get a stripe - must wait */ 4515 - raid5_set_bi_hw_segments(raid_bio, scnt); 4495 + raid5_set_bi_processed_stripes(raid_bio, scnt); 4516 4496 
conf->retry_read_aligned = raid_bio; 4517 4497 return handled; 4518 4498 } 4519 4499 4520 4500 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4521 4501 release_stripe(sh); 4522 - raid5_set_bi_hw_segments(raid_bio, scnt); 4502 + raid5_set_bi_processed_stripes(raid_bio, scnt); 4523 4503 conf->retry_read_aligned = raid_bio; 4524 4504 return handled; 4525 4505 } 4526 4506 4507 + set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); 4527 4508 handle_stripe(sh); 4528 4509 release_stripe(sh); 4529 4510 handled++; 4530 4511 } 4531 - spin_lock_irq(&conf->device_lock); 4532 - remaining = raid5_dec_bi_phys_segments(raid_bio); 4533 - spin_unlock_irq(&conf->device_lock); 4512 + remaining = raid5_dec_bi_active_stripes(raid_bio); 4534 4513 if (remaining == 0) 4535 4514 bio_endio(raid_bio, 0); 4536 4515 if (atomic_dec_and_test(&conf->active_aligned_reads))
+2
drivers/md/raid5.h
··· 210 210 int disks; /* disks in stripe */ 211 211 enum check_states check_state; 212 212 enum reconstruct_states reconstruct_state; 213 + spinlock_t stripe_lock; 213 214 /** 214 215 * struct stripe_operations 215 216 * @target - STRIPE_OP_COMPUTE_BLK target ··· 274 273 R5_Wantwrite, 275 274 R5_Overlap, /* There is a pending overlapping request 276 275 * on this block */ 276 + R5_ReadNoMerge, /* prevent bio from merging in block-layer */ 277 277 R5_ReadError, /* seen a read error here recently */ 278 278 R5_ReWrite, /* have tried to over-write the readerror */ 279 279