Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md: (45 commits)
md: don't insist on valid event count for spare devices.
md: simplify updating of event count to sometimes avoid updating spares.
md/raid6: Fix raid-6 read-error correction in degraded state
md: restore ability of spare drives to spin down.
md: Fix read balancing in RAID1 and RAID10 on drives > 2TB
md/linear: standardise all printk messages
md/raid0: tidy up printk messages.
md/raid10: tidy up printk messages.
md/raid1: improve printk messages
md/raid5: improve consistency of error messages.
md: remove EXPERIMENTAL designation from RAID10
md: allow integers to be passed to md/level
md: notify mdstat waiters of level change
md/raid4: permit raid0 takeover
md/raid1: delay reads that could overtake behind-writes.
md/raid1: fix confusing 'redirect sector' message.
md: don't unregister the thread in mddev_suspend
md: factor out init code for an mddev
md: pass mddev to make_request functions rather than request_queue
md: call md_stop_writes from md_stop
...
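
The "md: allow integers to be passed to md/level" change above means the md/level sysfs attribute now accepts a numeric personality level (for example "0") as well as a name such as "raid0". A minimal userspace sketch, assuming an existing array at /dev/md0 whose current personality supports the requested takeover:

/* Write a numeric level to md/level; the kernel maps it to a personality. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/md0/md/level", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fputs("0\n", f) == EOF)	/* "raid0" would work equally well */
		perror("fputs");
	return fclose(f) ? 1 : 0;	/* sysfs reports the error on flush/close */
}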

Total: +987 -582
drivers/md/Kconfig | +2 -2
···
 	  If unsure, say Y.
 
 config MD_RAID10
-	tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)"
-	depends on BLK_DEV_MD && EXPERIMENTAL
+	tristate "RAID-10 (mirrored striping) mode"
+	depends on BLK_DEV_MD
 	---help---
 	  RAID-10 provides a combination of striping (RAID-0) and
 	  mirroring (RAID-1) with easier configuration and more flexible
drivers/md/bitmap.c | +35 -6
···
 		return;
 	}
 	spin_unlock_irqrestore(&bitmap->lock, flags);
-	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
 	if (bitmap->mddev->events < bitmap->events_cleared) {
 		/* rocking back to read-only */
···
 
 	if (!bitmap || !bitmap->sb_page)
 		return;
-	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 	printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
 	printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
 	printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
···
 		return err;
 	}
 
-	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 
 	chunksize = le32_to_cpu(sb->chunksize);
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
···
 		return 0;
 	}
 	spin_unlock_irqrestore(&bitmap->lock, flags);
-	sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0);
+	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 	old = le32_to_cpu(sb->state) & bits;
 	switch (op) {
 	case MASK_SET: sb->state |= cpu_to_le32(bits);
···
 	if (!bitmap) return 0;
 
 	if (behind) {
+		int bw;
 		atomic_inc(&bitmap->behind_writes);
+		bw = atomic_read(&bitmap->behind_writes);
+		if (bw > bitmap->behind_writes_used)
+			bitmap->behind_writes_used = bw;
+
 		PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n",
-		  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
+		       bw, bitmap->max_write_behind);
 	}
 
 	while (sectors) {
···
 {
 	if (!bitmap) return;
 	if (behind) {
-		atomic_dec(&bitmap->behind_writes);
+		if (atomic_dec_and_test(&bitmap->behind_writes))
+			wake_up(&bitmap->behind_wait);
 		PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
 		  atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
 	}
···
 	atomic_set(&bitmap->pending_writes, 0);
 	init_waitqueue_head(&bitmap->write_wait);
 	init_waitqueue_head(&bitmap->overflow_wait);
+	init_waitqueue_head(&bitmap->behind_wait);
 
 	bitmap->mddev = mddev;
 
···
 static struct md_sysfs_entry bitmap_can_clear =
 __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store);
 
+static ssize_t
+behind_writes_used_show(mddev_t *mddev, char *page)
+{
+	if (mddev->bitmap == NULL)
+		return sprintf(page, "0\n");
+	return sprintf(page, "%lu\n",
+		       mddev->bitmap->behind_writes_used);
+}
+
+static ssize_t
+behind_writes_used_reset(mddev_t *mddev, const char *buf, size_t len)
+{
+	if (mddev->bitmap)
+		mddev->bitmap->behind_writes_used = 0;
+	return len;
+}
+
+static struct md_sysfs_entry max_backlog_used =
+__ATTR(max_backlog_used, S_IRUGO | S_IWUSR,
+       behind_writes_used_show, behind_writes_used_reset);
+
 static struct attribute *md_bitmap_attrs[] = {
 	&bitmap_location.attr,
 	&bitmap_timeout.attr,
···
 	&bitmap_chunksize.attr,
 	&bitmap_metadata.attr,
 	&bitmap_can_clear.attr,
+	&max_backlog_used.attr,
 	NULL
 };
 struct attribute_group md_bitmap_group = {
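
The bitmap.c hunks above track a high-water mark for outstanding write-behind requests (behind_writes_used) and export it as a new max_backlog_used sysfs attribute; writing anything to that file resets the counter. A minimal userspace sketch, assuming an array md0 with a write-intent bitmap and the usual /sys/block/<dev>/md/bitmap/ layout:

/* Read the write-behind high-water mark, then reset it. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/md0/md/bitmap/max_backlog_used";
	char line[32];
	FILE *f = fopen(path, "r");

	if (f && fgets(line, sizeof(line), f))
		printf("max write-behind backlog seen: %s", line);
	if (f)
		fclose(f);

	f = fopen(path, "w");		/* any write resets the counter */
	if (f) {
		fputs("0\n", f);
		fclose(f);
	}
	return 0;
}
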
drivers/md/bitmap.h | +2
···
 	int allclean;
 
 	atomic_t behind_writes;
+	unsigned long behind_writes_used; /* highest actual value at runtime */
 
 	/*
 	 * the bitmap daemon - periodically wakes up and sweeps the bitmap
···
 	atomic_t pending_writes; /* pending writes to the bitmap file */
 	wait_queue_head_t write_wait;
 	wait_queue_head_t overflow_wait;
+	wait_queue_head_t behind_wait;
 
 	struct sysfs_dirent *sysfs_can_clear;
 };
drivers/md/faulty.c | +4 -5
···
 		conf->nfaults = n+1;
 }
 
-static int make_request(struct request_queue *q, struct bio *bio)
+static int make_request(mddev_t *mddev, struct bio *bio)
 {
-	mddev_t *mddev = q->queuedata;
-	conf_t *conf = (conf_t*)mddev->private;
+	conf_t *conf = mddev->private;
 	int failit = 0;
 
 	if (bio_data_dir(bio) == WRITE) {
···
 
 static void status(struct seq_file *seq, mddev_t *mddev)
 {
-	conf_t *conf = (conf_t*)mddev->private;
+	conf_t *conf = mddev->private;
 	int n;
 
 	if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
···
 
 static int stop(mddev_t *mddev)
 {
-	conf_t *conf = (conf_t *)mddev->private;
+	conf_t *conf = mddev->private;
 
 	kfree(conf);
 	mddev->private = NULL;
drivers/md/linear.c | +16 -20
···
 		sector_t sectors;
 
 		if (j < 0 || j >= raid_disks || disk->rdev) {
-			printk("linear: disk numbering problem. Aborting!\n");
+			printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n",
+			       mdname(mddev));
 			goto out;
 		}
 
···
 
 	}
 	if (cnt != raid_disks) {
-		printk("linear: not enough drives present. Aborting!\n");
+		printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
+		       mdname(mddev));
 		goto out;
 	}
 
···
 	rcu_barrier();
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	kfree(conf);
+	mddev->private = NULL;
 
 	return 0;
 }
 
-static int linear_make_request (struct request_queue *q, struct bio *bio)
+static int linear_make_request (mddev_t *mddev, struct bio *bio)
 {
-	const int rw = bio_data_dir(bio);
-	mddev_t *mddev = q->queuedata;
 	dev_info_t *tmp_dev;
 	sector_t start_sector;
-	int cpu;
 
 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		md_barrier_request(mddev, bio);
 		return 0;
 	}
-
-	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
-		      bio_sectors(bio));
-	part_stat_unlock();
 
 	rcu_read_lock();
 	tmp_dev = which_dev(mddev, bio->bi_sector);
···
 		     || (bio->bi_sector < start_sector))) {
 		char b[BDEVNAME_SIZE];
 
-		printk("linear_make_request: Sector %llu out of bounds on "
-		       "dev %s: %llu sectors, offset %llu\n",
-		       (unsigned long long)bio->bi_sector,
-		       bdevname(tmp_dev->rdev->bdev, b),
-		       (unsigned long long)tmp_dev->rdev->sectors,
-		       (unsigned long long)start_sector);
+		printk(KERN_ERR
+		       "md/linear:%s: make_request: Sector %llu out of bounds on "
+		       "dev %s: %llu sectors, offset %llu\n",
+		       mdname(mddev),
+		       (unsigned long long)bio->bi_sector,
+		       bdevname(tmp_dev->rdev->bdev, b),
+		       (unsigned long long)tmp_dev->rdev->sectors,
+		       (unsigned long long)start_sector);
 		rcu_read_unlock();
 		bio_io_error(bio);
 		return 0;
···
 
 		bp = bio_split(bio, end_sector - bio->bi_sector);
 
-		if (linear_make_request(q, &bp->bio1))
+		if (linear_make_request(mddev, &bp->bio1))
 			generic_make_request(&bp->bio1);
-		if (linear_make_request(q, &bp->bio2))
+		if (linear_make_request(mddev, &bp->bio2))
 			generic_make_request(&bp->bio2);
 		bio_pair_release(bp);
 		return 0;
drivers/md/md.c | +329 -208
··· 215 215 */ 216 216 static int md_make_request(struct request_queue *q, struct bio *bio) 217 217 { 218 + const int rw = bio_data_dir(bio); 218 219 mddev_t *mddev = q->queuedata; 219 220 int rv; 221 + int cpu; 222 + 220 223 if (mddev == NULL || mddev->pers == NULL) { 221 224 bio_io_error(bio); 222 225 return 0; ··· 240 237 } 241 238 atomic_inc(&mddev->active_io); 242 239 rcu_read_unlock(); 243 - rv = mddev->pers->make_request(q, bio); 240 + 241 + rv = mddev->pers->make_request(mddev, bio); 242 + 243 + cpu = part_stat_lock(); 244 + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 245 + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 246 + bio_sectors(bio)); 247 + part_stat_unlock(); 248 + 244 249 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 245 250 wake_up(&mddev->sb_wait); 246 251 247 252 return rv; 248 253 } 249 254 255 + /* mddev_suspend makes sure no new requests are submitted 256 + * to the device, and that any requests that have been submitted 257 + * are completely handled. 258 + * Once ->stop is called and completes, the module will be completely 259 + * unused. 260 + */ 250 261 static void mddev_suspend(mddev_t *mddev) 251 262 { 252 263 BUG_ON(mddev->suspended); ··· 268 251 synchronize_rcu(); 269 252 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 270 253 mddev->pers->quiesce(mddev, 1); 271 - md_unregister_thread(mddev->thread); 272 - mddev->thread = NULL; 273 - /* we now know that no code is executing in the personality module, 274 - * except possibly the tail end of a ->bi_end_io function, but that 275 - * is certain to complete before the module has a chance to get 276 - * unloaded 277 - */ 278 254 } 279 255 280 256 static void mddev_resume(mddev_t *mddev) ··· 354 344 bio_endio(bio, 0); 355 345 else { 356 346 bio->bi_rw &= ~(1<<BIO_RW_BARRIER); 357 - if (mddev->pers->make_request(mddev->queue, bio)) 347 + if (mddev->pers->make_request(mddev, bio)) 358 348 generic_make_request(bio); 359 349 mddev->barrier = POST_REQUEST_BARRIER; 360 350 submit_barriers(mddev); ··· 414 404 kfree(mddev); 415 405 } 416 406 spin_unlock(&all_mddevs_lock); 407 + } 408 + 409 + static void mddev_init(mddev_t *mddev) 410 + { 411 + mutex_init(&mddev->open_mutex); 412 + mutex_init(&mddev->reconfig_mutex); 413 + mutex_init(&mddev->bitmap_info.mutex); 414 + INIT_LIST_HEAD(&mddev->disks); 415 + INIT_LIST_HEAD(&mddev->all_mddevs); 416 + init_timer(&mddev->safemode_timer); 417 + atomic_set(&mddev->active, 1); 418 + atomic_set(&mddev->openers, 0); 419 + atomic_set(&mddev->active_io, 0); 420 + spin_lock_init(&mddev->write_lock); 421 + atomic_set(&mddev->flush_pending, 0); 422 + init_waitqueue_head(&mddev->sb_wait); 423 + init_waitqueue_head(&mddev->recovery_wait); 424 + mddev->reshape_position = MaxSector; 425 + mddev->resync_min = 0; 426 + mddev->resync_max = MaxSector; 427 + mddev->level = LEVEL_NONE; 417 428 } 418 429 419 430 static mddev_t * mddev_find(dev_t unit) ··· 503 472 else 504 473 new->md_minor = MINOR(unit) >> MdpMinorShift; 505 474 506 - mutex_init(&new->open_mutex); 507 - mutex_init(&new->reconfig_mutex); 508 - mutex_init(&new->bitmap_info.mutex); 509 - INIT_LIST_HEAD(&new->disks); 510 - INIT_LIST_HEAD(&new->all_mddevs); 511 - init_timer(&new->safemode_timer); 512 - atomic_set(&new->active, 1); 513 - atomic_set(&new->openers, 0); 514 - atomic_set(&new->active_io, 0); 515 - spin_lock_init(&new->write_lock); 516 - atomic_set(&new->flush_pending, 0); 517 - init_waitqueue_head(&new->sb_wait); 518 - init_waitqueue_head(&new->recovery_wait); 519 - 
new->reshape_position = MaxSector; 520 - new->resync_min = 0; 521 - new->resync_max = MaxSector; 522 - new->level = LEVEL_NONE; 475 + mddev_init(new); 523 476 524 477 goto retry; 525 478 } ··· 523 508 return mutex_trylock(&mddev->reconfig_mutex); 524 509 } 525 510 526 - static inline void mddev_unlock(mddev_t * mddev) 511 + static struct attribute_group md_redundancy_group; 512 + 513 + static void mddev_unlock(mddev_t * mddev) 527 514 { 528 - mutex_unlock(&mddev->reconfig_mutex); 515 + if (mddev->to_remove) { 516 + /* These cannot be removed under reconfig_mutex as 517 + * an access to the files will try to take reconfig_mutex 518 + * while holding the file unremovable, which leads to 519 + * a deadlock. 520 + * So hold open_mutex instead - we are allowed to take 521 + * it while holding reconfig_mutex, and md_run can 522 + * use it to wait for the remove to complete. 523 + */ 524 + struct attribute_group *to_remove = mddev->to_remove; 525 + mddev->to_remove = NULL; 526 + mutex_lock(&mddev->open_mutex); 527 + mutex_unlock(&mddev->reconfig_mutex); 528 + 529 + if (to_remove != &md_redundancy_group) 530 + sysfs_remove_group(&mddev->kobj, to_remove); 531 + if (mddev->pers == NULL || 532 + mddev->pers->sync_request == NULL) { 533 + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 534 + if (mddev->sysfs_action) 535 + sysfs_put(mddev->sysfs_action); 536 + mddev->sysfs_action = NULL; 537 + } 538 + mutex_unlock(&mddev->open_mutex); 539 + } else 540 + mutex_unlock(&mddev->reconfig_mutex); 529 541 530 542 md_wakeup_thread(mddev->thread); 531 543 } ··· 1071 1029 mddev->bitmap_info.default_offset; 1072 1030 1073 1031 } else if (mddev->pers == NULL) { 1074 - /* Insist on good event counter while assembling */ 1032 + /* Insist on good event counter while assembling, except 1033 + * for spares (which don't need an event count) */ 1075 1034 ++ev1; 1076 - if (ev1 < mddev->events) 1077 - return -EINVAL; 1035 + if (sb->disks[rdev->desc_nr].state & ( 1036 + (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1037 + if (ev1 < mddev->events) 1038 + return -EINVAL; 1078 1039 } else if (mddev->bitmap) { 1079 1040 /* if adding to array with a bitmap, then we can accept an 1080 1041 * older device ... but not too old. ··· 1473 1428 } 1474 1429 1475 1430 } else if (mddev->pers == NULL) { 1476 - /* Insist of good event counter while assembling */ 1431 + /* Insist of good event counter while assembling, except for 1432 + * spares (which don't need an event count) */ 1477 1433 ++ev1; 1478 - if (ev1 < mddev->events) 1479 - return -EINVAL; 1434 + if (rdev->desc_nr >= 0 && 1435 + rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1436 + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) 1437 + if (ev1 < mddev->events) 1438 + return -EINVAL; 1480 1439 } else if (mddev->bitmap) { 1481 1440 /* If adding to array with a bitmap, then we can accept an 1482 1441 * older device, but not too old. 
··· 2096 2047 if (rdev->sb_events == mddev->events || 2097 2048 (nospares && 2098 2049 rdev->raid_disk < 0 && 2099 - (rdev->sb_events&1)==0 && 2100 2050 rdev->sb_events+1 == mddev->events)) { 2101 2051 /* Don't update this superblock */ 2102 2052 rdev->sb_loaded = 2; ··· 2148 2100 * and 'events' is odd, we can roll back to the previous clean state */ 2149 2101 if (nospares 2150 2102 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2151 - && (mddev->events & 1) 2152 - && mddev->events != 1) 2103 + && mddev->can_decrease_events 2104 + && mddev->events != 1) { 2153 2105 mddev->events--; 2154 - else { 2106 + mddev->can_decrease_events = 0; 2107 + } else { 2155 2108 /* otherwise we have to go forward and ... */ 2156 2109 mddev->events ++; 2157 - if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 2158 - /* .. if the array isn't clean, an 'even' event must also go 2159 - * to spares. */ 2160 - if ((mddev->events&1)==0) { 2161 - nospares = 0; 2162 - sync_req = 2; /* force a second update to get the 2163 - * even/odd in sync */ 2164 - } 2165 - } else { 2166 - /* otherwise an 'odd' event must go to spares */ 2167 - if ((mddev->events&1)) { 2168 - nospares = 0; 2169 - sync_req = 2; /* force a second update to get the 2170 - * even/odd in sync */ 2171 - } 2172 - } 2110 + mddev->can_decrease_events = nospares; 2173 2111 } 2174 2112 2175 2113 if (!mddev->events) { ··· 2399 2365 return err; 2400 2366 sprintf(nm, "rd%d", rdev->raid_disk); 2401 2367 sysfs_remove_link(&rdev->mddev->kobj, nm); 2368 + rdev->raid_disk = -1; 2402 2369 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2403 2370 md_wakeup_thread(rdev->mddev->thread); 2404 2371 } else if (rdev->mddev->pers) { ··· 2815 2780 2816 2781 i = 0; 2817 2782 rdev_for_each(rdev, tmp, mddev) { 2818 - if (rdev->desc_nr >= mddev->max_disks || 2819 - i > mddev->max_disks) { 2783 + if (mddev->max_disks && 2784 + (rdev->desc_nr >= mddev->max_disks || 2785 + i > mddev->max_disks)) { 2820 2786 printk(KERN_WARNING 2821 2787 "md: %s: %s: only %d devices permitted\n", 2822 2788 mdname(mddev), bdevname(rdev->bdev, b), ··· 2933 2897 static ssize_t 2934 2898 level_store(mddev_t *mddev, const char *buf, size_t len) 2935 2899 { 2936 - char level[16]; 2900 + char clevel[16]; 2937 2901 ssize_t rv = len; 2938 2902 struct mdk_personality *pers; 2903 + long level; 2939 2904 void *priv; 2940 2905 mdk_rdev_t *rdev; 2941 2906 ··· 2969 2932 } 2970 2933 2971 2934 /* Now find the new personality */ 2972 - if (len == 0 || len >= sizeof(level)) 2935 + if (len == 0 || len >= sizeof(clevel)) 2973 2936 return -EINVAL; 2974 - strncpy(level, buf, len); 2975 - if (level[len-1] == '\n') 2937 + strncpy(clevel, buf, len); 2938 + if (clevel[len-1] == '\n') 2976 2939 len--; 2977 - level[len] = 0; 2940 + clevel[len] = 0; 2941 + if (strict_strtol(clevel, 10, &level)) 2942 + level = LEVEL_NONE; 2978 2943 2979 - request_module("md-%s", level); 2944 + if (request_module("md-%s", clevel) != 0) 2945 + request_module("md-level-%s", clevel); 2980 2946 spin_lock(&pers_lock); 2981 - pers = find_pers(LEVEL_NONE, level); 2947 + pers = find_pers(level, clevel); 2982 2948 if (!pers || !try_module_get(pers->owner)) { 2983 2949 spin_unlock(&pers_lock); 2984 - printk(KERN_WARNING "md: personality %s not loaded\n", level); 2950 + printk(KERN_WARNING "md: personality %s not loaded\n", clevel); 2985 2951 return -EINVAL; 2986 2952 } 2987 2953 spin_unlock(&pers_lock); ··· 2997 2957 if (!pers->takeover) { 2998 2958 module_put(pers->owner); 2999 2959 printk(KERN_WARNING "md: %s: %s does 
not support personality takeover\n", 3000 - mdname(mddev), level); 2960 + mdname(mddev), clevel); 3001 2961 return -EINVAL; 3002 2962 } 3003 2963 ··· 3013 2973 mddev->delta_disks = 0; 3014 2974 module_put(pers->owner); 3015 2975 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3016 - mdname(mddev), level); 2976 + mdname(mddev), clevel); 3017 2977 return PTR_ERR(priv); 3018 2978 } 3019 2979 3020 2980 /* Looks like we have a winner */ 3021 2981 mddev_suspend(mddev); 3022 2982 mddev->pers->stop(mddev); 2983 + 2984 + if (mddev->pers->sync_request == NULL && 2985 + pers->sync_request != NULL) { 2986 + /* need to add the md_redundancy_group */ 2987 + if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 2988 + printk(KERN_WARNING 2989 + "md: cannot register extra attributes for %s\n", 2990 + mdname(mddev)); 2991 + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); 2992 + } 2993 + if (mddev->pers->sync_request != NULL && 2994 + pers->sync_request == NULL) { 2995 + /* need to remove the md_redundancy_group */ 2996 + if (mddev->to_remove == NULL) 2997 + mddev->to_remove = &md_redundancy_group; 2998 + } 2999 + 3000 + if (mddev->pers->sync_request == NULL && 3001 + mddev->external) { 3002 + /* We are converting from a no-redundancy array 3003 + * to a redundancy array and metadata is managed 3004 + * externally so we need to be sure that writes 3005 + * won't block due to a need to transition 3006 + * clean->dirty 3007 + * until external management is started. 3008 + */ 3009 + mddev->in_sync = 0; 3010 + mddev->safemode_delay = 0; 3011 + mddev->safemode = 0; 3012 + } 3013 + 3023 3014 module_put(mddev->pers->owner); 3024 3015 /* Invalidate devices that are now superfluous */ 3025 3016 list_for_each_entry(rdev, &mddev->disks, same_set) ··· 3065 2994 mddev->layout = mddev->new_layout; 3066 2995 mddev->chunk_sectors = mddev->new_chunk_sectors; 3067 2996 mddev->delta_disks = 0; 2997 + if (mddev->pers->sync_request == NULL) { 2998 + /* this is now an array without redundancy, so 2999 + * it must always be in_sync 3000 + */ 3001 + mddev->in_sync = 1; 3002 + del_timer_sync(&mddev->safemode_timer); 3003 + } 3068 3004 pers->run(mddev); 3069 3005 mddev_resume(mddev); 3070 3006 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3071 3007 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3072 3008 md_wakeup_thread(mddev->thread); 3009 + sysfs_notify(&mddev->kobj, NULL, "level"); 3010 + md_new_event(mddev); 3073 3011 return rv; 3074 3012 } 3075 3013 ··· 3317 3237 } 3318 3238 3319 3239 static int do_md_stop(mddev_t * mddev, int ro, int is_open); 3240 + static int md_set_readonly(mddev_t * mddev, int is_open); 3320 3241 static int do_md_run(mddev_t * mddev); 3321 3242 static int restart_array(mddev_t *mddev); 3322 3243 ··· 3348 3267 break; /* not supported yet */ 3349 3268 case readonly: 3350 3269 if (mddev->pers) 3351 - err = do_md_stop(mddev, 1, 0); 3270 + err = md_set_readonly(mddev, 0); 3352 3271 else { 3353 3272 mddev->ro = 1; 3354 3273 set_disk_ro(mddev->gendisk, 1); ··· 3358 3277 case read_auto: 3359 3278 if (mddev->pers) { 3360 3279 if (mddev->ro == 0) 3361 - err = do_md_stop(mddev, 1, 0); 3280 + err = md_set_readonly(mddev, 0); 3362 3281 else if (mddev->ro == 1) 3363 3282 err = restart_array(mddev); 3364 3283 if (err == 0) { ··· 4163 4082 { 4164 4083 mddev_t *mddev = container_of(ws, mddev_t, del_work); 4165 4084 4166 - if (mddev->private) { 4167 - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 4168 - if (mddev->private != (void*)1) 4169 - sysfs_remove_group(&mddev->kobj, 
mddev->private); 4170 - if (mddev->sysfs_action) 4171 - sysfs_put(mddev->sysfs_action); 4172 - mddev->sysfs_action = NULL; 4173 - mddev->private = NULL; 4174 - } 4175 4085 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4176 4086 kobject_del(&mddev->kobj); 4177 4087 kobject_put(&mddev->kobj); ··· 4306 4234 4307 4235 static int start_dirty_degraded; 4308 4236 4309 - static int do_md_run(mddev_t * mddev) 4237 + static int md_run(mddev_t *mddev) 4310 4238 { 4311 4239 int err; 4312 4240 mdk_rdev_t *rdev; 4313 - struct gendisk *disk; 4314 4241 struct mdk_personality *pers; 4315 4242 4316 4243 if (list_empty(&mddev->disks)) ··· 4318 4247 4319 4248 if (mddev->pers) 4320 4249 return -EBUSY; 4250 + 4251 + /* These two calls synchronise us with the 4252 + * sysfs_remove_group calls in mddev_unlock, 4253 + * so they must have completed. 4254 + */ 4255 + mutex_lock(&mddev->open_mutex); 4256 + mutex_unlock(&mddev->open_mutex); 4321 4257 4322 4258 /* 4323 4259 * Analyze all RAID superblock(s) ··· 4373 4295 } 4374 4296 sysfs_notify_dirent(rdev->sysfs_state); 4375 4297 } 4376 - 4377 - disk = mddev->gendisk; 4378 4298 4379 4299 spin_lock(&pers_lock); 4380 4300 pers = find_pers(mddev->level, mddev->clevel); ··· 4501 4425 if (mddev->flags) 4502 4426 md_update_sb(mddev, 0); 4503 4427 4504 - set_capacity(disk, mddev->array_sectors); 4505 - 4506 4428 md_wakeup_thread(mddev->thread); 4507 4429 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4508 4430 4509 - revalidate_disk(mddev->gendisk); 4510 - mddev->changed = 1; 4511 4431 md_new_event(mddev); 4512 4432 sysfs_notify_dirent(mddev->sysfs_state); 4513 4433 if (mddev->sysfs_action) 4514 4434 sysfs_notify_dirent(mddev->sysfs_action); 4515 4435 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4516 - kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4517 4436 return 0; 4437 + } 4438 + 4439 + static int do_md_run(mddev_t *mddev) 4440 + { 4441 + int err; 4442 + 4443 + err = md_run(mddev); 4444 + if (err) 4445 + goto out; 4446 + 4447 + set_capacity(mddev->gendisk, mddev->array_sectors); 4448 + revalidate_disk(mddev->gendisk); 4449 + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4450 + out: 4451 + return err; 4518 4452 } 4519 4453 4520 4454 static int restart_array(mddev_t *mddev) ··· 4577 4491 spin_unlock(&inode->i_lock); 4578 4492 } 4579 4493 4494 + static void md_clean(mddev_t *mddev) 4495 + { 4496 + mddev->array_sectors = 0; 4497 + mddev->external_size = 0; 4498 + mddev->dev_sectors = 0; 4499 + mddev->raid_disks = 0; 4500 + mddev->recovery_cp = 0; 4501 + mddev->resync_min = 0; 4502 + mddev->resync_max = MaxSector; 4503 + mddev->reshape_position = MaxSector; 4504 + mddev->external = 0; 4505 + mddev->persistent = 0; 4506 + mddev->level = LEVEL_NONE; 4507 + mddev->clevel[0] = 0; 4508 + mddev->flags = 0; 4509 + mddev->ro = 0; 4510 + mddev->metadata_type[0] = 0; 4511 + mddev->chunk_sectors = 0; 4512 + mddev->ctime = mddev->utime = 0; 4513 + mddev->layout = 0; 4514 + mddev->max_disks = 0; 4515 + mddev->events = 0; 4516 + mddev->can_decrease_events = 0; 4517 + mddev->delta_disks = 0; 4518 + mddev->new_level = LEVEL_NONE; 4519 + mddev->new_layout = 0; 4520 + mddev->new_chunk_sectors = 0; 4521 + mddev->curr_resync = 0; 4522 + mddev->resync_mismatches = 0; 4523 + mddev->suspend_lo = mddev->suspend_hi = 0; 4524 + mddev->sync_speed_min = mddev->sync_speed_max = 0; 4525 + mddev->recovery = 0; 4526 + mddev->in_sync = 0; 4527 + mddev->degraded = 0; 4528 + mddev->barriers_work = 0; 4529 + mddev->safemode = 0; 4530 + 
mddev->bitmap_info.offset = 0; 4531 + mddev->bitmap_info.default_offset = 0; 4532 + mddev->bitmap_info.chunksize = 0; 4533 + mddev->bitmap_info.daemon_sleep = 0; 4534 + mddev->bitmap_info.max_write_behind = 0; 4535 + } 4536 + 4537 + static void md_stop_writes(mddev_t *mddev) 4538 + { 4539 + if (mddev->sync_thread) { 4540 + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4541 + set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4542 + md_unregister_thread(mddev->sync_thread); 4543 + mddev->sync_thread = NULL; 4544 + } 4545 + 4546 + del_timer_sync(&mddev->safemode_timer); 4547 + 4548 + bitmap_flush(mddev); 4549 + md_super_wait(mddev); 4550 + 4551 + if (!mddev->in_sync || mddev->flags) { 4552 + /* mark array as shutdown cleanly */ 4553 + mddev->in_sync = 1; 4554 + md_update_sb(mddev, 1); 4555 + } 4556 + } 4557 + 4558 + static void md_stop(mddev_t *mddev) 4559 + { 4560 + md_stop_writes(mddev); 4561 + 4562 + mddev->pers->stop(mddev); 4563 + if (mddev->pers->sync_request && mddev->to_remove == NULL) 4564 + mddev->to_remove = &md_redundancy_group; 4565 + module_put(mddev->pers->owner); 4566 + mddev->pers = NULL; 4567 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4568 + } 4569 + 4570 + static int md_set_readonly(mddev_t *mddev, int is_open) 4571 + { 4572 + int err = 0; 4573 + mutex_lock(&mddev->open_mutex); 4574 + if (atomic_read(&mddev->openers) > is_open) { 4575 + printk("md: %s still in use.\n",mdname(mddev)); 4576 + err = -EBUSY; 4577 + goto out; 4578 + } 4579 + if (mddev->pers) { 4580 + md_stop_writes(mddev); 4581 + 4582 + err = -ENXIO; 4583 + if (mddev->ro==1) 4584 + goto out; 4585 + mddev->ro = 1; 4586 + set_disk_ro(mddev->gendisk, 1); 4587 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4588 + sysfs_notify_dirent(mddev->sysfs_state); 4589 + err = 0; 4590 + } 4591 + out: 4592 + mutex_unlock(&mddev->open_mutex); 4593 + return err; 4594 + } 4595 + 4580 4596 /* mode: 4581 4597 * 0 - completely stop and dis-assemble array 4582 - * 1 - switch to readonly 4583 4598 * 2 - stop but do not disassemble array 4584 4599 */ 4585 4600 static int do_md_stop(mddev_t * mddev, int mode, int is_open) ··· 4695 4508 err = -EBUSY; 4696 4509 } else if (mddev->pers) { 4697 4510 4698 - if (mddev->sync_thread) { 4699 - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4700 - set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4701 - md_unregister_thread(mddev->sync_thread); 4702 - mddev->sync_thread = NULL; 4703 - } 4511 + if (mddev->ro) 4512 + set_disk_ro(disk, 0); 4704 4513 4705 - del_timer_sync(&mddev->safemode_timer); 4514 + md_stop(mddev); 4515 + mddev->queue->merge_bvec_fn = NULL; 4516 + mddev->queue->unplug_fn = NULL; 4517 + mddev->queue->backing_dev_info.congested_fn = NULL; 4706 4518 4707 - switch(mode) { 4708 - case 1: /* readonly */ 4709 - err = -ENXIO; 4710 - if (mddev->ro==1) 4711 - goto out; 4712 - mddev->ro = 1; 4713 - break; 4714 - case 0: /* disassemble */ 4715 - case 2: /* stop */ 4716 - bitmap_flush(mddev); 4717 - md_super_wait(mddev); 4718 - if (mddev->ro) 4719 - set_disk_ro(disk, 0); 4519 + /* tell userspace to handle 'inactive' */ 4520 + sysfs_notify_dirent(mddev->sysfs_state); 4720 4521 4721 - mddev->pers->stop(mddev); 4722 - mddev->queue->merge_bvec_fn = NULL; 4723 - mddev->queue->unplug_fn = NULL; 4724 - mddev->queue->backing_dev_info.congested_fn = NULL; 4725 - module_put(mddev->pers->owner); 4726 - if (mddev->pers->sync_request && mddev->private == NULL) 4727 - mddev->private = (void*)1; 4728 - mddev->pers = NULL; 4729 - /* tell userspace to handle 'inactive' */ 4730 - 
sysfs_notify_dirent(mddev->sysfs_state); 4522 + list_for_each_entry(rdev, &mddev->disks, same_set) 4523 + if (rdev->raid_disk >= 0) { 4524 + char nm[20]; 4525 + sprintf(nm, "rd%d", rdev->raid_disk); 4526 + sysfs_remove_link(&mddev->kobj, nm); 4527 + } 4731 4528 4732 - list_for_each_entry(rdev, &mddev->disks, same_set) 4733 - if (rdev->raid_disk >= 0) { 4734 - char nm[20]; 4735 - sprintf(nm, "rd%d", rdev->raid_disk); 4736 - sysfs_remove_link(&mddev->kobj, nm); 4737 - } 4529 + set_capacity(disk, 0); 4530 + revalidate_disk(disk); 4738 4531 4739 - set_capacity(disk, 0); 4740 - mddev->changed = 1; 4741 - 4742 - if (mddev->ro) 4743 - mddev->ro = 0; 4744 - } 4745 - if (!mddev->in_sync || mddev->flags) { 4746 - /* mark array as shutdown cleanly */ 4747 - mddev->in_sync = 1; 4748 - md_update_sb(mddev, 1); 4749 - } 4750 - if (mode == 1) 4751 - set_disk_ro(disk, 1); 4752 - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4532 + if (mddev->ro) 4533 + mddev->ro = 0; 4534 + 4753 4535 err = 0; 4754 4536 } 4755 - out: 4756 4537 mutex_unlock(&mddev->open_mutex); 4757 4538 if (err) 4758 4539 return err; ··· 4741 4586 4742 4587 export_array(mddev); 4743 4588 4744 - mddev->array_sectors = 0; 4745 - mddev->external_size = 0; 4746 - mddev->dev_sectors = 0; 4747 - mddev->raid_disks = 0; 4748 - mddev->recovery_cp = 0; 4749 - mddev->resync_min = 0; 4750 - mddev->resync_max = MaxSector; 4751 - mddev->reshape_position = MaxSector; 4752 - mddev->external = 0; 4753 - mddev->persistent = 0; 4754 - mddev->level = LEVEL_NONE; 4755 - mddev->clevel[0] = 0; 4756 - mddev->flags = 0; 4757 - mddev->ro = 0; 4758 - mddev->metadata_type[0] = 0; 4759 - mddev->chunk_sectors = 0; 4760 - mddev->ctime = mddev->utime = 0; 4761 - mddev->layout = 0; 4762 - mddev->max_disks = 0; 4763 - mddev->events = 0; 4764 - mddev->delta_disks = 0; 4765 - mddev->new_level = LEVEL_NONE; 4766 - mddev->new_layout = 0; 4767 - mddev->new_chunk_sectors = 0; 4768 - mddev->curr_resync = 0; 4769 - mddev->resync_mismatches = 0; 4770 - mddev->suspend_lo = mddev->suspend_hi = 0; 4771 - mddev->sync_speed_min = mddev->sync_speed_max = 0; 4772 - mddev->recovery = 0; 4773 - mddev->in_sync = 0; 4774 - mddev->changed = 0; 4775 - mddev->degraded = 0; 4776 - mddev->barriers_work = 0; 4777 - mddev->safemode = 0; 4778 - mddev->bitmap_info.offset = 0; 4779 - mddev->bitmap_info.default_offset = 0; 4780 - mddev->bitmap_info.chunksize = 0; 4781 - mddev->bitmap_info.daemon_sleep = 0; 4782 - mddev->bitmap_info.max_write_behind = 0; 4589 + md_clean(mddev); 4783 4590 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4784 4591 if (mddev->hold_active == UNTIL_STOP) 4785 4592 mddev->hold_active = 0; 4786 4593 4787 - } else if (mddev->pers) 4788 - printk(KERN_INFO "md: %s switched to read-only mode.\n", 4789 - mdname(mddev)); 4594 + } 4790 4595 err = 0; 4791 4596 blk_integrity_unregister(disk); 4792 4597 md_new_event(mddev); ··· 5464 5349 if (mddev->pers->check_reshape == NULL) 5465 5350 return -EINVAL; 5466 5351 if (raid_disks <= 0 || 5467 - raid_disks >= mddev->max_disks) 5352 + (mddev->max_disks && raid_disks >= mddev->max_disks)) 5468 5353 return -EINVAL; 5469 5354 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5470 5355 return -EBUSY; ··· 5601 5486 5602 5487 geo->heads = 2; 5603 5488 geo->sectors = 4; 5604 - geo->cylinders = get_capacity(mddev->gendisk) / 8; 5489 + geo->cylinders = mddev->array_sectors / 8; 5605 5490 return 0; 5606 5491 } 5607 5492 ··· 5611 5496 int err = 0; 5612 5497 void __user *argp = (void __user *)arg; 5613 5498 mddev_t *mddev 
= NULL; 5499 + int ro; 5614 5500 5615 5501 if (!capable(CAP_SYS_ADMIN)) 5616 5502 return -EACCES; ··· 5744 5628 goto done_unlock; 5745 5629 5746 5630 case STOP_ARRAY_RO: 5747 - err = do_md_stop(mddev, 1, 1); 5631 + err = md_set_readonly(mddev, 1); 5748 5632 goto done_unlock; 5749 5633 5634 + case BLKROSET: 5635 + if (get_user(ro, (int __user *)(arg))) { 5636 + err = -EFAULT; 5637 + goto done_unlock; 5638 + } 5639 + err = -EINVAL; 5640 + 5641 + /* if the bdev is going readonly the value of mddev->ro 5642 + * does not matter, no writes are coming 5643 + */ 5644 + if (ro) 5645 + goto done_unlock; 5646 + 5647 + /* are we are already prepared for writes? */ 5648 + if (mddev->ro != 1) 5649 + goto done_unlock; 5650 + 5651 + /* transitioning to readauto need only happen for 5652 + * arrays that call md_write_start 5653 + */ 5654 + if (mddev->pers) { 5655 + err = restart_array(mddev); 5656 + if (err == 0) { 5657 + mddev->ro = 2; 5658 + set_disk_ro(mddev->gendisk, 0); 5659 + } 5660 + } 5661 + goto done_unlock; 5750 5662 } 5751 5663 5752 5664 /* ··· 5895 5751 atomic_inc(&mddev->openers); 5896 5752 mutex_unlock(&mddev->open_mutex); 5897 5753 5898 - check_disk_change(bdev); 5899 5754 out: 5900 5755 return err; 5901 5756 } ··· 5909 5766 5910 5767 return 0; 5911 5768 } 5912 - 5913 - static int md_media_changed(struct gendisk *disk) 5914 - { 5915 - mddev_t *mddev = disk->private_data; 5916 - 5917 - return mddev->changed; 5918 - } 5919 - 5920 - static int md_revalidate(struct gendisk *disk) 5921 - { 5922 - mddev_t *mddev = disk->private_data; 5923 - 5924 - mddev->changed = 0; 5925 - return 0; 5926 - } 5927 5769 static const struct block_device_operations md_fops = 5928 5770 { 5929 5771 .owner = THIS_MODULE, ··· 5919 5791 .compat_ioctl = md_compat_ioctl, 5920 5792 #endif 5921 5793 .getgeo = md_getgeo, 5922 - .media_changed = md_media_changed, 5923 - .revalidate_disk= md_revalidate, 5924 5794 }; 5925 5795 5926 5796 static int md_thread(void * arg) ··· 6032 5906 mddev->pers->error_handler(mddev,rdev); 6033 5907 if (mddev->degraded) 6034 5908 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6035 - set_bit(StateChanged, &rdev->flags); 5909 + sysfs_notify_dirent(rdev->sysfs_state); 6036 5910 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6037 5911 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6038 5912 md_wakeup_thread(mddev->thread); ··· 7024 6898 if (mddev->flags) 7025 6899 md_update_sb(mddev, 0); 7026 6900 7027 - list_for_each_entry(rdev, &mddev->disks, same_set) 7028 - if (test_and_clear_bit(StateChanged, &rdev->flags)) 7029 - sysfs_notify_dirent(rdev->sysfs_state); 7030 - 7031 - 7032 6901 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 7033 6902 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 7034 6903 /* resync/recovery still happening */ ··· 7160 7039 * appears to still be in use. Hence 7161 7040 * the '100'. 7162 7041 */ 7163 - do_md_stop(mddev, 1, 100); 7042 + md_set_readonly(mddev, 100); 7164 7043 mddev_unlock(mddev); 7165 7044 } 7166 7045 /*
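
One of the md.c additions above is a BLKROSET case in md_ioctl(): clearing the block-device read-only flag on an array whose mddev->ro is 1 restarts the array and leaves it in read-auto mode (mddev->ro = 2). A minimal userspace sketch of the ioctl that now reaches md (the device path is an illustrative assumption, and the caller needs CAP_SYS_ADMIN):

/* Ask the block layer to mark /dev/md0 writable; md turns this into read-auto. */
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int ro = 0;			/* 0 = clear the read-only flag */
	int fd = open("/dev/md0", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BLKROSET, &ro) < 0)
		perror("BLKROSET");
	close(fd);
	return 0;
}
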
drivers/md/md.h | +8 -8
···
 #define	Blocked		8	/* An error occured on an externally
 				 * managed array, don't allow writes
 				 * until it is cleared */
-#define	StateChanged	9	/* Faulty or Blocked has changed during
-				 * interrupt, so it needs to be
-				 * notified by the thread */
 	wait_queue_head_t blocked_wait;
 
 	int desc_nr;		/* descriptor index in the superblock */
···
 	int			external_size; /* size managed
 						* externally */
 	__u64			events;
+	/* If the last 'event' was simply a clean->dirty transition, and
+	 * we didn't write it to the spares, then it is safe and simple
+	 * to just decrement the event count on a dirty->clean transition.
+	 * So we record that possibility here.
+	 */
+	int			can_decrease_events;
 
 	char			uuid[16];
 
···
 	atomic_t		active;		/* general refcount */
 	atomic_t		openers;	/* number of active opens */
 
-	int			changed;	/* true if we might need to reread partition info */
 	int			degraded;	/* whether md should consider
 						 * adding a spare
 						 */
···
 	atomic_t		writes_pending;
 	struct request_queue	*queue;	/* for plugging ... */
 
-	atomic_t		write_behind; /* outstanding async IO */
-	unsigned int		max_write_behind; /* 0 = sync */
-
 	struct bitmap		*bitmap; /* the bitmap for the device */
 	struct {
 		struct file	*file; /* the bitmap file */
···
 	atomic_t		max_corr_read_errors; /* max read retries */
 	struct list_head	all_mddevs;
 
+	struct attribute_group	*to_remove;
 	/* Generic barrier handling.
 	 * If there is a pending barrier request, all other
 	 * writes are blocked while the devices are flushed.
···
 	int level;
 	struct list_head list;
 	struct module *owner;
-	int (*make_request)(struct request_queue *q, struct bio *bio);
+	int (*make_request)(mddev_t *mddev, struct bio *bio);
 	int (*run)(mddev_t *mddev);
 	int (*stop)(mddev_t *mddev);
 	void (*status)(struct seq_file *seq, mddev_t *mddev);
drivers/md/multipath.c | +2 -11
···
 static void multipath_end_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private);
+	struct multipath_bh *mp_bh = bio->bi_private;
 	multipath_conf_t *conf = mp_bh->mddev->private;
 	mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
 
···
 }
 
 
-static int multipath_make_request (struct request_queue *q, struct bio * bio)
+static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 {
-	mddev_t *mddev = q->queuedata;
 	multipath_conf_t *conf = mddev->private;
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
-	const int rw = bio_data_dir(bio);
-	int cpu;
 
 	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
 		md_barrier_request(mddev, bio);
···
 
 	mp_bh->master_bio = bio;
 	mp_bh->mddev = mddev;
-
-	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
-		      bio_sectors(bio));
-	part_stat_unlock();
 
 	mp_bh->path = multipath_map(conf);
 	if (mp_bh->path < 0) {
drivers/md/raid0.c | +190 -61
··· 23 23 #include <linux/slab.h> 24 24 #include "md.h" 25 25 #include "raid0.h" 26 + #include "raid5.h" 26 27 27 28 static void raid0_unplug(struct request_queue *q) 28 29 { 29 30 mddev_t *mddev = q->queuedata; 30 31 raid0_conf_t *conf = mddev->private; 31 32 mdk_rdev_t **devlist = conf->devlist; 33 + int raid_disks = conf->strip_zone[0].nb_dev; 32 34 int i; 33 35 34 - for (i=0; i<mddev->raid_disks; i++) { 36 + for (i=0; i < raid_disks; i++) { 35 37 struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev); 36 38 37 39 blk_unplug(r_queue); ··· 45 43 mddev_t *mddev = data; 46 44 raid0_conf_t *conf = mddev->private; 47 45 mdk_rdev_t **devlist = conf->devlist; 46 + int raid_disks = conf->strip_zone[0].nb_dev; 48 47 int i, ret = 0; 49 48 50 49 if (mddev_congested(mddev, bits)) 51 50 return 1; 52 51 53 - for (i = 0; i < mddev->raid_disks && !ret ; i++) { 52 + for (i = 0; i < raid_disks && !ret ; i++) { 54 53 struct request_queue *q = bdev_get_queue(devlist[i]->bdev); 55 54 56 55 ret |= bdi_congested(&q->backing_dev_info, bits); ··· 69 66 sector_t zone_start = 0; 70 67 char b[BDEVNAME_SIZE]; 71 68 raid0_conf_t *conf = mddev->private; 69 + int raid_disks = conf->strip_zone[0].nb_dev; 72 70 printk(KERN_INFO "******* %s configuration *********\n", 73 71 mdname(mddev)); 74 72 h = 0; 75 73 for (j = 0; j < conf->nr_strip_zones; j++) { 76 74 printk(KERN_INFO "zone%d=[", j); 77 75 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 78 - printk("%s/", 79 - bdevname(conf->devlist[j*mddev->raid_disks 76 + printk(KERN_CONT "%s/", 77 + bdevname(conf->devlist[j*raid_disks 80 78 + k]->bdev, b)); 81 - printk("]\n"); 79 + printk(KERN_CONT "]\n"); 82 80 83 81 zone_size = conf->strip_zone[j].zone_end - zone_start; 84 82 printk(KERN_INFO " zone offset=%llukb " ··· 92 88 printk(KERN_INFO "**********************************\n\n"); 93 89 } 94 90 95 - static int create_strip_zones(mddev_t *mddev) 91 + static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) 96 92 { 97 93 int i, c, err; 98 94 sector_t curr_zone_end, sectors; ··· 105 101 if (!conf) 106 102 return -ENOMEM; 107 103 list_for_each_entry(rdev1, &mddev->disks, same_set) { 108 - printk(KERN_INFO "raid0: looking at %s\n", 109 - bdevname(rdev1->bdev,b)); 104 + printk(KERN_INFO "md/raid0:%s: looking at %s\n", 105 + mdname(mddev), 106 + bdevname(rdev1->bdev, b)); 110 107 c = 0; 111 108 112 109 /* round size to chunk_size */ ··· 116 111 rdev1->sectors = sectors * mddev->chunk_sectors; 117 112 118 113 list_for_each_entry(rdev2, &mddev->disks, same_set) { 119 - printk(KERN_INFO "raid0: comparing %s(%llu)", 114 + printk(KERN_INFO "md/raid0:%s: comparing %s(%llu)", 115 + mdname(mddev), 120 116 bdevname(rdev1->bdev,b), 121 117 (unsigned long long)rdev1->sectors); 122 - printk(KERN_INFO " with %s(%llu)\n", 118 + printk(KERN_CONT " with %s(%llu)\n", 123 119 bdevname(rdev2->bdev,b), 124 120 (unsigned long long)rdev2->sectors); 125 121 if (rdev2 == rdev1) { 126 - printk(KERN_INFO "raid0: END\n"); 122 + printk(KERN_INFO "md/raid0:%s: END\n", 123 + mdname(mddev)); 127 124 break; 128 125 } 129 126 if (rdev2->sectors == rdev1->sectors) { ··· 133 126 * Not unique, don't count it as a new 134 127 * group 135 128 */ 136 - printk(KERN_INFO "raid0: EQUAL\n"); 129 + printk(KERN_INFO "md/raid0:%s: EQUAL\n", 130 + mdname(mddev)); 137 131 c = 1; 138 132 break; 139 133 } 140 - printk(KERN_INFO "raid0: NOT EQUAL\n"); 134 + printk(KERN_INFO "md/raid0:%s: NOT EQUAL\n", 135 + mdname(mddev)); 141 136 } 142 137 if (!c) { 143 - printk(KERN_INFO "raid0: ==> UNIQUE\n"); 138 
+ printk(KERN_INFO "md/raid0:%s: ==> UNIQUE\n", 139 + mdname(mddev)); 144 140 conf->nr_strip_zones++; 145 - printk(KERN_INFO "raid0: %d zones\n", 146 - conf->nr_strip_zones); 141 + printk(KERN_INFO "md/raid0:%s: %d zones\n", 142 + mdname(mddev), conf->nr_strip_zones); 147 143 } 148 144 } 149 - printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); 145 + printk(KERN_INFO "md/raid0:%s: FINAL %d zones\n", 146 + mdname(mddev), conf->nr_strip_zones); 150 147 err = -ENOMEM; 151 148 conf->strip_zone = kzalloc(sizeof(struct strip_zone)* 152 149 conf->nr_strip_zones, GFP_KERNEL); ··· 173 162 list_for_each_entry(rdev1, &mddev->disks, same_set) { 174 163 int j = rdev1->raid_disk; 175 164 165 + if (mddev->level == 10) 166 + /* taking over a raid10-n2 array */ 167 + j /= 2; 168 + 176 169 if (j < 0 || j >= mddev->raid_disks) { 177 - printk(KERN_ERR "raid0: bad disk number %d - " 178 - "aborting!\n", j); 170 + printk(KERN_ERR "md/raid0:%s: bad disk number %d - " 171 + "aborting!\n", mdname(mddev), j); 179 172 goto abort; 180 173 } 181 174 if (dev[j]) { 182 - printk(KERN_ERR "raid0: multiple devices for %d - " 183 - "aborting!\n", j); 175 + printk(KERN_ERR "md/raid0:%s: multiple devices for %d - " 176 + "aborting!\n", mdname(mddev), j); 184 177 goto abort; 185 178 } 186 179 dev[j] = rdev1; ··· 206 191 cnt++; 207 192 } 208 193 if (cnt != mddev->raid_disks) { 209 - printk(KERN_ERR "raid0: too few disks (%d of %d) - " 210 - "aborting!\n", cnt, mddev->raid_disks); 194 + printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " 195 + "aborting!\n", mdname(mddev), cnt, mddev->raid_disks); 211 196 goto abort; 212 197 } 213 198 zone->nb_dev = cnt; ··· 223 208 zone = conf->strip_zone + i; 224 209 dev = conf->devlist + i * mddev->raid_disks; 225 210 226 - printk(KERN_INFO "raid0: zone %d\n", i); 211 + printk(KERN_INFO "md/raid0:%s: zone %d\n", 212 + mdname(mddev), i); 227 213 zone->dev_start = smallest->sectors; 228 214 smallest = NULL; 229 215 c = 0; 230 216 231 217 for (j=0; j<cnt; j++) { 232 218 rdev = conf->devlist[j]; 233 - printk(KERN_INFO "raid0: checking %s ...", 234 - bdevname(rdev->bdev, b)); 219 + printk(KERN_INFO "md/raid0:%s: checking %s ...", 220 + mdname(mddev), 221 + bdevname(rdev->bdev, b)); 235 222 if (rdev->sectors <= zone->dev_start) { 236 - printk(KERN_INFO " nope.\n"); 223 + printk(KERN_CONT " nope.\n"); 237 224 continue; 238 225 } 239 - printk(KERN_INFO " contained as device %d\n", c); 226 + printk(KERN_CONT " contained as device %d\n", c); 240 227 dev[c] = rdev; 241 228 c++; 242 229 if (!smallest || rdev->sectors < smallest->sectors) { 243 230 smallest = rdev; 244 - printk(KERN_INFO " (%llu) is smallest!.\n", 245 - (unsigned long long)rdev->sectors); 231 + printk(KERN_INFO "md/raid0:%s: (%llu) is smallest!.\n", 232 + mdname(mddev), 233 + (unsigned long long)rdev->sectors); 246 234 } 247 235 } 248 236 249 237 zone->nb_dev = c; 250 238 sectors = (smallest->sectors - zone->dev_start) * c; 251 - printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", 252 - zone->nb_dev, (unsigned long long)sectors); 239 + printk(KERN_INFO "md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n", 240 + mdname(mddev), 241 + zone->nb_dev, (unsigned long long)sectors); 253 242 254 243 curr_zone_end += sectors; 255 244 zone->zone_end = curr_zone_end; 256 245 257 - printk(KERN_INFO "raid0: current zone start: %llu\n", 258 - (unsigned long long)smallest->sectors); 246 + printk(KERN_INFO "md/raid0:%s: current zone start: %llu\n", 247 + mdname(mddev), 248 + (unsigned long long)smallest->sectors); 259 249 } 260 
250 mddev->queue->unplug_fn = raid0_unplug; 261 251 mddev->queue->backing_dev_info.congested_fn = raid0_congested; ··· 271 251 * chunk size is a multiple of that sector size 272 252 */ 273 253 if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { 274 - printk(KERN_ERR "%s chunk_size of %d not valid\n", 254 + printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n", 275 255 mdname(mddev), 276 256 mddev->chunk_sectors << 9); 277 257 goto abort; ··· 281 261 blk_queue_io_opt(mddev->queue, 282 262 (mddev->chunk_sectors << 9) * mddev->raid_disks); 283 263 284 - printk(KERN_INFO "raid0: done.\n"); 285 - mddev->private = conf; 264 + printk(KERN_INFO "md/raid0:%s: done.\n", mdname(mddev)); 265 + *private_conf = conf; 266 + 286 267 return 0; 287 268 abort: 288 269 kfree(conf->strip_zone); 289 270 kfree(conf->devlist); 290 271 kfree(conf); 291 - mddev->private = NULL; 272 + *private_conf = NULL; 292 273 return err; 293 274 } 294 275 ··· 340 319 341 320 static int raid0_run(mddev_t *mddev) 342 321 { 322 + raid0_conf_t *conf; 343 323 int ret; 344 324 345 325 if (mddev->chunk_sectors == 0) { 346 - printk(KERN_ERR "md/raid0: chunk size must be set.\n"); 326 + printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", 327 + mdname(mddev)); 347 328 return -EINVAL; 348 329 } 349 330 if (md_check_no_bitmap(mddev)) ··· 353 330 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); 354 331 mddev->queue->queue_lock = &mddev->queue->__queue_lock; 355 332 356 - ret = create_strip_zones(mddev); 357 - if (ret < 0) 358 - return ret; 333 + /* if private is not null, we are here after takeover */ 334 + if (mddev->private == NULL) { 335 + ret = create_strip_zones(mddev, &conf); 336 + if (ret < 0) 337 + return ret; 338 + mddev->private = conf; 339 + } 340 + conf = mddev->private; 341 + if (conf->scale_raid_disks) { 342 + int i; 343 + for (i=0; i < conf->strip_zone[0].nb_dev; i++) 344 + conf->devlist[i]->raid_disk /= conf->scale_raid_disks; 345 + /* FIXME update sysfs rd links */ 346 + } 359 347 360 348 /* calculate array device size */ 361 349 md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); 362 350 363 - printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", 364 - (unsigned long long)mddev->array_sectors); 351 + printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", 352 + mdname(mddev), 353 + (unsigned long long)mddev->array_sectors); 365 354 /* calculate the max read-ahead size. 366 355 * For read-ahead of large files to be effective, we need to 367 356 * readahead at least twice a whole stripe. i.e. 
number of devices ··· 437 402 unsigned int sect_in_chunk; 438 403 sector_t chunk; 439 404 raid0_conf_t *conf = mddev->private; 405 + int raid_disks = conf->strip_zone[0].nb_dev; 440 406 unsigned int chunk_sects = mddev->chunk_sectors; 441 407 442 408 if (is_power_of_2(chunk_sects)) { ··· 460 424 * + the position in the chunk 461 425 */ 462 426 *sector_offset = (chunk * chunk_sects) + sect_in_chunk; 463 - return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks 427 + return conf->devlist[(zone - conf->strip_zone)*raid_disks 464 428 + sector_div(sector, zone->nb_dev)]; 465 429 } 466 430 ··· 480 444 } 481 445 } 482 446 483 - static int raid0_make_request(struct request_queue *q, struct bio *bio) 447 + static int raid0_make_request(mddev_t *mddev, struct bio *bio) 484 448 { 485 - mddev_t *mddev = q->queuedata; 486 449 unsigned int chunk_sects; 487 450 sector_t sector_offset; 488 451 struct strip_zone *zone; 489 452 mdk_rdev_t *tmp_dev; 490 - const int rw = bio_data_dir(bio); 491 - int cpu; 492 453 493 454 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 494 455 md_barrier_request(mddev, bio); 495 456 return 0; 496 457 } 497 - 498 - cpu = part_stat_lock(); 499 - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 500 - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 501 - bio_sectors(bio)); 502 - part_stat_unlock(); 503 458 504 459 chunk_sects = mddev->chunk_sectors; 505 460 if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { ··· 509 482 else 510 483 bp = bio_split(bio, chunk_sects - 511 484 sector_div(sector, chunk_sects)); 512 - if (raid0_make_request(q, &bp->bio1)) 485 + if (raid0_make_request(mddev, &bp->bio1)) 513 486 generic_make_request(&bp->bio1); 514 - if (raid0_make_request(q, &bp->bio2)) 487 + if (raid0_make_request(mddev, &bp->bio2)) 515 488 generic_make_request(&bp->bio2); 516 489 517 490 bio_pair_release(bp); ··· 531 504 return 1; 532 505 533 506 bad_map: 534 - printk("raid0_make_request bug: can't convert block across chunks" 535 - " or bigger than %dk %llu %d\n", chunk_sects / 2, 536 - (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 507 + printk("md/raid0:%s: make_request bug: can't convert block across chunks" 508 + " or bigger than %dk %llu %d\n", 509 + mdname(mddev), chunk_sects / 2, 510 + (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 537 511 538 512 bio_io_error(bio); 539 513 return 0; ··· 547 519 int j, k, h; 548 520 char b[BDEVNAME_SIZE]; 549 521 raid0_conf_t *conf = mddev->private; 522 + int raid_disks = conf->strip_zone[0].nb_dev; 550 523 551 524 sector_t zone_size; 552 525 sector_t zone_start = 0; ··· 558 529 seq_printf(seq, "=["); 559 530 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 560 531 seq_printf(seq, "%s/", bdevname( 561 - conf->devlist[j*mddev->raid_disks + k] 532 + conf->devlist[j*raid_disks + k] 562 533 ->bdev, b)); 563 534 564 535 zone_size = conf->strip_zone[j].zone_end - zone_start; ··· 573 544 return; 574 545 } 575 546 547 + static void *raid0_takeover_raid5(mddev_t *mddev) 548 + { 549 + mdk_rdev_t *rdev; 550 + raid0_conf_t *priv_conf; 551 + 552 + if (mddev->degraded != 1) { 553 + printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! 
Degraded disks: %d\n", 554 + mdname(mddev), 555 + mddev->degraded); 556 + return ERR_PTR(-EINVAL); 557 + } 558 + 559 + list_for_each_entry(rdev, &mddev->disks, same_set) { 560 + /* check slot number for a disk */ 561 + if (rdev->raid_disk == mddev->raid_disks-1) { 562 + printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", 563 + mdname(mddev)); 564 + return ERR_PTR(-EINVAL); 565 + } 566 + } 567 + 568 + /* Set new parameters */ 569 + mddev->new_level = 0; 570 + mddev->new_chunk_sectors = mddev->chunk_sectors; 571 + mddev->raid_disks--; 572 + mddev->delta_disks = -1; 573 + /* make sure it will be not marked as dirty */ 574 + mddev->recovery_cp = MaxSector; 575 + 576 + create_strip_zones(mddev, &priv_conf); 577 + return priv_conf; 578 + } 579 + 580 + static void *raid0_takeover_raid10(mddev_t *mddev) 581 + { 582 + raid0_conf_t *priv_conf; 583 + 584 + /* Check layout: 585 + * - far_copies must be 1 586 + * - near_copies must be 2 587 + * - disks number must be even 588 + * - all mirrors must be already degraded 589 + */ 590 + if (mddev->layout != ((1 << 8) + 2)) { 591 + printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takover layout: 0x%x\n", 592 + mdname(mddev), 593 + mddev->layout); 594 + return ERR_PTR(-EINVAL); 595 + } 596 + if (mddev->raid_disks & 1) { 597 + printk(KERN_ERR "md/raid0:%s: Raid0 cannot takover Raid10 with odd disk number.\n", 598 + mdname(mddev)); 599 + return ERR_PTR(-EINVAL); 600 + } 601 + if (mddev->degraded != (mddev->raid_disks>>1)) { 602 + printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n", 603 + mdname(mddev)); 604 + return ERR_PTR(-EINVAL); 605 + } 606 + 607 + /* Set new parameters */ 608 + mddev->new_level = 0; 609 + mddev->new_chunk_sectors = mddev->chunk_sectors; 610 + mddev->delta_disks = - mddev->raid_disks / 2; 611 + mddev->raid_disks += mddev->delta_disks; 612 + mddev->degraded = 0; 613 + /* make sure it will be not marked as dirty */ 614 + mddev->recovery_cp = MaxSector; 615 + 616 + create_strip_zones(mddev, &priv_conf); 617 + priv_conf->scale_raid_disks = 2; 618 + return priv_conf; 619 + } 620 + 621 + static void *raid0_takeover(mddev_t *mddev) 622 + { 623 + /* raid0 can take over: 624 + * raid5 - providing it is Raid4 layout and one disk is faulty 625 + * raid10 - assuming we have all necessary active disks 626 + */ 627 + if (mddev->level == 5) { 628 + if (mddev->layout == ALGORITHM_PARITY_N) 629 + return raid0_takeover_raid5(mddev); 630 + 631 + printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", 632 + mdname(mddev), ALGORITHM_PARITY_N); 633 + } 634 + 635 + if (mddev->level == 10) 636 + return raid0_takeover_raid10(mddev); 637 + 638 + return ERR_PTR(-EINVAL); 639 + } 640 + 641 + static void raid0_quiesce(mddev_t *mddev, int state) 642 + { 643 + } 644 + 576 645 static struct mdk_personality raid0_personality= 577 646 { 578 647 .name = "raid0", ··· 681 554 .stop = raid0_stop, 682 555 .status = raid0_status, 683 556 .size = raid0_size, 557 + .takeover = raid0_takeover, 558 + .quiesce = raid0_quiesce, 684 559 }; 685 560 686 561 static int __init raid0_init (void)
drivers/md/raid0.h | +3
···
 	struct strip_zone *strip_zone;
 	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
 	int nr_strip_zones;
+	int scale_raid_disks; /* divide rdev->raid_disks by this in run()
+			       * to handle conversion from raid10
+			       */
 };
 
 typedef struct raid0_private_data raid0_conf_t;
drivers/md/raid1.c | +59 -55
··· 263 263 static void raid1_end_read_request(struct bio *bio, int error) 264 264 { 265 265 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 266 - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 266 + r1bio_t *r1_bio = bio->bi_private; 267 267 int mirror; 268 268 conf_t *conf = r1_bio->mddev->private; 269 269 ··· 297 297 */ 298 298 char b[BDEVNAME_SIZE]; 299 299 if (printk_ratelimit()) 300 - printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", 300 + printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", 301 + mdname(conf->mddev), 301 302 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); 302 303 reschedule_retry(r1_bio); 303 304 } ··· 309 308 static void raid1_end_write_request(struct bio *bio, int error) 310 309 { 311 310 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 312 - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 311 + r1bio_t *r1_bio = bio->bi_private; 313 312 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 314 313 conf_t *conf = r1_bio->mddev->private; 315 314 struct bio *to_put = NULL; ··· 419 418 */ 420 419 static int read_balance(conf_t *conf, r1bio_t *r1_bio) 421 420 { 422 - const unsigned long this_sector = r1_bio->sector; 421 + const sector_t this_sector = r1_bio->sector; 423 422 int new_disk = conf->last_used, disk = new_disk; 424 423 int wonly_disk = -1; 425 424 const int sectors = r1_bio->sectors; ··· 435 434 retry: 436 435 if (conf->mddev->recovery_cp < MaxSector && 437 436 (this_sector + sectors >= conf->next_resync)) { 438 - /* Choose the first operation device, for consistancy */ 437 + /* Choose the first operational device, for consistancy */ 439 438 new_disk = 0; 440 439 441 440 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); ··· 775 774 return NULL; 776 775 } 777 776 778 - static int make_request(struct request_queue *q, struct bio * bio) 777 + static int make_request(mddev_t *mddev, struct bio * bio) 779 778 { 780 - mddev_t *mddev = q->queuedata; 781 779 conf_t *conf = mddev->private; 782 780 mirror_info_t *mirror; 783 781 r1bio_t *r1_bio; ··· 788 788 struct page **behind_pages = NULL; 789 789 const int rw = bio_data_dir(bio); 790 790 const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); 791 - int cpu; 792 791 bool do_barriers; 793 792 mdk_rdev_t *blocked_rdev; 794 793 ··· 833 834 834 835 bitmap = mddev->bitmap; 835 836 836 - cpu = part_stat_lock(); 837 - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 838 - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 839 - bio_sectors(bio)); 840 - part_stat_unlock(); 841 - 842 837 /* 843 838 * make_request() can abort the operation when READA is being 844 839 * used and no empty request is available. ··· 859 866 } 860 867 mirror = conf->mirrors + rdisk; 861 868 869 + if (test_bit(WriteMostly, &mirror->rdev->flags) && 870 + bitmap) { 871 + /* Reading from a write-mostly device must 872 + * take care not to over-take any writes 873 + * that are 'behind' 874 + */ 875 + wait_event(bitmap->behind_wait, 876 + atomic_read(&bitmap->behind_writes) == 0); 877 + } 862 878 r1_bio->read_disk = rdisk; 863 879 864 880 read_bio = bio_clone(bio, GFP_NOIO); ··· 914 912 if (test_bit(Faulty, &rdev->flags)) { 915 913 rdev_dec_pending(rdev, mddev); 916 914 r1_bio->bios[i] = NULL; 917 - } else 915 + } else { 918 916 r1_bio->bios[i] = bio; 919 - targets++; 917 + targets++; 918 + } 920 919 } else 921 920 r1_bio->bios[i] = NULL; 922 921 } ··· 945 942 set_bit(R1BIO_Degraded, &r1_bio->state); 946 943 } 947 944 948 - /* do behind I/O ? 
*/ 945 + /* do behind I/O ? 946 + * Not if there are too many, or cannot allocate memory, 947 + * or a reader on WriteMostly is waiting for behind writes 948 + * to flush */ 949 949 if (bitmap && 950 950 (atomic_read(&bitmap->behind_writes) 951 951 < mddev->bitmap_info.max_write_behind) && 952 + !waitqueue_active(&bitmap->behind_wait) && 952 953 (behind_pages = alloc_behind_pages(bio)) != NULL) 953 954 set_bit(R1BIO_BehindIO, &r1_bio->state); 954 955 ··· 1077 1070 } else 1078 1071 set_bit(Faulty, &rdev->flags); 1079 1072 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1080 - printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" 1081 - "raid1: Operation continuing on %d devices.\n", 1082 - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1073 + printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" 1074 + KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", 1075 + mdname(mddev), bdevname(rdev->bdev, b), 1076 + mdname(mddev), conf->raid_disks - mddev->degraded); 1083 1077 } 1084 1078 1085 1079 static void print_conf(conf_t *conf) 1086 1080 { 1087 1081 int i; 1088 1082 1089 - printk("RAID1 conf printout:\n"); 1083 + printk(KERN_DEBUG "RAID1 conf printout:\n"); 1090 1084 if (!conf) { 1091 - printk("(!conf)\n"); 1085 + printk(KERN_DEBUG "(!conf)\n"); 1092 1086 return; 1093 1087 } 1094 - printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1088 + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1095 1089 conf->raid_disks); 1096 1090 1097 1091 rcu_read_lock(); ··· 1100 1092 char b[BDEVNAME_SIZE]; 1101 1093 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 1102 1094 if (rdev) 1103 - printk(" disk %d, wo:%d, o:%d, dev:%s\n", 1095 + printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", 1104 1096 i, !test_bit(In_sync, &rdev->flags), 1105 1097 !test_bit(Faulty, &rdev->flags), 1106 1098 bdevname(rdev->bdev,b)); ··· 1231 1223 1232 1224 static void end_sync_read(struct bio *bio, int error) 1233 1225 { 1234 - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1226 + r1bio_t *r1_bio = bio->bi_private; 1235 1227 int i; 1236 1228 1237 1229 for (i=r1_bio->mddev->raid_disks; i--; ) ··· 1254 1246 static void end_sync_write(struct bio *bio, int error) 1255 1247 { 1256 1248 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1257 - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1249 + r1bio_t *r1_bio = bio->bi_private; 1258 1250 mddev_t *mddev = r1_bio->mddev; 1259 1251 conf_t *conf = mddev->private; 1260 1252 int i; ··· 1461 1453 char b[BDEVNAME_SIZE]; 1462 1454 /* Cannot read from anywhere, array is toast */ 1463 1455 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1464 - printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" 1456 + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" 1465 1457 " for block %llu\n", 1466 - bdevname(bio->bi_bdev,b), 1458 + mdname(mddev), 1459 + bdevname(bio->bi_bdev, b), 1467 1460 (unsigned long long)r1_bio->sector); 1468 1461 md_done_sync(mddev, r1_bio->sectors, 0); 1469 1462 put_buf(r1_bio); ··· 1586 1577 else { 1587 1578 atomic_add(s, &rdev->corrected_errors); 1588 1579 printk(KERN_INFO 1589 - "raid1:%s: read error corrected " 1580 + "md/raid1:%s: read error corrected " 1590 1581 "(%d sectors at %llu on %s)\n", 1591 1582 mdname(mddev), s, 1592 1583 (unsigned long long)(sect + ··· 1691 1682 1692 1683 bio = r1_bio->bios[r1_bio->read_disk]; 1693 1684 if ((disk=read_balance(conf, r1_bio)) == -1) { 1694 - printk(KERN_ALERT "raid1: %s: unrecoverable 
I/O" 1685 + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" 1695 1686 " read error for block %llu\n", 1687 + mdname(mddev), 1696 1688 bdevname(bio->bi_bdev,b), 1697 1689 (unsigned long long)r1_bio->sector); 1698 1690 raid_end_bio_io(r1_bio); ··· 1707 1697 r1_bio->bios[r1_bio->read_disk] = bio; 1708 1698 rdev = conf->mirrors[disk].rdev; 1709 1699 if (printk_ratelimit()) 1710 - printk(KERN_ERR "raid1: %s: redirecting sector %llu to" 1711 - " another mirror\n", 1712 - bdevname(rdev->bdev,b), 1713 - (unsigned long long)r1_bio->sector); 1700 + printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to" 1701 + " other mirror: %s\n", 1702 + mdname(mddev), 1703 + (unsigned long long)r1_bio->sector, 1704 + bdevname(rdev->bdev,b)); 1714 1705 bio->bi_sector = r1_bio->sector + rdev->data_offset; 1715 1706 bio->bi_bdev = rdev->bdev; 1716 1707 bio->bi_end_io = raid1_end_read_request; ··· 1766 1755 int still_degraded = 0; 1767 1756 1768 1757 if (!conf->r1buf_pool) 1769 - { 1770 - /* 1771 - printk("sync start - bitmap %p\n", mddev->bitmap); 1772 - */ 1773 1758 if (init_resync(conf)) 1774 1759 return 0; 1775 - } 1776 1760 1777 1761 max_sector = mddev->dev_sectors; 1778 1762 if (sector_nr >= max_sector) { ··· 2048 2042 2049 2043 err = -EIO; 2050 2044 if (conf->last_used < 0) { 2051 - printk(KERN_ERR "raid1: no operational mirrors for %s\n", 2045 + printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", 2052 2046 mdname(mddev)); 2053 2047 goto abort; 2054 2048 } ··· 2056 2050 conf->thread = md_register_thread(raid1d, mddev, NULL); 2057 2051 if (!conf->thread) { 2058 2052 printk(KERN_ERR 2059 - "raid1: couldn't allocate thread for %s\n", 2053 + "md/raid1:%s: couldn't allocate thread\n", 2060 2054 mdname(mddev)); 2061 2055 goto abort; 2062 2056 } ··· 2082 2076 mdk_rdev_t *rdev; 2083 2077 2084 2078 if (mddev->level != 1) { 2085 - printk("raid1: %s: raid level not set to mirroring (%d)\n", 2079 + printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", 2086 2080 mdname(mddev), mddev->level); 2087 2081 return -EIO; 2088 2082 } 2089 2083 if (mddev->reshape_position != MaxSector) { 2090 - printk("raid1: %s: reshape_position set but not supported\n", 2084 + printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n", 2091 2085 mdname(mddev)); 2092 2086 return -EIO; 2093 2087 } ··· 2130 2124 mddev->recovery_cp = MaxSector; 2131 2125 2132 2126 if (mddev->recovery_cp != MaxSector) 2133 - printk(KERN_NOTICE "raid1: %s is not clean" 2127 + printk(KERN_NOTICE "md/raid1:%s: not clean" 2134 2128 " -- starting background reconstruction\n", 2135 2129 mdname(mddev)); 2136 2130 printk(KERN_INFO 2137 - "raid1: raid set %s active with %d out of %d mirrors\n", 2131 + "md/raid1:%s: active with %d out of %d mirrors\n", 2138 2132 mdname(mddev), mddev->raid_disks - mddev->degraded, 2139 2133 mddev->raid_disks); 2140 2134 ··· 2158 2152 { 2159 2153 conf_t *conf = mddev->private; 2160 2154 struct bitmap *bitmap = mddev->bitmap; 2161 - int behind_wait = 0; 2162 2155 2163 2156 /* wait for behind writes to complete */ 2164 - while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { 2165 - behind_wait++; 2166 - printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); 2167 - set_current_state(TASK_UNINTERRUPTIBLE); 2168 - schedule_timeout(HZ); /* wait a second */ 2157 + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { 2158 + printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", 2159 + mdname(mddev)); 2169 2160 /* 
need to kick something here to make sure I/O goes? */ 2161 + wait_event(bitmap->behind_wait, 2162 + atomic_read(&bitmap->behind_writes) == 0); 2170 2163 } 2171 2164 2172 2165 raise_barrier(conf); ··· 2196 2191 if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) 2197 2192 return -EINVAL; 2198 2193 set_capacity(mddev->gendisk, mddev->array_sectors); 2199 - mddev->changed = 1; 2200 2194 revalidate_disk(mddev->gendisk); 2201 2195 if (sectors > mddev->dev_sectors && 2202 2196 mddev->recovery_cp == MaxSector) { ··· 2290 2286 if (sysfs_create_link(&mddev->kobj, 2291 2287 &rdev->kobj, nm)) 2292 2288 printk(KERN_WARNING 2293 - "md/raid1: cannot register " 2294 - "%s for %s\n", 2295 - nm, mdname(mddev)); 2289 + "md/raid1:%s: cannot register " 2290 + "%s\n", 2291 + mdname(mddev), nm); 2296 2292 } 2297 2293 if (rdev) 2298 2294 newmirrors[d2++].rdev = rdev;
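A note on the write-behind drain introduced in the raid1.c hunks above: readers of a WriteMostly member and the quiesce/stop path now block on bitmap->behind_wait until bitmap->behind_writes reaches zero, replacing the old one-second polling loop. The following is a minimal userspace sketch of that counter-plus-waitqueue pattern, assuming pthreads as a stand-in for the kernel's waitqueue primitives; the names behind_writes and behind_wait are borrowed for illustration only, and the four simulated writes are made up.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  behind_wait = PTHREAD_COND_INITIALIZER; /* ~ bitmap->behind_wait */
static int behind_writes;                                      /* ~ bitmap->behind_writes */

static void *behind_write(void *arg)
{
    usleep(100000);                     /* pretend the slow mirror is still writing */
    pthread_mutex_lock(&lock);
    if (--behind_writes == 0)           /* ~ atomic_dec_and_test() ...              */
        pthread_cond_broadcast(&behind_wait);   /* ... followed by wake_up()        */
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t tid[4];
    int i;

    pthread_mutex_lock(&lock);
    behind_writes = 4;                  /* four write-behind requests in flight */
    pthread_mutex_unlock(&lock);

    for (i = 0; i < 4; i++)
        pthread_create(&tid[i], NULL, behind_write, NULL);

    /* ~ wait_event(bitmap->behind_wait, atomic_read(&bitmap->behind_writes) == 0) */
    pthread_mutex_lock(&lock);
    while (behind_writes != 0)
        pthread_cond_wait(&behind_wait, &lock);
    pthread_mutex_unlock(&lock);

    printf("all behind writes drained, safe to read from the write-mostly device\n");

    for (i = 0; i < 4; i++)
        pthread_join(tid[i], NULL);
    return 0;
}

As in the patch, the last completer is the one that wakes the waiters, so neither a reader of a write-mostly device nor the shutdown path has to poll.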
+198 -102
drivers/md/raid10.c
··· 24 24 #include <linux/seq_file.h> 25 25 #include "md.h" 26 26 #include "raid10.h" 27 + #include "raid0.h" 27 28 #include "bitmap.h" 28 29 29 30 /* ··· 256 255 static void raid10_end_read_request(struct bio *bio, int error) 257 256 { 258 257 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 259 - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 258 + r10bio_t *r10_bio = bio->bi_private; 260 259 int slot, dev; 261 260 conf_t *conf = r10_bio->mddev->private; 262 261 ··· 286 285 */ 287 286 char b[BDEVNAME_SIZE]; 288 287 if (printk_ratelimit()) 289 - printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", 288 + printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", 289 + mdname(conf->mddev), 290 290 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); 291 291 reschedule_retry(r10_bio); 292 292 } ··· 298 296 static void raid10_end_write_request(struct bio *bio, int error) 299 297 { 300 298 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 301 - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 299 + r10bio_t *r10_bio = bio->bi_private; 302 300 int slot, dev; 303 301 conf_t *conf = r10_bio->mddev->private; 304 302 ··· 496 494 */ 497 495 static int read_balance(conf_t *conf, r10bio_t *r10_bio) 498 496 { 499 - const unsigned long this_sector = r10_bio->sector; 497 + const sector_t this_sector = r10_bio->sector; 500 498 int disk, slot, nslot; 501 499 const int sectors = r10_bio->sectors; 502 500 sector_t new_distance, current_distance; ··· 603 601 int i; 604 602 605 603 rcu_read_lock(); 606 - for (i=0; i<mddev->raid_disks; i++) { 604 + for (i=0; i < conf->raid_disks; i++) { 607 605 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 608 606 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 609 607 struct request_queue *r_queue = bdev_get_queue(rdev->bdev); ··· 637 635 if (mddev_congested(mddev, bits)) 638 636 return 1; 639 637 rcu_read_lock(); 640 - for (i = 0; i < mddev->raid_disks && ret == 0; i++) { 638 + for (i = 0; i < conf->raid_disks && ret == 0; i++) { 641 639 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 642 640 if (rdev && !test_bit(Faulty, &rdev->flags)) { 643 641 struct request_queue *q = bdev_get_queue(rdev->bdev); ··· 790 788 spin_unlock_irq(&conf->resync_lock); 791 789 } 792 790 793 - static int make_request(struct request_queue *q, struct bio * bio) 791 + static int make_request(mddev_t *mddev, struct bio * bio) 794 792 { 795 - mddev_t *mddev = q->queuedata; 796 793 conf_t *conf = mddev->private; 797 794 mirror_info_t *mirror; 798 795 r10bio_t *r10_bio; 799 796 struct bio *read_bio; 800 - int cpu; 801 797 int i; 802 798 int chunk_sects = conf->chunk_mask + 1; 803 799 const int rw = bio_data_dir(bio); ··· 825 825 */ 826 826 bp = bio_split(bio, 827 827 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); 828 - if (make_request(q, &bp->bio1)) 828 + if (make_request(mddev, &bp->bio1)) 829 829 generic_make_request(&bp->bio1); 830 - if (make_request(q, &bp->bio2)) 830 + if (make_request(mddev, &bp->bio2)) 831 831 generic_make_request(&bp->bio2); 832 832 833 833 bio_pair_release(bp); 834 834 return 0; 835 835 bad_map: 836 - printk("raid10_make_request bug: can't convert block across chunks" 837 - " or bigger than %dk %llu %d\n", chunk_sects/2, 836 + printk("md/raid10:%s: make_request bug: can't convert block across chunks" 837 + " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, 838 838 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 839 839 840 840 
bio_io_error(bio); ··· 849 849 * Continue immediately if no resync is active currently. 850 850 */ 851 851 wait_barrier(conf); 852 - 853 - cpu = part_stat_lock(); 854 - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 855 - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 856 - bio_sectors(bio)); 857 - part_stat_unlock(); 858 852 859 853 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 860 854 ··· 1033 1039 } 1034 1040 set_bit(Faulty, &rdev->flags); 1035 1041 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1036 - printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n" 1037 - "raid10: Operation continuing on %d devices.\n", 1038 - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1042 + printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" 1043 + KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", 1044 + mdname(mddev), bdevname(rdev->bdev, b), 1045 + mdname(mddev), conf->raid_disks - mddev->degraded); 1039 1046 } 1040 1047 1041 1048 static void print_conf(conf_t *conf) ··· 1044 1049 int i; 1045 1050 mirror_info_t *tmp; 1046 1051 1047 - printk("RAID10 conf printout:\n"); 1052 + printk(KERN_DEBUG "RAID10 conf printout:\n"); 1048 1053 if (!conf) { 1049 - printk("(!conf)\n"); 1054 + printk(KERN_DEBUG "(!conf)\n"); 1050 1055 return; 1051 1056 } 1052 - printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1057 + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1053 1058 conf->raid_disks); 1054 1059 1055 1060 for (i = 0; i < conf->raid_disks; i++) { 1056 1061 char b[BDEVNAME_SIZE]; 1057 1062 tmp = conf->mirrors + i; 1058 1063 if (tmp->rdev) 1059 - printk(" disk %d, wo:%d, o:%d, dev:%s\n", 1064 + printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", 1060 1065 i, !test_bit(In_sync, &tmp->rdev->flags), 1061 1066 !test_bit(Faulty, &tmp->rdev->flags), 1062 1067 bdevname(tmp->rdev->bdev,b)); ··· 1127 1132 int mirror; 1128 1133 mirror_info_t *p; 1129 1134 int first = 0; 1130 - int last = mddev->raid_disks - 1; 1135 + int last = conf->raid_disks - 1; 1131 1136 1132 1137 if (mddev->recovery_cp < MaxSector) 1133 1138 /* only hot-add to in-sync arrays, as recovery is ··· 1219 1224 1220 1225 static void end_sync_read(struct bio *bio, int error) 1221 1226 { 1222 - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1227 + r10bio_t *r10_bio = bio->bi_private; 1223 1228 conf_t *conf = r10_bio->mddev->private; 1224 1229 int i,d; 1225 1230 ··· 1256 1261 static void end_sync_write(struct bio *bio, int error) 1257 1262 { 1258 1263 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1259 - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1264 + r10bio_t *r10_bio = bio->bi_private; 1260 1265 mddev_t *mddev = r10_bio->mddev; 1261 1266 conf_t *conf = mddev->private; 1262 1267 int i,d; ··· 1505 1510 if (cur_read_error_count > max_read_errors) { 1506 1511 rcu_read_unlock(); 1507 1512 printk(KERN_NOTICE 1508 - "raid10: %s: Raid device exceeded " 1513 + "md/raid10:%s: %s: Raid device exceeded " 1509 1514 "read_error threshold " 1510 1515 "[cur %d:max %d]\n", 1516 + mdname(mddev), 1511 1517 b, cur_read_error_count, max_read_errors); 1512 1518 printk(KERN_NOTICE 1513 - "raid10: %s: Failing raid " 1514 - "device\n", b); 1519 + "md/raid10:%s: %s: Failing raid " 1520 + "device\n", mdname(mddev), b); 1515 1521 md_error(mddev, conf->mirrors[d].rdev); 1516 1522 return; 1517 1523 } ··· 1582 1586 == 0) { 1583 1587 /* Well, this device is dead */ 1584 1588 printk(KERN_NOTICE 1585 - "raid10:%s: read correction " 1589 
+ "md/raid10:%s: read correction " 1586 1590 "write failed" 1587 1591 " (%d sectors at %llu on %s)\n", 1588 1592 mdname(mddev), s, 1589 1593 (unsigned long long)(sect+ 1590 1594 rdev->data_offset), 1591 1595 bdevname(rdev->bdev, b)); 1592 - printk(KERN_NOTICE "raid10:%s: failing " 1596 + printk(KERN_NOTICE "md/raid10:%s: %s: failing " 1593 1597 "drive\n", 1598 + mdname(mddev), 1594 1599 bdevname(rdev->bdev, b)); 1595 1600 md_error(mddev, rdev); 1596 1601 } ··· 1619 1622 READ) == 0) { 1620 1623 /* Well, this device is dead */ 1621 1624 printk(KERN_NOTICE 1622 - "raid10:%s: unable to read back " 1625 + "md/raid10:%s: unable to read back " 1623 1626 "corrected sectors" 1624 1627 " (%d sectors at %llu on %s)\n", 1625 1628 mdname(mddev), s, 1626 1629 (unsigned long long)(sect+ 1627 1630 rdev->data_offset), 1628 1631 bdevname(rdev->bdev, b)); 1629 - printk(KERN_NOTICE "raid10:%s: failing drive\n", 1632 + printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", 1633 + mdname(mddev), 1630 1634 bdevname(rdev->bdev, b)); 1631 1635 1632 1636 md_error(mddev, rdev); 1633 1637 } else { 1634 1638 printk(KERN_INFO 1635 - "raid10:%s: read error corrected" 1639 + "md/raid10:%s: read error corrected" 1636 1640 " (%d sectors at %llu on %s)\n", 1637 1641 mdname(mddev), s, 1638 1642 (unsigned long long)(sect+ ··· 1708 1710 mddev->ro ? IO_BLOCKED : NULL; 1709 1711 mirror = read_balance(conf, r10_bio); 1710 1712 if (mirror == -1) { 1711 - printk(KERN_ALERT "raid10: %s: unrecoverable I/O" 1713 + printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" 1712 1714 " read error for block %llu\n", 1715 + mdname(mddev), 1713 1716 bdevname(bio->bi_bdev,b), 1714 1717 (unsigned long long)r10_bio->sector); 1715 1718 raid_end_bio_io(r10_bio); ··· 1720 1721 bio_put(bio); 1721 1722 rdev = conf->mirrors[mirror].rdev; 1722 1723 if (printk_ratelimit()) 1723 - printk(KERN_ERR "raid10: %s: redirecting sector %llu to" 1724 + printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" 1724 1725 " another mirror\n", 1726 + mdname(mddev), 1725 1727 bdevname(rdev->bdev,b), 1726 1728 (unsigned long long)r10_bio->sector); 1727 1729 bio = bio_clone(r10_bio->master_bio, GFP_NOIO); ··· 1980 1980 r10_bio = rb2; 1981 1981 if (!test_and_set_bit(MD_RECOVERY_INTR, 1982 1982 &mddev->recovery)) 1983 - printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", 1983 + printk(KERN_INFO "md/raid10:%s: insufficient " 1984 + "working devices for recovery.\n", 1984 1985 mdname(mddev)); 1985 1986 break; 1986 1987 } ··· 2141 2140 conf_t *conf = mddev->private; 2142 2141 2143 2142 if (!raid_disks) 2144 - raid_disks = mddev->raid_disks; 2143 + raid_disks = conf->raid_disks; 2145 2144 if (!sectors) 2146 - sectors = mddev->dev_sectors; 2145 + sectors = conf->dev_sectors; 2147 2146 2148 2147 size = sectors >> conf->chunk_shift; 2149 2148 sector_div(size, conf->far_copies); ··· 2153 2152 return size << conf->chunk_shift; 2154 2153 } 2155 2154 2156 - static int run(mddev_t *mddev) 2155 + 2156 + static conf_t *setup_conf(mddev_t *mddev) 2157 2157 { 2158 - conf_t *conf; 2159 - int i, disk_idx, chunk_size; 2160 - mirror_info_t *disk; 2161 - mdk_rdev_t *rdev; 2158 + conf_t *conf = NULL; 2162 2159 int nc, fc, fo; 2163 2160 sector_t stride, size; 2161 + int err = -EINVAL; 2164 2162 2165 2163 if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || 2166 2164 !is_power_of_2(mddev->chunk_sectors)) { 2167 - printk(KERN_ERR "md/raid10: chunk size must be " 2168 - "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); 2169 - return -EINVAL; 2165 + 
printk(KERN_ERR "md/raid10:%s: chunk size must be " 2166 + "at least PAGE_SIZE(%ld) and be a power of 2.\n", 2167 + mdname(mddev), PAGE_SIZE); 2168 + goto out; 2170 2169 } 2171 2170 2172 2171 nc = mddev->layout & 255; 2173 2172 fc = (mddev->layout >> 8) & 255; 2174 2173 fo = mddev->layout & (1<<16); 2174 + 2175 2175 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || 2176 2176 (mddev->layout >> 17)) { 2177 - printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", 2177 + printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 2178 2178 mdname(mddev), mddev->layout); 2179 2179 goto out; 2180 2180 } 2181 - /* 2182 - * copy the already verified devices into our private RAID10 2183 - * bookkeeping area. [whatever we allocate in run(), 2184 - * should be freed in stop()] 2185 - */ 2181 + 2182 + err = -ENOMEM; 2186 2183 conf = kzalloc(sizeof(conf_t), GFP_KERNEL); 2187 - mddev->private = conf; 2188 - if (!conf) { 2189 - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", 2190 - mdname(mddev)); 2184 + if (!conf) 2191 2185 goto out; 2192 - } 2186 + 2193 2187 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 2194 - GFP_KERNEL); 2195 - if (!conf->mirrors) { 2196 - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", 2197 - mdname(mddev)); 2198 - goto out_free_conf; 2199 - } 2188 + GFP_KERNEL); 2189 + if (!conf->mirrors) 2190 + goto out; 2200 2191 2201 2192 conf->tmppage = alloc_page(GFP_KERNEL); 2202 2193 if (!conf->tmppage) 2203 - goto out_free_conf; 2194 + goto out; 2195 + 2204 2196 2205 2197 conf->raid_disks = mddev->raid_disks; 2206 2198 conf->near_copies = nc; 2207 2199 conf->far_copies = fc; 2208 2200 conf->copies = nc*fc; 2209 2201 conf->far_offset = fo; 2210 - conf->chunk_mask = mddev->chunk_sectors - 1; 2211 - conf->chunk_shift = ffz(~mddev->chunk_sectors); 2202 + conf->chunk_mask = mddev->new_chunk_sectors - 1; 2203 + conf->chunk_shift = ffz(~mddev->new_chunk_sectors); 2204 + 2205 + conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 2206 + r10bio_pool_free, conf); 2207 + if (!conf->r10bio_pool) 2208 + goto out; 2209 + 2212 2210 size = mddev->dev_sectors >> conf->chunk_shift; 2213 2211 sector_div(size, fc); 2214 2212 size = size * conf->raid_disks; ··· 2221 2221 */ 2222 2222 stride += conf->raid_disks - 1; 2223 2223 sector_div(stride, conf->raid_disks); 2224 - mddev->dev_sectors = stride << conf->chunk_shift; 2224 + 2225 + conf->dev_sectors = stride << conf->chunk_shift; 2225 2226 2226 2227 if (fo) 2227 2228 stride = 1; ··· 2230 2229 sector_div(stride, fc); 2231 2230 conf->stride = stride << conf->chunk_shift; 2232 2231 2233 - conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 2234 - r10bio_pool_free, conf); 2235 - if (!conf->r10bio_pool) { 2236 - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", 2237 - mdname(mddev)); 2238 - goto out_free_conf; 2239 - } 2240 2232 2241 - conf->mddev = mddev; 2242 2233 spin_lock_init(&conf->device_lock); 2234 + INIT_LIST_HEAD(&conf->retry_list); 2235 + 2236 + spin_lock_init(&conf->resync_lock); 2237 + init_waitqueue_head(&conf->wait_barrier); 2238 + 2239 + conf->thread = md_register_thread(raid10d, mddev, NULL); 2240 + if (!conf->thread) 2241 + goto out; 2242 + 2243 + conf->scale_disks = 0; 2244 + conf->mddev = mddev; 2245 + return conf; 2246 + 2247 + out: 2248 + printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", 2249 + mdname(mddev)); 2250 + if (conf) { 2251 + if (conf->r10bio_pool) 2252 + mempool_destroy(conf->r10bio_pool); 2253 + 
kfree(conf->mirrors); 2254 + safe_put_page(conf->tmppage); 2255 + kfree(conf); 2256 + } 2257 + return ERR_PTR(err); 2258 + } 2259 + 2260 + static int run(mddev_t *mddev) 2261 + { 2262 + conf_t *conf; 2263 + int i, disk_idx, chunk_size; 2264 + mirror_info_t *disk; 2265 + mdk_rdev_t *rdev; 2266 + sector_t size; 2267 + 2268 + /* 2269 + * copy the already verified devices into our private RAID10 2270 + * bookkeeping area. [whatever we allocate in run(), 2271 + * should be freed in stop()] 2272 + */ 2273 + 2274 + if (mddev->private == NULL) { 2275 + conf = setup_conf(mddev); 2276 + if (IS_ERR(conf)) 2277 + return PTR_ERR(conf); 2278 + mddev->private = conf; 2279 + } 2280 + conf = mddev->private; 2281 + if (!conf) 2282 + goto out; 2283 + 2243 2284 mddev->queue->queue_lock = &conf->device_lock; 2285 + 2286 + mddev->thread = conf->thread; 2287 + conf->thread = NULL; 2244 2288 2245 2289 chunk_size = mddev->chunk_sectors << 9; 2246 2290 blk_queue_io_min(mddev->queue, chunk_size); ··· 2297 2251 2298 2252 list_for_each_entry(rdev, &mddev->disks, same_set) { 2299 2253 disk_idx = rdev->raid_disk; 2300 - if (disk_idx >= mddev->raid_disks 2254 + if (disk_idx >= conf->raid_disks 2301 2255 || disk_idx < 0) 2302 2256 continue; 2257 + if (conf->scale_disks) { 2258 + disk_idx *= conf->scale_disks; 2259 + rdev->raid_disk = disk_idx; 2260 + /* MOVE 'rd%d' link !! */ 2261 + } 2303 2262 disk = conf->mirrors + disk_idx; 2304 2263 2305 2264 disk->rdev = rdev; ··· 2322 2271 2323 2272 disk->head_position = 0; 2324 2273 } 2325 - INIT_LIST_HEAD(&conf->retry_list); 2326 - 2327 - spin_lock_init(&conf->resync_lock); 2328 - init_waitqueue_head(&conf->wait_barrier); 2329 - 2330 2274 /* need to check that every block has at least one working mirror */ 2331 2275 if (!enough(conf)) { 2332 - printk(KERN_ERR "raid10: not enough operational mirrors for %s\n", 2276 + printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", 2333 2277 mdname(mddev)); 2334 2278 goto out_free_conf; 2335 2279 } ··· 2343 2297 } 2344 2298 } 2345 2299 2346 - 2347 - mddev->thread = md_register_thread(raid10d, mddev, NULL); 2348 - if (!mddev->thread) { 2349 - printk(KERN_ERR 2350 - "raid10: couldn't allocate thread for %s\n", 2351 - mdname(mddev)); 2352 - goto out_free_conf; 2353 - } 2354 - 2355 2300 if (mddev->recovery_cp != MaxSector) 2356 - printk(KERN_NOTICE "raid10: %s is not clean" 2301 + printk(KERN_NOTICE "md/raid10:%s: not clean" 2357 2302 " -- starting background reconstruction\n", 2358 2303 mdname(mddev)); 2359 2304 printk(KERN_INFO 2360 - "raid10: raid set %s active with %d out of %d devices\n", 2361 - mdname(mddev), mddev->raid_disks - mddev->degraded, 2362 - mddev->raid_disks); 2305 + "md/raid10:%s: active with %d out of %d devices\n", 2306 + mdname(mddev), conf->raid_disks - mddev->degraded, 2307 + conf->raid_disks); 2363 2308 /* 2364 2309 * Ok, everything is just fine now 2365 2310 */ 2366 - md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); 2367 - mddev->resync_max_sectors = raid10_size(mddev, 0, 0); 2311 + mddev->dev_sectors = conf->dev_sectors; 2312 + size = raid10_size(mddev, 0, 0); 2313 + md_set_array_sectors(mddev, size); 2314 + mddev->resync_max_sectors = size; 2368 2315 2369 2316 mddev->queue->unplug_fn = raid10_unplug; 2370 2317 mddev->queue->backing_dev_info.congested_fn = raid10_congested; ··· 2375 2336 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 2376 2337 } 2377 2338 2378 - if (conf->near_copies < mddev->raid_disks) 2339 + if (conf->near_copies < conf->raid_disks) 2379 2340 blk_queue_merge_bvec(mddev->queue, 
raid10_mergeable_bvec); 2380 2341 md_integrity_register(mddev); 2381 2342 return 0; ··· 2387 2348 kfree(conf->mirrors); 2388 2349 kfree(conf); 2389 2350 mddev->private = NULL; 2351 + md_unregister_thread(mddev->thread); 2390 2352 out: 2391 2353 return -EIO; 2392 2354 } ··· 2424 2384 } 2425 2385 } 2426 2386 2387 + static void *raid10_takeover_raid0(mddev_t *mddev) 2388 + { 2389 + mdk_rdev_t *rdev; 2390 + conf_t *conf; 2391 + 2392 + if (mddev->degraded > 0) { 2393 + printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", 2394 + mdname(mddev)); 2395 + return ERR_PTR(-EINVAL); 2396 + } 2397 + 2398 + /* Update slot numbers to obtain 2399 + * degraded raid10 with missing mirrors 2400 + */ 2401 + list_for_each_entry(rdev, &mddev->disks, same_set) { 2402 + rdev->raid_disk *= 2; 2403 + } 2404 + 2405 + /* Set new parameters */ 2406 + mddev->new_level = 10; 2407 + /* new layout: far_copies = 1, near_copies = 2 */ 2408 + mddev->new_layout = (1<<8) + 2; 2409 + mddev->new_chunk_sectors = mddev->chunk_sectors; 2410 + mddev->delta_disks = mddev->raid_disks; 2411 + mddev->degraded = mddev->raid_disks; 2412 + mddev->raid_disks *= 2; 2413 + /* make sure it will be not marked as dirty */ 2414 + mddev->recovery_cp = MaxSector; 2415 + 2416 + conf = setup_conf(mddev); 2417 + conf->scale_disks = 2; 2418 + return conf; 2419 + } 2420 + 2421 + static void *raid10_takeover(mddev_t *mddev) 2422 + { 2423 + struct raid0_private_data *raid0_priv; 2424 + 2425 + /* raid10 can take over: 2426 + * raid0 - providing it has only two drives 2427 + */ 2428 + if (mddev->level == 0) { 2429 + /* for raid0 takeover only one zone is supported */ 2430 + raid0_priv = mddev->private; 2431 + if (raid0_priv->nr_strip_zones > 1) { 2432 + printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" 2433 + " with more than one zone.\n", 2434 + mdname(mddev)); 2435 + return ERR_PTR(-EINVAL); 2436 + } 2437 + return raid10_takeover_raid0(mddev); 2438 + } 2439 + return ERR_PTR(-EINVAL); 2440 + } 2441 + 2427 2442 static struct mdk_personality raid10_personality = 2428 2443 { 2429 2444 .name = "raid10", ··· 2495 2400 .sync_request = sync_request, 2496 2401 .quiesce = raid10_quiesce, 2497 2402 .size = raid10_size, 2403 + .takeover = raid10_takeover, 2498 2404 }; 2499 2405 2500 2406 static int __init raid_init(void)
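The raid0 takeover added to raid10.c above builds a degraded near-2 array: each existing member keeps its data but moves to slot 2*i (scale_disks doubles the slot numbers), the array width doubles, every odd slot is a missing mirror, and layout (1<<8)+2 encodes near_copies=2 with far_copies=1. Below is a small self-contained sketch of that bookkeeping, assuming a hypothetical three-disk raid0; struct toy_md is an invented stand-in for the few mddev fields involved, not a kernel structure.

#include <stdio.h>

/* toy stand-in for the handful of mddev fields the takeover touches */
struct toy_md {
    int level, layout, raid_disks, degraded;
};

/* mirror the slot/parameter changes made in the takeover hunk above */
static void takeover_raid0_to_raid10(struct toy_md *md, int *slot, int nr)
{
    int i;

    for (i = 0; i < nr; i++)
        slot[i] *= 2;                   /* old member i becomes raid10 slot 2*i */

    md->level       = 10;
    md->layout      = (1 << 8) + 2;     /* far_copies = 1, near_copies = 2 */
    md->degraded    = md->raid_disks;   /* every mirror slot starts out missing */
    md->raid_disks *= 2;
}

int main(void)
{
    struct toy_md md = { .level = 0, .raid_disks = 3 };
    int slot[3] = { 0, 1, 2 };          /* hypothetical 3-disk raid0 */
    int i;

    takeover_raid0_to_raid10(&md, slot, 3);

    printf("raid%d, %d slots, %d degraded, layout 0x%x\n",
           md.level, md.raid_disks, md.degraded, md.layout);
    for (i = 0; i < 3; i++)
        printf("old disk %d -> slot %d (mirror slot %d missing)\n",
               i, slot[i], slot[i] + 1);
    return 0;
}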
+12
drivers/md/raid10.h
··· 33 33 * 1 stripe. 34 34 */ 35 35 36 + sector_t dev_sectors; /* temp copy of mddev->dev_sectors */ 37 + 36 38 int chunk_shift; /* shift from chunks to sectors */ 37 39 sector_t chunk_mask; 40 + 41 + int scale_disks; /* When starting array, multiply 42 + * each ->raid_disk by this. 43 + * Need for raid0->raid10 migration 44 + */ 38 45 39 46 struct list_head retry_list; 40 47 /* queue pending writes and submit them on unplug */ ··· 64 57 mempool_t *r10bio_pool; 65 58 mempool_t *r10buf_pool; 66 59 struct page *tmppage; 60 + 61 + /* When taking over an array from a different personality, we store 62 + * the new thread here until we fully activate the array. 63 + */ 64 + struct mdk_thread_s *thread; 67 65 }; 68 66 69 67 typedef struct r10_private_data_s conf_t;
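The chunk_shift and chunk_mask fields declared above only work because setup_conf() insists on a power-of-two chunk size: ffz(~chunk_sectors) is then the base-2 log of the chunk, so dividing a sector by the chunk size becomes a shift, and the offset inside a chunk becomes an AND with chunk_mask (the bio->bi_sector & (chunk_sects - 1) test in make_request). A self-contained illustration of that arithmetic follows; ffz64() is a longhand stand-in for the kernel's ffz(), and the 128-sector chunk and sample sector are arbitrary.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* find-first-zero-bit, the same idiom setup_conf() uses as ffz(~chunk_sectors) */
static int ffz64(uint64_t x)
{
    int bit = 0;

    while (x & 1) {
        x >>= 1;
        bit++;
    }
    return bit;
}

int main(void)
{
    sector_t chunk_sectors = 128;                   /* 64 KiB chunk, power of two */
    sector_t chunk_mask  = chunk_sectors - 1;
    int      chunk_shift = ffz64(~chunk_sectors);   /* == log2(chunk_sectors) == 7 */

    sector_t sector = 1000003;                      /* arbitrary array sector */

    printf("chunk_shift=%d chunk_mask=%llu\n",
           chunk_shift, (unsigned long long)chunk_mask);
    printf("sector %llu -> chunk %llu, offset %llu sectors into the chunk\n",
           (unsigned long long)sector,
           (unsigned long long)(sector >> chunk_shift),  /* sector / chunk_sectors */
           (unsigned long long)(sector & chunk_mask));   /* sector % chunk_sectors */
    return 0;
}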
+127 -104
drivers/md/raid5.c
··· 53 53 #include <linux/slab.h> 54 54 #include "md.h" 55 55 #include "raid5.h" 56 + #include "raid0.h" 56 57 #include "bitmap.h" 57 58 58 59 /* ··· 1510 1509 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1511 1510 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1512 1511 rdev = conf->disks[i].rdev; 1513 - printk_rl(KERN_INFO "raid5:%s: read error corrected" 1512 + printk_rl(KERN_INFO "md/raid:%s: read error corrected" 1514 1513 " (%lu sectors at %llu on %s)\n", 1515 1514 mdname(conf->mddev), STRIPE_SECTORS, 1516 1515 (unsigned long long)(sh->sector ··· 1530 1529 atomic_inc(&rdev->read_errors); 1531 1530 if (conf->mddev->degraded >= conf->max_degraded) 1532 1531 printk_rl(KERN_WARNING 1533 - "raid5:%s: read error not correctable " 1532 + "md/raid:%s: read error not correctable " 1534 1533 "(sector %llu on %s).\n", 1535 1534 mdname(conf->mddev), 1536 1535 (unsigned long long)(sh->sector ··· 1539 1538 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1540 1539 /* Oh, no!!! */ 1541 1540 printk_rl(KERN_WARNING 1542 - "raid5:%s: read error NOT corrected!! " 1541 + "md/raid:%s: read error NOT corrected!! " 1543 1542 "(sector %llu on %s).\n", 1544 1543 mdname(conf->mddev), 1545 1544 (unsigned long long)(sh->sector ··· 1548 1547 else if (atomic_read(&rdev->read_errors) 1549 1548 > conf->max_nr_stripes) 1550 1549 printk(KERN_WARNING 1551 - "raid5:%s: Too many read errors, failing device %s.\n", 1550 + "md/raid:%s: Too many read errors, failing device %s.\n", 1552 1551 mdname(conf->mddev), bdn); 1553 1552 else 1554 1553 retry = 1; ··· 1620 1619 static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1621 1620 { 1622 1621 char b[BDEVNAME_SIZE]; 1623 - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 1624 - pr_debug("raid5: error called\n"); 1622 + raid5_conf_t *conf = mddev->private; 1623 + pr_debug("raid456: error called\n"); 1625 1624 1626 1625 if (!test_bit(Faulty, &rdev->flags)) { 1627 1626 set_bit(MD_CHANGE_DEVS, &mddev->flags); ··· 1637 1636 } 1638 1637 set_bit(Faulty, &rdev->flags); 1639 1638 printk(KERN_ALERT 1640 - "raid5: Disk failure on %s, disabling device.\n" 1641 - "raid5: Operation continuing on %d devices.\n", 1642 - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1639 + "md/raid:%s: Disk failure on %s, disabling device.\n" 1640 + KERN_ALERT 1641 + "md/raid:%s: Operation continuing on %d devices.\n", 1642 + mdname(mddev), 1643 + bdevname(rdev->bdev, b), 1644 + mdname(mddev), 1645 + conf->raid_disks - mddev->degraded); 1643 1646 } 1644 1647 } 1645 1648 ··· 1719 1714 pd_idx = data_disks; 1720 1715 break; 1721 1716 default: 1722 - printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1723 - algorithm); 1724 1717 BUG(); 1725 1718 } 1726 1719 break; ··· 1835 1832 qd_idx = raid_disks - 1; 1836 1833 break; 1837 1834 1838 - 1839 1835 default: 1840 - printk(KERN_CRIT "raid6: unsupported algorithm %d\n", 1841 - algorithm); 1842 1836 BUG(); 1843 1837 } 1844 1838 break; ··· 1898 1898 case ALGORITHM_PARITY_N: 1899 1899 break; 1900 1900 default: 1901 - printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1902 - algorithm); 1903 1901 BUG(); 1904 1902 } 1905 1903 break; ··· 1956 1958 i -= 1; 1957 1959 break; 1958 1960 default: 1959 - printk(KERN_CRIT "raid6: unsupported algorithm %d\n", 1960 - algorithm); 1961 1961 BUG(); 1962 1962 } 1963 1963 break; ··· 1968 1972 previous, &dummy1, &sh2); 1969 1973 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 1970 1974 || sh2.qd_idx != sh->qd_idx) { 1971 - printk(KERN_ERR "compute_blocknr: map not correct\n"); 1975 + 
printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 1976 + mdname(conf->mddev)); 1972 1977 return 0; 1973 1978 } 1974 1979 return r_sector; ··· 3706 3709 3707 3710 bio_put(bi); 3708 3711 3709 - mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; 3710 - conf = mddev->private; 3711 3712 rdev = (void*)raid_bi->bi_next; 3712 3713 raid_bi->bi_next = NULL; 3714 + mddev = rdev->mddev; 3715 + conf = mddev->private; 3713 3716 3714 3717 rdev_dec_pending(rdev, conf->mddev); 3715 3718 ··· 3746 3749 } 3747 3750 3748 3751 3749 - static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) 3752 + static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) 3750 3753 { 3751 - mddev_t *mddev = q->queuedata; 3752 3754 raid5_conf_t *conf = mddev->private; 3753 3755 int dd_idx; 3754 3756 struct bio* align_bi; ··· 3862 3866 return sh; 3863 3867 } 3864 3868 3865 - static int make_request(struct request_queue *q, struct bio * bi) 3869 + static int make_request(mddev_t *mddev, struct bio * bi) 3866 3870 { 3867 - mddev_t *mddev = q->queuedata; 3868 3871 raid5_conf_t *conf = mddev->private; 3869 3872 int dd_idx; 3870 3873 sector_t new_sector; 3871 3874 sector_t logical_sector, last_sector; 3872 3875 struct stripe_head *sh; 3873 3876 const int rw = bio_data_dir(bi); 3874 - int cpu, remaining; 3877 + int remaining; 3875 3878 3876 3879 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { 3877 3880 /* Drain all pending writes. We only really need ··· 3885 3890 3886 3891 md_write_start(mddev, bi); 3887 3892 3888 - cpu = part_stat_lock(); 3889 - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 3890 - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 3891 - bio_sectors(bi)); 3892 - part_stat_unlock(); 3893 - 3894 3893 if (rw == READ && 3895 3894 mddev->reshape_position == MaxSector && 3896 - chunk_aligned_read(q,bi)) 3895 + chunk_aligned_read(mddev,bi)) 3897 3896 return 0; 3898 3897 3899 3898 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); ··· 3935 3946 new_sector = raid5_compute_sector(conf, logical_sector, 3936 3947 previous, 3937 3948 &dd_idx, NULL); 3938 - pr_debug("raid5: make_request, sector %llu logical %llu\n", 3949 + pr_debug("raid456: make_request, sector %llu logical %llu\n", 3939 3950 (unsigned long long)new_sector, 3940 3951 (unsigned long long)logical_sector); 3941 3952 ··· 4043 4054 * As the reads complete, handle_stripe will copy the data 4044 4055 * into the destination stripe and release that stripe. 
4045 4056 */ 4046 - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4057 + raid5_conf_t *conf = mddev->private; 4047 4058 struct stripe_head *sh; 4048 4059 sector_t first_sector, last_sector; 4049 4060 int raid_disks = conf->previous_raid_disks; ··· 4252 4263 /* FIXME go_faster isn't used */ 4253 4264 static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 4254 4265 { 4255 - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4266 + raid5_conf_t *conf = mddev->private; 4256 4267 struct stripe_head *sh; 4257 4268 sector_t max_sector = mddev->dev_sectors; 4258 4269 int sync_blocks; ··· 4714 4725 if (mddev->new_level != 5 4715 4726 && mddev->new_level != 4 4716 4727 && mddev->new_level != 6) { 4717 - printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", 4728 + printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 4718 4729 mdname(mddev), mddev->new_level); 4719 4730 return ERR_PTR(-EIO); 4720 4731 } ··· 4722 4733 && !algorithm_valid_raid5(mddev->new_layout)) || 4723 4734 (mddev->new_level == 6 4724 4735 && !algorithm_valid_raid6(mddev->new_layout))) { 4725 - printk(KERN_ERR "raid5: %s: layout %d not supported\n", 4736 + printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 4726 4737 mdname(mddev), mddev->new_layout); 4727 4738 return ERR_PTR(-EIO); 4728 4739 } 4729 4740 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 4730 - printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", 4741 + printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 4731 4742 mdname(mddev), mddev->raid_disks); 4732 4743 return ERR_PTR(-EINVAL); 4733 4744 } ··· 4735 4746 if (!mddev->new_chunk_sectors || 4736 4747 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4737 4748 !is_power_of_2(mddev->new_chunk_sectors)) { 4738 - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 4739 - mddev->new_chunk_sectors << 9, mdname(mddev)); 4749 + printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 4750 + mdname(mddev), mddev->new_chunk_sectors << 9); 4740 4751 return ERR_PTR(-EINVAL); 4741 4752 } 4742 4753 ··· 4778 4789 if (raid5_alloc_percpu(conf) != 0) 4779 4790 goto abort; 4780 4791 4781 - pr_debug("raid5: run(%s) called.\n", mdname(mddev)); 4792 + pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4782 4793 4783 4794 list_for_each_entry(rdev, &mddev->disks, same_set) { 4784 4795 raid_disk = rdev->raid_disk; ··· 4791 4802 4792 4803 if (test_bit(In_sync, &rdev->flags)) { 4793 4804 char b[BDEVNAME_SIZE]; 4794 - printk(KERN_INFO "raid5: device %s operational as raid" 4795 - " disk %d\n", bdevname(rdev->bdev,b), 4796 - raid_disk); 4805 + printk(KERN_INFO "md/raid:%s: device %s operational as raid" 4806 + " disk %d\n", 4807 + mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 4797 4808 } else 4798 4809 /* Cannot rely on bitmap to complete recovery */ 4799 4810 conf->fullsync = 1; ··· 4817 4828 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4818 4829 if (grow_stripes(conf, conf->max_nr_stripes)) { 4819 4830 printk(KERN_ERR 4820 - "raid5: couldn't allocate %dkB for buffers\n", memory); 4831 + "md/raid:%s: couldn't allocate %dkB for buffers\n", 4832 + mdname(mddev), memory); 4821 4833 goto abort; 4822 4834 } else 4823 - printk(KERN_INFO "raid5: allocated %dkB for %s\n", 4824 - memory, mdname(mddev)); 4835 + printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 4836 + mdname(mddev), memory); 4825 4837 4826 4838 conf->thread = md_register_thread(raid5d, mddev, NULL); 4827 4839 if (!conf->thread) 
{ 4828 4840 printk(KERN_ERR 4829 - "raid5: couldn't allocate thread for %s\n", 4841 + "md/raid:%s: couldn't allocate thread.\n", 4830 4842 mdname(mddev)); 4831 4843 goto abort; 4832 4844 } ··· 4878 4888 sector_t reshape_offset = 0; 4879 4889 4880 4890 if (mddev->recovery_cp != MaxSector) 4881 - printk(KERN_NOTICE "raid5: %s is not clean" 4891 + printk(KERN_NOTICE "md/raid:%s: not clean" 4882 4892 " -- starting background reconstruction\n", 4883 4893 mdname(mddev)); 4884 4894 if (mddev->reshape_position != MaxSector) { ··· 4892 4902 int max_degraded = (mddev->level == 6 ? 2 : 1); 4893 4903 4894 4904 if (mddev->new_level != mddev->level) { 4895 - printk(KERN_ERR "raid5: %s: unsupported reshape " 4905 + printk(KERN_ERR "md/raid:%s: unsupported reshape " 4896 4906 "required - aborting.\n", 4897 4907 mdname(mddev)); 4898 4908 return -EINVAL; ··· 4905 4915 here_new = mddev->reshape_position; 4906 4916 if (sector_div(here_new, mddev->new_chunk_sectors * 4907 4917 (mddev->raid_disks - max_degraded))) { 4908 - printk(KERN_ERR "raid5: reshape_position not " 4909 - "on a stripe boundary\n"); 4918 + printk(KERN_ERR "md/raid:%s: reshape_position not " 4919 + "on a stripe boundary\n", mdname(mddev)); 4910 4920 return -EINVAL; 4911 4921 } 4912 4922 reshape_offset = here_new * mddev->new_chunk_sectors; ··· 4927 4937 if ((here_new * mddev->new_chunk_sectors != 4928 4938 here_old * mddev->chunk_sectors) || 4929 4939 mddev->ro == 0) { 4930 - printk(KERN_ERR "raid5: in-place reshape must be started" 4931 - " in read-only mode - aborting\n"); 4940 + printk(KERN_ERR "md/raid:%s: in-place reshape must be started" 4941 + " in read-only mode - aborting\n", 4942 + mdname(mddev)); 4932 4943 return -EINVAL; 4933 4944 } 4934 4945 } else if (mddev->delta_disks < 0 ··· 4938 4947 : (here_new * mddev->new_chunk_sectors >= 4939 4948 here_old * mddev->chunk_sectors)) { 4940 4949 /* Reading from the same stripe as writing to - bad */ 4941 - printk(KERN_ERR "raid5: reshape_position too early for " 4942 - "auto-recovery - aborting.\n"); 4950 + printk(KERN_ERR "md/raid:%s: reshape_position too early for " 4951 + "auto-recovery - aborting.\n", 4952 + mdname(mddev)); 4943 4953 return -EINVAL; 4944 4954 } 4945 - printk(KERN_INFO "raid5: reshape will continue\n"); 4955 + printk(KERN_INFO "md/raid:%s: reshape will continue\n", 4956 + mdname(mddev)); 4946 4957 /* OK, we should be able to continue; */ 4947 4958 } else { 4948 4959 BUG_ON(mddev->level != mddev->new_level); ··· 4986 4993 mddev->minor_version > 90) 4987 4994 rdev->recovery_offset = reshape_offset; 4988 4995 4989 - printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n", 4990 - rdev->raid_disk, working_disks, conf->prev_algo, 4991 - conf->previous_raid_disks, conf->max_degraded, 4992 - conf->algorithm, conf->raid_disks, 4993 - only_parity(rdev->raid_disk, 4994 - conf->prev_algo, 4995 - conf->previous_raid_disks, 4996 - conf->max_degraded), 4997 - only_parity(rdev->raid_disk, 4998 - conf->algorithm, 4999 - conf->raid_disks, 5000 - conf->max_degraded)); 5001 4996 if (rdev->recovery_offset < reshape_offset) { 5002 4997 /* We need to check old and new layout */ 5003 4998 if (!only_parity(rdev->raid_disk, ··· 5006 5025 - working_disks); 5007 5026 5008 5027 if (mddev->degraded > conf->max_degraded) { 5009 - printk(KERN_ERR "raid5: not enough operational devices for %s" 5028 + printk(KERN_ERR "md/raid:%s: not enough operational devices" 5010 5029 " (%d/%d failed)\n", 5011 5030 mdname(mddev), mddev->degraded, conf->raid_disks); 5012 5031 goto abort; ··· 5020 5039 
mddev->recovery_cp != MaxSector) { 5021 5040 if (mddev->ok_start_degraded) 5022 5041 printk(KERN_WARNING 5023 - "raid5: starting dirty degraded array: %s" 5024 - "- data corruption possible.\n", 5042 + "md/raid:%s: starting dirty degraded array" 5043 + " - data corruption possible.\n", 5025 5044 mdname(mddev)); 5026 5045 else { 5027 5046 printk(KERN_ERR 5028 - "raid5: cannot start dirty degraded array for %s\n", 5047 + "md/raid:%s: cannot start dirty degraded array.\n", 5029 5048 mdname(mddev)); 5030 5049 goto abort; 5031 5050 } 5032 5051 } 5033 5052 5034 5053 if (mddev->degraded == 0) 5035 - printk("raid5: raid level %d set %s active with %d out of %d" 5036 - " devices, algorithm %d\n", conf->level, mdname(mddev), 5054 + printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5055 + " devices, algorithm %d\n", mdname(mddev), conf->level, 5037 5056 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5038 5057 mddev->new_layout); 5039 5058 else 5040 - printk(KERN_ALERT "raid5: raid level %d set %s active with %d" 5041 - " out of %d devices, algorithm %d\n", conf->level, 5042 - mdname(mddev), mddev->raid_disks - mddev->degraded, 5043 - mddev->raid_disks, mddev->new_layout); 5059 + printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5060 + " out of %d devices, algorithm %d\n", 5061 + mdname(mddev), conf->level, 5062 + mddev->raid_disks - mddev->degraded, 5063 + mddev->raid_disks, mddev->new_layout); 5044 5064 5045 5065 print_raid5_conf(conf); 5046 5066 5047 5067 if (conf->reshape_progress != MaxSector) { 5048 - printk("...ok start reshape thread\n"); 5049 5068 conf->reshape_safe = conf->reshape_progress; 5050 5069 atomic_set(&conf->reshape_stripes, 0); 5051 5070 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); ··· 5068 5087 } 5069 5088 5070 5089 /* Ok, everything is just fine now */ 5071 - if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5090 + if (mddev->to_remove == &raid5_attrs_group) 5091 + mddev->to_remove = NULL; 5092 + else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5072 5093 printk(KERN_WARNING 5073 - "raid5: failed to create sysfs attributes for %s\n", 5094 + "md/raid:%s: failed to create sysfs attributes.\n", 5074 5095 mdname(mddev)); 5075 5096 5076 5097 mddev->queue->queue_lock = &conf->device_lock; ··· 5102 5119 free_conf(conf); 5103 5120 } 5104 5121 mddev->private = NULL; 5105 - printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); 5122 + printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5106 5123 return -EIO; 5107 5124 } 5108 5125 5109 - 5110 - 5111 5126 static int stop(mddev_t *mddev) 5112 5127 { 5113 - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 5128 + raid5_conf_t *conf = mddev->private; 5114 5129 5115 5130 md_unregister_thread(mddev->thread); 5116 5131 mddev->thread = NULL; 5117 5132 mddev->queue->backing_dev_info.congested_fn = NULL; 5118 5133 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5119 5134 free_conf(conf); 5120 - mddev->private = &raid5_attrs_group; 5135 + mddev->private = NULL; 5136 + mddev->to_remove = &raid5_attrs_group; 5121 5137 return 0; 5122 5138 } 5123 5139 ··· 5157 5175 5158 5176 static void status(struct seq_file *seq, mddev_t *mddev) 5159 5177 { 5160 - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 5178 + raid5_conf_t *conf = mddev->private; 5161 5179 int i; 5162 5180 5163 5181 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, ··· 5179 5197 int i; 5180 5198 struct disk_info *tmp; 5181 5199 5182 - printk("RAID5 
conf printout:\n"); 5200 + printk(KERN_DEBUG "RAID conf printout:\n"); 5183 5201 if (!conf) { 5184 5202 printk("(conf==NULL)\n"); 5185 5203 return; 5186 5204 } 5187 - printk(" --- rd:%d wd:%d\n", conf->raid_disks, 5188 - conf->raid_disks - conf->mddev->degraded); 5205 + printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5206 + conf->raid_disks, 5207 + conf->raid_disks - conf->mddev->degraded); 5189 5208 5190 5209 for (i = 0; i < conf->raid_disks; i++) { 5191 5210 char b[BDEVNAME_SIZE]; 5192 5211 tmp = conf->disks + i; 5193 5212 if (tmp->rdev) 5194 - printk(" disk %d, o:%d, dev:%s\n", 5195 - i, !test_bit(Faulty, &tmp->rdev->flags), 5196 - bdevname(tmp->rdev->bdev,b)); 5213 + printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 5214 + i, !test_bit(Faulty, &tmp->rdev->flags), 5215 + bdevname(tmp->rdev->bdev, b)); 5197 5216 } 5198 5217 } 5199 5218 ··· 5317 5334 raid5_size(mddev, sectors, mddev->raid_disks)) 5318 5335 return -EINVAL; 5319 5336 set_capacity(mddev->gendisk, mddev->array_sectors); 5320 - mddev->changed = 1; 5321 5337 revalidate_disk(mddev->gendisk); 5322 5338 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { 5323 5339 mddev->recovery_cp = mddev->dev_sectors; ··· 5342 5360 > conf->max_nr_stripes || 5343 5361 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5344 5362 > conf->max_nr_stripes) { 5345 - printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", 5363 + printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 5364 + mdname(mddev), 5346 5365 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5347 5366 / STRIPE_SIZE)*4); 5348 5367 return 0; ··· 5414 5431 */ 5415 5432 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5416 5433 < mddev->array_sectors) { 5417 - printk(KERN_ERR "md: %s: array size must be reduced " 5434 + printk(KERN_ERR "md/raid:%s: array size must be reduced " 5418 5435 "before number of disks\n", mdname(mddev)); 5419 5436 return -EINVAL; 5420 5437 } ··· 5452 5469 if (sysfs_create_link(&mddev->kobj, 5453 5470 &rdev->kobj, nm)) 5454 5471 printk(KERN_WARNING 5455 - "raid5: failed to create " 5456 - " link %s for %s\n", 5457 - nm, mdname(mddev)); 5472 + "md/raid:%s: failed to create " 5473 + " link %s\n", 5474 + mdname(mddev), nm); 5458 5475 } else 5459 5476 break; 5460 5477 } ··· 5531 5548 if (mddev->delta_disks > 0) { 5532 5549 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5533 5550 set_capacity(mddev->gendisk, mddev->array_sectors); 5534 - mddev->changed = 1; 5535 5551 revalidate_disk(mddev->gendisk); 5536 5552 } else { 5537 5553 int d; ··· 5592 5610 spin_unlock_irq(&conf->device_lock); 5593 5611 break; 5594 5612 } 5613 + } 5614 + 5615 + 5616 + static void *raid45_takeover_raid0(mddev_t *mddev, int level) 5617 + { 5618 + struct raid0_private_data *raid0_priv = mddev->private; 5619 + 5620 + /* for raid0 takeover only one zone is supported */ 5621 + if (raid0_priv->nr_strip_zones > 1) { 5622 + printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", 5623 + mdname(mddev)); 5624 + return ERR_PTR(-EINVAL); 5625 + } 5626 + 5627 + mddev->new_level = level; 5628 + mddev->new_layout = ALGORITHM_PARITY_N; 5629 + mddev->new_chunk_sectors = mddev->chunk_sectors; 5630 + mddev->raid_disks += 1; 5631 + mddev->delta_disks = 1; 5632 + /* make sure it will be not marked as dirty */ 5633 + mddev->recovery_cp = MaxSector; 5634 + 5635 + return setup_conf(mddev); 5595 5636 } 5596 5637 5597 5638 ··· 5742 5737 static void *raid5_takeover(mddev_t *mddev) 5743 5738 { 5744 
5739 /* raid5 can take over: 5745 - * raid0 - if all devices are the same - make it a raid4 layout 5740 + * raid0 - if there is only one strip zone - make it a raid4 layout 5746 5741 * raid1 - if there are two drives. We need to know the chunk size 5747 5742 * raid4 - trivial - just use a raid4 layout. 5748 5743 * raid6 - Providing it is a *_6 layout 5749 5744 */ 5750 - 5745 + if (mddev->level == 0) 5746 + return raid45_takeover_raid0(mddev, 5); 5751 5747 if (mddev->level == 1) 5752 5748 return raid5_takeover_raid1(mddev); 5753 5749 if (mddev->level == 4) { ··· 5762 5756 return ERR_PTR(-EINVAL); 5763 5757 } 5764 5758 5759 + static void *raid4_takeover(mddev_t *mddev) 5760 + { 5761 + /* raid4 can take over: 5762 + * raid0 - if there is only one strip zone 5763 + * raid5 - if layout is right 5764 + */ 5765 + if (mddev->level == 0) 5766 + return raid45_takeover_raid0(mddev, 4); 5767 + if (mddev->level == 5 && 5768 + mddev->layout == ALGORITHM_PARITY_N) { 5769 + mddev->new_layout = 0; 5770 + mddev->new_level = 4; 5771 + return setup_conf(mddev); 5772 + } 5773 + return ERR_PTR(-EINVAL); 5774 + } 5765 5775 5766 5776 static struct mdk_personality raid5_personality; 5767 5777 ··· 5893 5871 .start_reshape = raid5_start_reshape, 5894 5872 .finish_reshape = raid5_finish_reshape, 5895 5873 .quiesce = raid5_quiesce, 5874 + .takeover = raid4_takeover, 5896 5875 }; 5897 5876 5898 5877 static int __init raid5_init(void)
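raid45_takeover_raid0() above can convert a single-zone raid0 in place because ALGORITHM_PARITY_N keeps all parity on the one extra (and initially missing) member, so every data chunk stays on the same disk and stripe it occupied in the raid0. The short sketch below just prints that unchanged placement for a hypothetical four-disk array; it illustrates the layout argument and is not kernel code.

#include <stdio.h>

/*
 * With ALGORITHM_PARITY_N the parity block of every stripe sits on the
 * last member, so a data chunk's (disk, stripe) position in the new
 * (n+1)-disk raid4/raid5 is the same as in the old n-disk raid0.
 */
int main(void)
{
    const int data_disks  = 4;              /* hypothetical 4-disk raid0 */
    const int parity_disk = data_disks;     /* the one member the takeover adds */
    int chunk;

    for (chunk = 0; chunk < 8; chunk++) {
        int disk   = chunk % data_disks;    /* same formula as raid0 striping */
        int stripe = chunk / data_disks;

        printf("data chunk %d -> disk %d, stripe %d (parity for this stripe on disk %d)\n",
               chunk, disk, stripe, parity_disk);
    }
    return 0;
}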