Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md: (45 commits)
md: don't insist on valid event count for spare devices.
md: simplify updating of event count to sometimes avoid updating spares.
md/raid6: Fix raid-6 read-error correction in degraded state
md: restore ability of spare drives to spin down.
md: Fix read balancing in RAID1 and RAID10 on drives > 2TB
md/linear: standardise all printk messages
md/raid0: tidy up printk messages.
md/raid10: tidy up printk messages.
md/raid1: improve printk messages
md/raid5: improve consistency of error messages.
md: remove EXPERIMENTAL designation from RAID10
md: allow integers to be passed to md/level
md: notify mdstat waiters of level change
md/raid4: permit raid0 takeover
md/raid1: delay reads that could overtake behind-writes.
md/raid1: fix confusing 'redirect sector' message.
md: don't unregister the thread in mddev_suspend
md: factor out init code for an mddev
md: pass mddev to make_request functions rather than request_queue
md: call md_stop_writes from md_stop
...

+987 -582
+2 -2
drivers/md/Kconfig
··· 100 If unsure, say Y. 101 102 config MD_RAID10 103 - tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)" 104 - depends on BLK_DEV_MD && EXPERIMENTAL 105 ---help--- 106 RAID-10 provides a combination of striping (RAID-0) and 107 mirroring (RAID-1) with easier configuration and more flexible
··· 100 If unsure, say Y. 101 102 config MD_RAID10 103 + tristate "RAID-10 (mirrored striping) mode" 104 + depends on BLK_DEV_MD 105 ---help--- 106 RAID-10 provides a combination of striping (RAID-0) and 107 mirroring (RAID-1) with easier configuration and more flexible
+35 -6
drivers/md/bitmap.c
··· 505 return; 506 } 507 spin_unlock_irqrestore(&bitmap->lock, flags); 508 - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 509 sb->events = cpu_to_le64(bitmap->mddev->events); 510 if (bitmap->mddev->events < bitmap->events_cleared) { 511 /* rocking back to read-only */ ··· 526 527 if (!bitmap || !bitmap->sb_page) 528 return; 529 - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 530 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); 531 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); 532 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); ··· 575 return err; 576 } 577 578 - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 579 580 chunksize = le32_to_cpu(sb->chunksize); 581 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; ··· 661 return 0; 662 } 663 spin_unlock_irqrestore(&bitmap->lock, flags); 664 - sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); 665 old = le32_to_cpu(sb->state) & bits; 666 switch (op) { 667 case MASK_SET: sb->state |= cpu_to_le32(bits); ··· 1292 if (!bitmap) return 0; 1293 1294 if (behind) { 1295 atomic_inc(&bitmap->behind_writes); 1296 PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", 1297 - atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); 1298 } 1299 1300 while (sectors) { ··· 1356 { 1357 if (!bitmap) return; 1358 if (behind) { 1359 - atomic_dec(&bitmap->behind_writes); 1360 PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", 1361 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); 1362 } ··· 1681 atomic_set(&bitmap->pending_writes, 0); 1682 init_waitqueue_head(&bitmap->write_wait); 1683 init_waitqueue_head(&bitmap->overflow_wait); 1684 1685 bitmap->mddev = mddev; 1686 ··· 2013 static struct md_sysfs_entry bitmap_can_clear = 2014 __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); 2015 2016 static struct attribute *md_bitmap_attrs[] = { 2017 &bitmap_location.attr, 2018 &bitmap_timeout.attr, ··· 2041 &bitmap_chunksize.attr, 2042 &bitmap_metadata.attr, 2043 &bitmap_can_clear.attr, 2044 NULL 2045 }; 2046 struct attribute_group md_bitmap_group = {
··· 505 return; 506 } 507 spin_unlock_irqrestore(&bitmap->lock, flags); 508 + sb = kmap_atomic(bitmap->sb_page, KM_USER0); 509 sb->events = cpu_to_le64(bitmap->mddev->events); 510 if (bitmap->mddev->events < bitmap->events_cleared) { 511 /* rocking back to read-only */ ··· 526 527 if (!bitmap || !bitmap->sb_page) 528 return; 529 + sb = kmap_atomic(bitmap->sb_page, KM_USER0); 530 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); 531 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); 532 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); ··· 575 return err; 576 } 577 578 + sb = kmap_atomic(bitmap->sb_page, KM_USER0); 579 580 chunksize = le32_to_cpu(sb->chunksize); 581 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; ··· 661 return 0; 662 } 663 spin_unlock_irqrestore(&bitmap->lock, flags); 664 + sb = kmap_atomic(bitmap->sb_page, KM_USER0); 665 old = le32_to_cpu(sb->state) & bits; 666 switch (op) { 667 case MASK_SET: sb->state |= cpu_to_le32(bits); ··· 1292 if (!bitmap) return 0; 1293 1294 if (behind) { 1295 + int bw; 1296 atomic_inc(&bitmap->behind_writes); 1297 + bw = atomic_read(&bitmap->behind_writes); 1298 + if (bw > bitmap->behind_writes_used) 1299 + bitmap->behind_writes_used = bw; 1300 + 1301 PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", 1302 + bw, bitmap->max_write_behind); 1303 } 1304 1305 while (sectors) { ··· 1351 { 1352 if (!bitmap) return; 1353 if (behind) { 1354 + if (atomic_dec_and_test(&bitmap->behind_writes)) 1355 + wake_up(&bitmap->behind_wait); 1356 PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", 1357 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); 1358 } ··· 1675 atomic_set(&bitmap->pending_writes, 0); 1676 init_waitqueue_head(&bitmap->write_wait); 1677 init_waitqueue_head(&bitmap->overflow_wait); 1678 + init_waitqueue_head(&bitmap->behind_wait); 1679 1680 bitmap->mddev = mddev; 1681 ··· 2006 static struct md_sysfs_entry bitmap_can_clear = 2007 __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); 2008 2009 + static ssize_t 2010 + behind_writes_used_show(mddev_t *mddev, char *page) 2011 + { 2012 + if (mddev->bitmap == NULL) 2013 + return sprintf(page, "0\n"); 2014 + return sprintf(page, "%lu\n", 2015 + mddev->bitmap->behind_writes_used); 2016 + } 2017 + 2018 + static ssize_t 2019 + behind_writes_used_reset(mddev_t *mddev, const char *buf, size_t len) 2020 + { 2021 + if (mddev->bitmap) 2022 + mddev->bitmap->behind_writes_used = 0; 2023 + return len; 2024 + } 2025 + 2026 + static struct md_sysfs_entry max_backlog_used = 2027 + __ATTR(max_backlog_used, S_IRUGO | S_IWUSR, 2028 + behind_writes_used_show, behind_writes_used_reset); 2029 + 2030 static struct attribute *md_bitmap_attrs[] = { 2031 &bitmap_location.attr, 2032 &bitmap_timeout.attr, ··· 2013 &bitmap_chunksize.attr, 2014 &bitmap_metadata.attr, 2015 &bitmap_can_clear.attr, 2016 + &max_backlog_used.attr, 2017 NULL 2018 }; 2019 struct attribute_group md_bitmap_group = {
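The bitmap.c hunk above tracks the largest write-behind backlog ever observed (behind_writes_used), exposes it through the new max_backlog_used sysfs attribute, and wakes the new behind_wait queue when the in-flight count drops to zero. Below is a minimal userspace sketch of that high-water-mark pattern, assuming C11 atomics; the names mirror the patch, but this is an illustration, not the kernel implementation.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int behind_writes;   /* in-flight write-behind requests */
static int behind_writes_used;     /* highest backlog seen so far */

static void start_write_behind(void)
{
	int bw = atomic_fetch_add(&behind_writes, 1) + 1;
	if (bw > behind_writes_used)   /* a statistic, so the race is tolerated */
		behind_writes_used = bw;
}

static void end_write_behind(void)
{
	if (atomic_fetch_sub(&behind_writes, 1) == 1)
		printf("backlog empty: would wake_up(&bitmap->behind_wait)\n");
}

int main(void)
{
	start_write_behind();
	start_write_behind();
	end_write_behind();
	end_write_behind();
	printf("max_backlog_used: %d\n", behind_writes_used);   /* prints 2 */
	return 0;
}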
+2
drivers/md/bitmap.h
··· 227 int allclean; 228 229 atomic_t behind_writes; 230 231 /* 232 * the bitmap daemon - periodically wakes up and sweeps the bitmap ··· 240 atomic_t pending_writes; /* pending writes to the bitmap file */ 241 wait_queue_head_t write_wait; 242 wait_queue_head_t overflow_wait; 243 244 struct sysfs_dirent *sysfs_can_clear; 245 };
··· 227 int allclean; 228 229 atomic_t behind_writes; 230 + unsigned long behind_writes_used; /* highest actual value at runtime */ 231 232 /* 233 * the bitmap daemon - periodically wakes up and sweeps the bitmap ··· 239 atomic_t pending_writes; /* pending writes to the bitmap file */ 240 wait_queue_head_t write_wait; 241 wait_queue_head_t overflow_wait; 242 + wait_queue_head_t behind_wait; 243 244 struct sysfs_dirent *sysfs_can_clear; 245 };
+4 -5
drivers/md/faulty.c
··· 169 conf->nfaults = n+1; 170 } 171 172 - static int make_request(struct request_queue *q, struct bio *bio) 173 { 174 - mddev_t *mddev = q->queuedata; 175 - conf_t *conf = (conf_t*)mddev->private; 176 int failit = 0; 177 178 if (bio_data_dir(bio) == WRITE) { ··· 224 225 static void status(struct seq_file *seq, mddev_t *mddev) 226 { 227 - conf_t *conf = (conf_t*)mddev->private; 228 int n; 229 230 if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) ··· 327 328 static int stop(mddev_t *mddev) 329 { 330 - conf_t *conf = (conf_t *)mddev->private; 331 332 kfree(conf); 333 mddev->private = NULL;
··· 169 conf->nfaults = n+1; 170 } 171 172 + static int make_request(mddev_t *mddev, struct bio *bio) 173 { 174 + conf_t *conf = mddev->private; 175 int failit = 0; 176 177 if (bio_data_dir(bio) == WRITE) { ··· 225 226 static void status(struct seq_file *seq, mddev_t *mddev) 227 { 228 + conf_t *conf = mddev->private; 229 int n; 230 231 if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) ··· 328 329 static int stop(mddev_t *mddev) 330 { 331 + conf_t *conf = mddev->private; 332 333 kfree(conf); 334 mddev->private = NULL;
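The faulty.c change is the simplest instance of the series-wide interface change ("md: pass mddev to make_request functions rather than request_queue"): personalities now receive the mddev directly instead of recovering it from q->queuedata, and per-array I/O accounting moves into the core md_make_request. A rough userspace sketch of that dispatch shape, using stand-in types rather than the real kernel structures:

#include <stdio.h>

struct bio { long long sector; };
struct mddev { const char *name; void *private; };

/* new-style personality hook: takes the array, not the request queue */
static int faulty_make_request(struct mddev *mddev, struct bio *bio)
{
	printf("%s: routing bio at sector %lld\n", mddev->name, bio->sector);
	return 0;
}

/* the core keeps the queue-to-array mapping and does accounting once, here */
static int md_make_request(struct mddev *mddev, struct bio *bio)
{
	/* part_stat-style accounting would happen here for every personality */
	return faulty_make_request(mddev, bio);
}

int main(void)
{
	struct mddev md = { .name = "md0", .private = NULL };
	struct bio b = { .sector = 128 };
	return md_make_request(&md, &b);
}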
+16 -20
drivers/md/linear.c
··· 159 sector_t sectors; 160 161 if (j < 0 || j >= raid_disks || disk->rdev) { 162 - printk("linear: disk numbering problem. Aborting!\n"); 163 goto out; 164 } 165 ··· 188 189 } 190 if (cnt != raid_disks) { 191 - printk("linear: not enough drives present. Aborting!\n"); 192 goto out; 193 } 194 ··· 284 rcu_barrier(); 285 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 286 kfree(conf); 287 288 return 0; 289 } 290 291 - static int linear_make_request (struct request_queue *q, struct bio *bio) 292 { 293 - const int rw = bio_data_dir(bio); 294 - mddev_t *mddev = q->queuedata; 295 dev_info_t *tmp_dev; 296 sector_t start_sector; 297 - int cpu; 298 299 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 300 md_barrier_request(mddev, bio); 301 return 0; 302 } 303 - 304 - cpu = part_stat_lock(); 305 - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 306 - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 307 - bio_sectors(bio)); 308 - part_stat_unlock(); 309 310 rcu_read_lock(); 311 tmp_dev = which_dev(mddev, bio->bi_sector); ··· 308 || (bio->bi_sector < start_sector))) { 309 char b[BDEVNAME_SIZE]; 310 311 - printk("linear_make_request: Sector %llu out of bounds on " 312 - "dev %s: %llu sectors, offset %llu\n", 313 - (unsigned long long)bio->bi_sector, 314 - bdevname(tmp_dev->rdev->bdev, b), 315 - (unsigned long long)tmp_dev->rdev->sectors, 316 - (unsigned long long)start_sector); 317 rcu_read_unlock(); 318 bio_io_error(bio); 319 return 0; ··· 332 333 bp = bio_split(bio, end_sector - bio->bi_sector); 334 335 - if (linear_make_request(q, &bp->bio1)) 336 generic_make_request(&bp->bio1); 337 - if (linear_make_request(q, &bp->bio2)) 338 generic_make_request(&bp->bio2); 339 bio_pair_release(bp); 340 return 0;
··· 159 sector_t sectors; 160 161 if (j < 0 || j >= raid_disks || disk->rdev) { 162 + printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n", 163 + mdname(mddev)); 164 goto out; 165 } 166 ··· 187 188 } 189 if (cnt != raid_disks) { 190 + printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", 191 + mdname(mddev)); 192 goto out; 193 } 194 ··· 282 rcu_barrier(); 283 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 284 kfree(conf); 285 + mddev->private = NULL; 286 287 return 0; 288 } 289 290 + static int linear_make_request (mddev_t *mddev, struct bio *bio) 291 { 292 dev_info_t *tmp_dev; 293 sector_t start_sector; 294 295 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 296 md_barrier_request(mddev, bio); 297 return 0; 298 } 299 300 rcu_read_lock(); 301 tmp_dev = which_dev(mddev, bio->bi_sector); ··· 314 || (bio->bi_sector < start_sector))) { 315 char b[BDEVNAME_SIZE]; 316 317 + printk(KERN_ERR 318 + "md/linear:%s: make_request: Sector %llu out of bounds on " 319 + "dev %s: %llu sectors, offset %llu\n", 320 + mdname(mddev), 321 + (unsigned long long)bio->bi_sector, 322 + bdevname(tmp_dev->rdev->bdev, b), 323 + (unsigned long long)tmp_dev->rdev->sectors, 324 + (unsigned long long)start_sector); 325 rcu_read_unlock(); 326 bio_io_error(bio); 327 return 0; ··· 336 337 bp = bio_split(bio, end_sector - bio->bi_sector); 338 339 + if (linear_make_request(mddev, &bp->bio1)) 340 generic_make_request(&bp->bio1); 341 + if (linear_make_request(mddev, &bp->bio2)) 342 generic_make_request(&bp->bio2); 343 bio_pair_release(bp); 344 return 0;
+329 -208
drivers/md/md.c
··· 215 */ 216 static int md_make_request(struct request_queue *q, struct bio *bio) 217 { 218 mddev_t *mddev = q->queuedata; 219 int rv; 220 if (mddev == NULL || mddev->pers == NULL) { 221 bio_io_error(bio); 222 return 0; ··· 240 } 241 atomic_inc(&mddev->active_io); 242 rcu_read_unlock(); 243 - rv = mddev->pers->make_request(q, bio); 244 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 245 wake_up(&mddev->sb_wait); 246 247 return rv; 248 } 249 250 static void mddev_suspend(mddev_t *mddev) 251 { 252 BUG_ON(mddev->suspended); ··· 268 synchronize_rcu(); 269 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 270 mddev->pers->quiesce(mddev, 1); 271 - md_unregister_thread(mddev->thread); 272 - mddev->thread = NULL; 273 - /* we now know that no code is executing in the personality module, 274 - * except possibly the tail end of a ->bi_end_io function, but that 275 - * is certain to complete before the module has a chance to get 276 - * unloaded 277 - */ 278 } 279 280 static void mddev_resume(mddev_t *mddev) ··· 354 bio_endio(bio, 0); 355 else { 356 bio->bi_rw &= ~(1<<BIO_RW_BARRIER); 357 - if (mddev->pers->make_request(mddev->queue, bio)) 358 generic_make_request(bio); 359 mddev->barrier = POST_REQUEST_BARRIER; 360 submit_barriers(mddev); ··· 414 kfree(mddev); 415 } 416 spin_unlock(&all_mddevs_lock); 417 } 418 419 static mddev_t * mddev_find(dev_t unit) ··· 503 else 504 new->md_minor = MINOR(unit) >> MdpMinorShift; 505 506 - mutex_init(&new->open_mutex); 507 - mutex_init(&new->reconfig_mutex); 508 - mutex_init(&new->bitmap_info.mutex); 509 - INIT_LIST_HEAD(&new->disks); 510 - INIT_LIST_HEAD(&new->all_mddevs); 511 - init_timer(&new->safemode_timer); 512 - atomic_set(&new->active, 1); 513 - atomic_set(&new->openers, 0); 514 - atomic_set(&new->active_io, 0); 515 - spin_lock_init(&new->write_lock); 516 - atomic_set(&new->flush_pending, 0); 517 - init_waitqueue_head(&new->sb_wait); 518 - init_waitqueue_head(&new->recovery_wait); 519 - new->reshape_position = MaxSector; 520 - new->resync_min = 0; 521 - new->resync_max = MaxSector; 522 - new->level = LEVEL_NONE; 523 524 goto retry; 525 } ··· 523 return mutex_trylock(&mddev->reconfig_mutex); 524 } 525 526 - static inline void mddev_unlock(mddev_t * mddev) 527 { 528 - mutex_unlock(&mddev->reconfig_mutex); 529 530 md_wakeup_thread(mddev->thread); 531 } ··· 1071 mddev->bitmap_info.default_offset; 1072 1073 } else if (mddev->pers == NULL) { 1074 - /* Insist on good event counter while assembling */ 1075 ++ev1; 1076 - if (ev1 < mddev->events) 1077 - return -EINVAL; 1078 } else if (mddev->bitmap) { 1079 /* if adding to array with a bitmap, then we can accept an 1080 * older device ... but not too old. ··· 1473 } 1474 1475 } else if (mddev->pers == NULL) { 1476 - /* Insist of good event counter while assembling */ 1477 ++ev1; 1478 - if (ev1 < mddev->events) 1479 - return -EINVAL; 1480 } else if (mddev->bitmap) { 1481 /* If adding to array with a bitmap, then we can accept an 1482 * older device, but not too old. 
··· 2096 if (rdev->sb_events == mddev->events || 2097 (nospares && 2098 rdev->raid_disk < 0 && 2099 - (rdev->sb_events&1)==0 && 2100 rdev->sb_events+1 == mddev->events)) { 2101 /* Don't update this superblock */ 2102 rdev->sb_loaded = 2; ··· 2148 * and 'events' is odd, we can roll back to the previous clean state */ 2149 if (nospares 2150 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2151 - && (mddev->events & 1) 2152 - && mddev->events != 1) 2153 mddev->events--; 2154 - else { 2155 /* otherwise we have to go forward and ... */ 2156 mddev->events ++; 2157 - if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 2158 - /* .. if the array isn't clean, an 'even' event must also go 2159 - * to spares. */ 2160 - if ((mddev->events&1)==0) { 2161 - nospares = 0; 2162 - sync_req = 2; /* force a second update to get the 2163 - * even/odd in sync */ 2164 - } 2165 - } else { 2166 - /* otherwise an 'odd' event must go to spares */ 2167 - if ((mddev->events&1)) { 2168 - nospares = 0; 2169 - sync_req = 2; /* force a second update to get the 2170 - * even/odd in sync */ 2171 - } 2172 - } 2173 } 2174 2175 if (!mddev->events) { ··· 2399 return err; 2400 sprintf(nm, "rd%d", rdev->raid_disk); 2401 sysfs_remove_link(&rdev->mddev->kobj, nm); 2402 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2403 md_wakeup_thread(rdev->mddev->thread); 2404 } else if (rdev->mddev->pers) { ··· 2815 2816 i = 0; 2817 rdev_for_each(rdev, tmp, mddev) { 2818 - if (rdev->desc_nr >= mddev->max_disks || 2819 - i > mddev->max_disks) { 2820 printk(KERN_WARNING 2821 "md: %s: %s: only %d devices permitted\n", 2822 mdname(mddev), bdevname(rdev->bdev, b), ··· 2933 static ssize_t 2934 level_store(mddev_t *mddev, const char *buf, size_t len) 2935 { 2936 - char level[16]; 2937 ssize_t rv = len; 2938 struct mdk_personality *pers; 2939 void *priv; 2940 mdk_rdev_t *rdev; 2941 ··· 2969 } 2970 2971 /* Now find the new personality */ 2972 - if (len == 0 || len >= sizeof(level)) 2973 return -EINVAL; 2974 - strncpy(level, buf, len); 2975 - if (level[len-1] == '\n') 2976 len--; 2977 - level[len] = 0; 2978 2979 - request_module("md-%s", level); 2980 spin_lock(&pers_lock); 2981 - pers = find_pers(LEVEL_NONE, level); 2982 if (!pers || !try_module_get(pers->owner)) { 2983 spin_unlock(&pers_lock); 2984 - printk(KERN_WARNING "md: personality %s not loaded\n", level); 2985 return -EINVAL; 2986 } 2987 spin_unlock(&pers_lock); ··· 2997 if (!pers->takeover) { 2998 module_put(pers->owner); 2999 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 3000 - mdname(mddev), level); 3001 return -EINVAL; 3002 } 3003 ··· 3013 mddev->delta_disks = 0; 3014 module_put(pers->owner); 3015 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3016 - mdname(mddev), level); 3017 return PTR_ERR(priv); 3018 } 3019 3020 /* Looks like we have a winner */ 3021 mddev_suspend(mddev); 3022 mddev->pers->stop(mddev); 3023 module_put(mddev->pers->owner); 3024 /* Invalidate devices that are now superfluous */ 3025 list_for_each_entry(rdev, &mddev->disks, same_set) ··· 3065 mddev->layout = mddev->new_layout; 3066 mddev->chunk_sectors = mddev->new_chunk_sectors; 3067 mddev->delta_disks = 0; 3068 pers->run(mddev); 3069 mddev_resume(mddev); 3070 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3071 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3072 md_wakeup_thread(mddev->thread); 3073 return rv; 3074 } 3075 ··· 3317 } 3318 3319 static int do_md_stop(mddev_t * mddev, int ro, int is_open); 3320 static int do_md_run(mddev_t * mddev); 3321 static 
int restart_array(mddev_t *mddev); 3322 ··· 3348 break; /* not supported yet */ 3349 case readonly: 3350 if (mddev->pers) 3351 - err = do_md_stop(mddev, 1, 0); 3352 else { 3353 mddev->ro = 1; 3354 set_disk_ro(mddev->gendisk, 1); ··· 3358 case read_auto: 3359 if (mddev->pers) { 3360 if (mddev->ro == 0) 3361 - err = do_md_stop(mddev, 1, 0); 3362 else if (mddev->ro == 1) 3363 err = restart_array(mddev); 3364 if (err == 0) { ··· 4163 { 4164 mddev_t *mddev = container_of(ws, mddev_t, del_work); 4165 4166 - if (mddev->private) { 4167 - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 4168 - if (mddev->private != (void*)1) 4169 - sysfs_remove_group(&mddev->kobj, mddev->private); 4170 - if (mddev->sysfs_action) 4171 - sysfs_put(mddev->sysfs_action); 4172 - mddev->sysfs_action = NULL; 4173 - mddev->private = NULL; 4174 - } 4175 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4176 kobject_del(&mddev->kobj); 4177 kobject_put(&mddev->kobj); ··· 4306 4307 static int start_dirty_degraded; 4308 4309 - static int do_md_run(mddev_t * mddev) 4310 { 4311 int err; 4312 mdk_rdev_t *rdev; 4313 - struct gendisk *disk; 4314 struct mdk_personality *pers; 4315 4316 if (list_empty(&mddev->disks)) ··· 4318 4319 if (mddev->pers) 4320 return -EBUSY; 4321 4322 /* 4323 * Analyze all RAID superblock(s) ··· 4373 } 4374 sysfs_notify_dirent(rdev->sysfs_state); 4375 } 4376 - 4377 - disk = mddev->gendisk; 4378 4379 spin_lock(&pers_lock); 4380 pers = find_pers(mddev->level, mddev->clevel); ··· 4501 if (mddev->flags) 4502 md_update_sb(mddev, 0); 4503 4504 - set_capacity(disk, mddev->array_sectors); 4505 - 4506 md_wakeup_thread(mddev->thread); 4507 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4508 4509 - revalidate_disk(mddev->gendisk); 4510 - mddev->changed = 1; 4511 md_new_event(mddev); 4512 sysfs_notify_dirent(mddev->sysfs_state); 4513 if (mddev->sysfs_action) 4514 sysfs_notify_dirent(mddev->sysfs_action); 4515 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4516 - kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4517 return 0; 4518 } 4519 4520 static int restart_array(mddev_t *mddev) ··· 4577 spin_unlock(&inode->i_lock); 4578 } 4579 4580 /* mode: 4581 * 0 - completely stop and dis-assemble array 4582 - * 1 - switch to readonly 4583 * 2 - stop but do not disassemble array 4584 */ 4585 static int do_md_stop(mddev_t * mddev, int mode, int is_open) ··· 4695 err = -EBUSY; 4696 } else if (mddev->pers) { 4697 4698 - if (mddev->sync_thread) { 4699 - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4700 - set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4701 - md_unregister_thread(mddev->sync_thread); 4702 - mddev->sync_thread = NULL; 4703 - } 4704 4705 - del_timer_sync(&mddev->safemode_timer); 4706 4707 - switch(mode) { 4708 - case 1: /* readonly */ 4709 - err = -ENXIO; 4710 - if (mddev->ro==1) 4711 - goto out; 4712 - mddev->ro = 1; 4713 - break; 4714 - case 0: /* disassemble */ 4715 - case 2: /* stop */ 4716 - bitmap_flush(mddev); 4717 - md_super_wait(mddev); 4718 - if (mddev->ro) 4719 - set_disk_ro(disk, 0); 4720 4721 - mddev->pers->stop(mddev); 4722 - mddev->queue->merge_bvec_fn = NULL; 4723 - mddev->queue->unplug_fn = NULL; 4724 - mddev->queue->backing_dev_info.congested_fn = NULL; 4725 - module_put(mddev->pers->owner); 4726 - if (mddev->pers->sync_request && mddev->private == NULL) 4727 - mddev->private = (void*)1; 4728 - mddev->pers = NULL; 4729 - /* tell userspace to handle 'inactive' */ 4730 - sysfs_notify_dirent(mddev->sysfs_state); 4731 4732 - list_for_each_entry(rdev, 
&mddev->disks, same_set) 4733 - if (rdev->raid_disk >= 0) { 4734 - char nm[20]; 4735 - sprintf(nm, "rd%d", rdev->raid_disk); 4736 - sysfs_remove_link(&mddev->kobj, nm); 4737 - } 4738 4739 - set_capacity(disk, 0); 4740 - mddev->changed = 1; 4741 - 4742 - if (mddev->ro) 4743 - mddev->ro = 0; 4744 - } 4745 - if (!mddev->in_sync || mddev->flags) { 4746 - /* mark array as shutdown cleanly */ 4747 - mddev->in_sync = 1; 4748 - md_update_sb(mddev, 1); 4749 - } 4750 - if (mode == 1) 4751 - set_disk_ro(disk, 1); 4752 - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4753 err = 0; 4754 } 4755 - out: 4756 mutex_unlock(&mddev->open_mutex); 4757 if (err) 4758 return err; ··· 4741 4742 export_array(mddev); 4743 4744 - mddev->array_sectors = 0; 4745 - mddev->external_size = 0; 4746 - mddev->dev_sectors = 0; 4747 - mddev->raid_disks = 0; 4748 - mddev->recovery_cp = 0; 4749 - mddev->resync_min = 0; 4750 - mddev->resync_max = MaxSector; 4751 - mddev->reshape_position = MaxSector; 4752 - mddev->external = 0; 4753 - mddev->persistent = 0; 4754 - mddev->level = LEVEL_NONE; 4755 - mddev->clevel[0] = 0; 4756 - mddev->flags = 0; 4757 - mddev->ro = 0; 4758 - mddev->metadata_type[0] = 0; 4759 - mddev->chunk_sectors = 0; 4760 - mddev->ctime = mddev->utime = 0; 4761 - mddev->layout = 0; 4762 - mddev->max_disks = 0; 4763 - mddev->events = 0; 4764 - mddev->delta_disks = 0; 4765 - mddev->new_level = LEVEL_NONE; 4766 - mddev->new_layout = 0; 4767 - mddev->new_chunk_sectors = 0; 4768 - mddev->curr_resync = 0; 4769 - mddev->resync_mismatches = 0; 4770 - mddev->suspend_lo = mddev->suspend_hi = 0; 4771 - mddev->sync_speed_min = mddev->sync_speed_max = 0; 4772 - mddev->recovery = 0; 4773 - mddev->in_sync = 0; 4774 - mddev->changed = 0; 4775 - mddev->degraded = 0; 4776 - mddev->barriers_work = 0; 4777 - mddev->safemode = 0; 4778 - mddev->bitmap_info.offset = 0; 4779 - mddev->bitmap_info.default_offset = 0; 4780 - mddev->bitmap_info.chunksize = 0; 4781 - mddev->bitmap_info.daemon_sleep = 0; 4782 - mddev->bitmap_info.max_write_behind = 0; 4783 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4784 if (mddev->hold_active == UNTIL_STOP) 4785 mddev->hold_active = 0; 4786 4787 - } else if (mddev->pers) 4788 - printk(KERN_INFO "md: %s switched to read-only mode.\n", 4789 - mdname(mddev)); 4790 err = 0; 4791 blk_integrity_unregister(disk); 4792 md_new_event(mddev); ··· 5464 if (mddev->pers->check_reshape == NULL) 5465 return -EINVAL; 5466 if (raid_disks <= 0 || 5467 - raid_disks >= mddev->max_disks) 5468 return -EINVAL; 5469 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5470 return -EBUSY; ··· 5601 5602 geo->heads = 2; 5603 geo->sectors = 4; 5604 - geo->cylinders = get_capacity(mddev->gendisk) / 8; 5605 return 0; 5606 } 5607 ··· 5611 int err = 0; 5612 void __user *argp = (void __user *)arg; 5613 mddev_t *mddev = NULL; 5614 5615 if (!capable(CAP_SYS_ADMIN)) 5616 return -EACCES; ··· 5744 goto done_unlock; 5745 5746 case STOP_ARRAY_RO: 5747 - err = do_md_stop(mddev, 1, 1); 5748 goto done_unlock; 5749 5750 } 5751 5752 /* ··· 5895 atomic_inc(&mddev->openers); 5896 mutex_unlock(&mddev->open_mutex); 5897 5898 - check_disk_change(bdev); 5899 out: 5900 return err; 5901 } ··· 5909 5910 return 0; 5911 } 5912 - 5913 - static int md_media_changed(struct gendisk *disk) 5914 - { 5915 - mddev_t *mddev = disk->private_data; 5916 - 5917 - return mddev->changed; 5918 - } 5919 - 5920 - static int md_revalidate(struct gendisk *disk) 5921 - { 5922 - mddev_t *mddev = disk->private_data; 5923 - 5924 - mddev->changed = 0; 5925 
- return 0; 5926 - } 5927 static const struct block_device_operations md_fops = 5928 { 5929 .owner = THIS_MODULE, ··· 5919 .compat_ioctl = md_compat_ioctl, 5920 #endif 5921 .getgeo = md_getgeo, 5922 - .media_changed = md_media_changed, 5923 - .revalidate_disk= md_revalidate, 5924 }; 5925 5926 static int md_thread(void * arg) ··· 6032 mddev->pers->error_handler(mddev,rdev); 6033 if (mddev->degraded) 6034 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6035 - set_bit(StateChanged, &rdev->flags); 6036 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6037 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6038 md_wakeup_thread(mddev->thread); ··· 7024 if (mddev->flags) 7025 md_update_sb(mddev, 0); 7026 7027 - list_for_each_entry(rdev, &mddev->disks, same_set) 7028 - if (test_and_clear_bit(StateChanged, &rdev->flags)) 7029 - sysfs_notify_dirent(rdev->sysfs_state); 7030 - 7031 - 7032 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 7033 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 7034 /* resync/recovery still happening */ ··· 7160 * appears to still be in use. Hence 7161 * the '100'. 7162 */ 7163 - do_md_stop(mddev, 1, 100); 7164 mddev_unlock(mddev); 7165 } 7166 /*
··· 215 */ 216 static int md_make_request(struct request_queue *q, struct bio *bio) 217 { 218 + const int rw = bio_data_dir(bio); 219 mddev_t *mddev = q->queuedata; 220 int rv; 221 + int cpu; 222 + 223 if (mddev == NULL || mddev->pers == NULL) { 224 bio_io_error(bio); 225 return 0; ··· 237 } 238 atomic_inc(&mddev->active_io); 239 rcu_read_unlock(); 240 + 241 + rv = mddev->pers->make_request(mddev, bio); 242 + 243 + cpu = part_stat_lock(); 244 + part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 245 + part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 246 + bio_sectors(bio)); 247 + part_stat_unlock(); 248 + 249 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 250 wake_up(&mddev->sb_wait); 251 252 return rv; 253 } 254 255 + /* mddev_suspend makes sure no new requests are submitted 256 + * to the device, and that any requests that have been submitted 257 + * are completely handled. 258 + * Once ->stop is called and completes, the module will be completely 259 + * unused. 260 + */ 261 static void mddev_suspend(mddev_t *mddev) 262 { 263 BUG_ON(mddev->suspended); ··· 251 synchronize_rcu(); 252 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 253 mddev->pers->quiesce(mddev, 1); 254 } 255 256 static void mddev_resume(mddev_t *mddev) ··· 344 bio_endio(bio, 0); 345 else { 346 bio->bi_rw &= ~(1<<BIO_RW_BARRIER); 347 + if (mddev->pers->make_request(mddev, bio)) 348 generic_make_request(bio); 349 mddev->barrier = POST_REQUEST_BARRIER; 350 submit_barriers(mddev); ··· 404 kfree(mddev); 405 } 406 spin_unlock(&all_mddevs_lock); 407 + } 408 + 409 + static void mddev_init(mddev_t *mddev) 410 + { 411 + mutex_init(&mddev->open_mutex); 412 + mutex_init(&mddev->reconfig_mutex); 413 + mutex_init(&mddev->bitmap_info.mutex); 414 + INIT_LIST_HEAD(&mddev->disks); 415 + INIT_LIST_HEAD(&mddev->all_mddevs); 416 + init_timer(&mddev->safemode_timer); 417 + atomic_set(&mddev->active, 1); 418 + atomic_set(&mddev->openers, 0); 419 + atomic_set(&mddev->active_io, 0); 420 + spin_lock_init(&mddev->write_lock); 421 + atomic_set(&mddev->flush_pending, 0); 422 + init_waitqueue_head(&mddev->sb_wait); 423 + init_waitqueue_head(&mddev->recovery_wait); 424 + mddev->reshape_position = MaxSector; 425 + mddev->resync_min = 0; 426 + mddev->resync_max = MaxSector; 427 + mddev->level = LEVEL_NONE; 428 } 429 430 static mddev_t * mddev_find(dev_t unit) ··· 472 else 473 new->md_minor = MINOR(unit) >> MdpMinorShift; 474 475 + mddev_init(new); 476 477 goto retry; 478 } ··· 508 return mutex_trylock(&mddev->reconfig_mutex); 509 } 510 511 + static struct attribute_group md_redundancy_group; 512 + 513 + static void mddev_unlock(mddev_t * mddev) 514 { 515 + if (mddev->to_remove) { 516 + /* These cannot be removed under reconfig_mutex as 517 + * an access to the files will try to take reconfig_mutex 518 + * while holding the file unremovable, which leads to 519 + * a deadlock. 520 + * So hold open_mutex instead - we are allowed to take 521 + * it while holding reconfig_mutex, and md_run can 522 + * use it to wait for the remove to complete. 
523 + */ 524 + struct attribute_group *to_remove = mddev->to_remove; 525 + mddev->to_remove = NULL; 526 + mutex_lock(&mddev->open_mutex); 527 + mutex_unlock(&mddev->reconfig_mutex); 528 + 529 + if (to_remove != &md_redundancy_group) 530 + sysfs_remove_group(&mddev->kobj, to_remove); 531 + if (mddev->pers == NULL || 532 + mddev->pers->sync_request == NULL) { 533 + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 534 + if (mddev->sysfs_action) 535 + sysfs_put(mddev->sysfs_action); 536 + mddev->sysfs_action = NULL; 537 + } 538 + mutex_unlock(&mddev->open_mutex); 539 + } else 540 + mutex_unlock(&mddev->reconfig_mutex); 541 542 md_wakeup_thread(mddev->thread); 543 } ··· 1029 mddev->bitmap_info.default_offset; 1030 1031 } else if (mddev->pers == NULL) { 1032 + /* Insist on good event counter while assembling, except 1033 + * for spares (which don't need an event count) */ 1034 ++ev1; 1035 + if (sb->disks[rdev->desc_nr].state & ( 1036 + (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1037 + if (ev1 < mddev->events) 1038 + return -EINVAL; 1039 } else if (mddev->bitmap) { 1040 /* if adding to array with a bitmap, then we can accept an 1041 * older device ... but not too old. ··· 1428 } 1429 1430 } else if (mddev->pers == NULL) { 1431 + /* Insist of good event counter while assembling, except for 1432 + * spares (which don't need an event count) */ 1433 ++ev1; 1434 + if (rdev->desc_nr >= 0 && 1435 + rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1436 + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) 1437 + if (ev1 < mddev->events) 1438 + return -EINVAL; 1439 } else if (mddev->bitmap) { 1440 /* If adding to array with a bitmap, then we can accept an 1441 * older device, but not too old. ··· 2047 if (rdev->sb_events == mddev->events || 2048 (nospares && 2049 rdev->raid_disk < 0 && 2050 rdev->sb_events+1 == mddev->events)) { 2051 /* Don't update this superblock */ 2052 rdev->sb_loaded = 2; ··· 2100 * and 'events' is odd, we can roll back to the previous clean state */ 2101 if (nospares 2102 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2103 + && mddev->can_decrease_events 2104 + && mddev->events != 1) { 2105 mddev->events--; 2106 + mddev->can_decrease_events = 0; 2107 + } else { 2108 /* otherwise we have to go forward and ... 
*/ 2109 mddev->events ++; 2110 + mddev->can_decrease_events = nospares; 2111 } 2112 2113 if (!mddev->events) { ··· 2365 return err; 2366 sprintf(nm, "rd%d", rdev->raid_disk); 2367 sysfs_remove_link(&rdev->mddev->kobj, nm); 2368 + rdev->raid_disk = -1; 2369 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2370 md_wakeup_thread(rdev->mddev->thread); 2371 } else if (rdev->mddev->pers) { ··· 2780 2781 i = 0; 2782 rdev_for_each(rdev, tmp, mddev) { 2783 + if (mddev->max_disks && 2784 + (rdev->desc_nr >= mddev->max_disks || 2785 + i > mddev->max_disks)) { 2786 printk(KERN_WARNING 2787 "md: %s: %s: only %d devices permitted\n", 2788 mdname(mddev), bdevname(rdev->bdev, b), ··· 2897 static ssize_t 2898 level_store(mddev_t *mddev, const char *buf, size_t len) 2899 { 2900 + char clevel[16]; 2901 ssize_t rv = len; 2902 struct mdk_personality *pers; 2903 + long level; 2904 void *priv; 2905 mdk_rdev_t *rdev; 2906 ··· 2932 } 2933 2934 /* Now find the new personality */ 2935 + if (len == 0 || len >= sizeof(clevel)) 2936 return -EINVAL; 2937 + strncpy(clevel, buf, len); 2938 + if (clevel[len-1] == '\n') 2939 len--; 2940 + clevel[len] = 0; 2941 + if (strict_strtol(clevel, 10, &level)) 2942 + level = LEVEL_NONE; 2943 2944 + if (request_module("md-%s", clevel) != 0) 2945 + request_module("md-level-%s", clevel); 2946 spin_lock(&pers_lock); 2947 + pers = find_pers(level, clevel); 2948 if (!pers || !try_module_get(pers->owner)) { 2949 spin_unlock(&pers_lock); 2950 + printk(KERN_WARNING "md: personality %s not loaded\n", clevel); 2951 return -EINVAL; 2952 } 2953 spin_unlock(&pers_lock); ··· 2957 if (!pers->takeover) { 2958 module_put(pers->owner); 2959 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 2960 + mdname(mddev), clevel); 2961 return -EINVAL; 2962 } 2963 ··· 2973 mddev->delta_disks = 0; 2974 module_put(pers->owner); 2975 printk(KERN_WARNING "md: %s: %s would not accept array\n", 2976 + mdname(mddev), clevel); 2977 return PTR_ERR(priv); 2978 } 2979 2980 /* Looks like we have a winner */ 2981 mddev_suspend(mddev); 2982 mddev->pers->stop(mddev); 2983 + 2984 + if (mddev->pers->sync_request == NULL && 2985 + pers->sync_request != NULL) { 2986 + /* need to add the md_redundancy_group */ 2987 + if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 2988 + printk(KERN_WARNING 2989 + "md: cannot register extra attributes for %s\n", 2990 + mdname(mddev)); 2991 + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); 2992 + } 2993 + if (mddev->pers->sync_request != NULL && 2994 + pers->sync_request == NULL) { 2995 + /* need to remove the md_redundancy_group */ 2996 + if (mddev->to_remove == NULL) 2997 + mddev->to_remove = &md_redundancy_group; 2998 + } 2999 + 3000 + if (mddev->pers->sync_request == NULL && 3001 + mddev->external) { 3002 + /* We are converting from a no-redundancy array 3003 + * to a redundancy array and metadata is managed 3004 + * externally so we need to be sure that writes 3005 + * won't block due to a need to transition 3006 + * clean->dirty 3007 + * until external management is started. 
3008 + */ 3009 + mddev->in_sync = 0; 3010 + mddev->safemode_delay = 0; 3011 + mddev->safemode = 0; 3012 + } 3013 + 3014 module_put(mddev->pers->owner); 3015 /* Invalidate devices that are now superfluous */ 3016 list_for_each_entry(rdev, &mddev->disks, same_set) ··· 2994 mddev->layout = mddev->new_layout; 2995 mddev->chunk_sectors = mddev->new_chunk_sectors; 2996 mddev->delta_disks = 0; 2997 + if (mddev->pers->sync_request == NULL) { 2998 + /* this is now an array without redundancy, so 2999 + * it must always be in_sync 3000 + */ 3001 + mddev->in_sync = 1; 3002 + del_timer_sync(&mddev->safemode_timer); 3003 + } 3004 pers->run(mddev); 3005 mddev_resume(mddev); 3006 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3007 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3008 md_wakeup_thread(mddev->thread); 3009 + sysfs_notify(&mddev->kobj, NULL, "level"); 3010 + md_new_event(mddev); 3011 return rv; 3012 } 3013 ··· 3237 } 3238 3239 static int do_md_stop(mddev_t * mddev, int ro, int is_open); 3240 + static int md_set_readonly(mddev_t * mddev, int is_open); 3241 static int do_md_run(mddev_t * mddev); 3242 static int restart_array(mddev_t *mddev); 3243 ··· 3267 break; /* not supported yet */ 3268 case readonly: 3269 if (mddev->pers) 3270 + err = md_set_readonly(mddev, 0); 3271 else { 3272 mddev->ro = 1; 3273 set_disk_ro(mddev->gendisk, 1); ··· 3277 case read_auto: 3278 if (mddev->pers) { 3279 if (mddev->ro == 0) 3280 + err = md_set_readonly(mddev, 0); 3281 else if (mddev->ro == 1) 3282 err = restart_array(mddev); 3283 if (err == 0) { ··· 4082 { 4083 mddev_t *mddev = container_of(ws, mddev_t, del_work); 4084 4085 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4086 kobject_del(&mddev->kobj); 4087 kobject_put(&mddev->kobj); ··· 4234 4235 static int start_dirty_degraded; 4236 4237 + static int md_run(mddev_t *mddev) 4238 { 4239 int err; 4240 mdk_rdev_t *rdev; 4241 struct mdk_personality *pers; 4242 4243 if (list_empty(&mddev->disks)) ··· 4247 4248 if (mddev->pers) 4249 return -EBUSY; 4250 + 4251 + /* These two calls synchronise us with the 4252 + * sysfs_remove_group calls in mddev_unlock, 4253 + * so they must have completed. 
4254 + */ 4255 + mutex_lock(&mddev->open_mutex); 4256 + mutex_unlock(&mddev->open_mutex); 4257 4258 /* 4259 * Analyze all RAID superblock(s) ··· 4295 } 4296 sysfs_notify_dirent(rdev->sysfs_state); 4297 } 4298 4299 spin_lock(&pers_lock); 4300 pers = find_pers(mddev->level, mddev->clevel); ··· 4425 if (mddev->flags) 4426 md_update_sb(mddev, 0); 4427 4428 md_wakeup_thread(mddev->thread); 4429 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4430 4431 md_new_event(mddev); 4432 sysfs_notify_dirent(mddev->sysfs_state); 4433 if (mddev->sysfs_action) 4434 sysfs_notify_dirent(mddev->sysfs_action); 4435 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4436 return 0; 4437 + } 4438 + 4439 + static int do_md_run(mddev_t *mddev) 4440 + { 4441 + int err; 4442 + 4443 + err = md_run(mddev); 4444 + if (err) 4445 + goto out; 4446 + 4447 + set_capacity(mddev->gendisk, mddev->array_sectors); 4448 + revalidate_disk(mddev->gendisk); 4449 + kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4450 + out: 4451 + return err; 4452 } 4453 4454 static int restart_array(mddev_t *mddev) ··· 4491 spin_unlock(&inode->i_lock); 4492 } 4493 4494 + static void md_clean(mddev_t *mddev) 4495 + { 4496 + mddev->array_sectors = 0; 4497 + mddev->external_size = 0; 4498 + mddev->dev_sectors = 0; 4499 + mddev->raid_disks = 0; 4500 + mddev->recovery_cp = 0; 4501 + mddev->resync_min = 0; 4502 + mddev->resync_max = MaxSector; 4503 + mddev->reshape_position = MaxSector; 4504 + mddev->external = 0; 4505 + mddev->persistent = 0; 4506 + mddev->level = LEVEL_NONE; 4507 + mddev->clevel[0] = 0; 4508 + mddev->flags = 0; 4509 + mddev->ro = 0; 4510 + mddev->metadata_type[0] = 0; 4511 + mddev->chunk_sectors = 0; 4512 + mddev->ctime = mddev->utime = 0; 4513 + mddev->layout = 0; 4514 + mddev->max_disks = 0; 4515 + mddev->events = 0; 4516 + mddev->can_decrease_events = 0; 4517 + mddev->delta_disks = 0; 4518 + mddev->new_level = LEVEL_NONE; 4519 + mddev->new_layout = 0; 4520 + mddev->new_chunk_sectors = 0; 4521 + mddev->curr_resync = 0; 4522 + mddev->resync_mismatches = 0; 4523 + mddev->suspend_lo = mddev->suspend_hi = 0; 4524 + mddev->sync_speed_min = mddev->sync_speed_max = 0; 4525 + mddev->recovery = 0; 4526 + mddev->in_sync = 0; 4527 + mddev->degraded = 0; 4528 + mddev->barriers_work = 0; 4529 + mddev->safemode = 0; 4530 + mddev->bitmap_info.offset = 0; 4531 + mddev->bitmap_info.default_offset = 0; 4532 + mddev->bitmap_info.chunksize = 0; 4533 + mddev->bitmap_info.daemon_sleep = 0; 4534 + mddev->bitmap_info.max_write_behind = 0; 4535 + } 4536 + 4537 + static void md_stop_writes(mddev_t *mddev) 4538 + { 4539 + if (mddev->sync_thread) { 4540 + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4541 + set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4542 + md_unregister_thread(mddev->sync_thread); 4543 + mddev->sync_thread = NULL; 4544 + } 4545 + 4546 + del_timer_sync(&mddev->safemode_timer); 4547 + 4548 + bitmap_flush(mddev); 4549 + md_super_wait(mddev); 4550 + 4551 + if (!mddev->in_sync || mddev->flags) { 4552 + /* mark array as shutdown cleanly */ 4553 + mddev->in_sync = 1; 4554 + md_update_sb(mddev, 1); 4555 + } 4556 + } 4557 + 4558 + static void md_stop(mddev_t *mddev) 4559 + { 4560 + md_stop_writes(mddev); 4561 + 4562 + mddev->pers->stop(mddev); 4563 + if (mddev->pers->sync_request && mddev->to_remove == NULL) 4564 + mddev->to_remove = &md_redundancy_group; 4565 + module_put(mddev->pers->owner); 4566 + mddev->pers = NULL; 4567 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4568 + } 4569 + 4570 + static int 
md_set_readonly(mddev_t *mddev, int is_open) 4571 + { 4572 + int err = 0; 4573 + mutex_lock(&mddev->open_mutex); 4574 + if (atomic_read(&mddev->openers) > is_open) { 4575 + printk("md: %s still in use.\n",mdname(mddev)); 4576 + err = -EBUSY; 4577 + goto out; 4578 + } 4579 + if (mddev->pers) { 4580 + md_stop_writes(mddev); 4581 + 4582 + err = -ENXIO; 4583 + if (mddev->ro==1) 4584 + goto out; 4585 + mddev->ro = 1; 4586 + set_disk_ro(mddev->gendisk, 1); 4587 + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4588 + sysfs_notify_dirent(mddev->sysfs_state); 4589 + err = 0; 4590 + } 4591 + out: 4592 + mutex_unlock(&mddev->open_mutex); 4593 + return err; 4594 + } 4595 + 4596 /* mode: 4597 * 0 - completely stop and dis-assemble array 4598 * 2 - stop but do not disassemble array 4599 */ 4600 static int do_md_stop(mddev_t * mddev, int mode, int is_open) ··· 4508 err = -EBUSY; 4509 } else if (mddev->pers) { 4510 4511 + if (mddev->ro) 4512 + set_disk_ro(disk, 0); 4513 4514 + md_stop(mddev); 4515 + mddev->queue->merge_bvec_fn = NULL; 4516 + mddev->queue->unplug_fn = NULL; 4517 + mddev->queue->backing_dev_info.congested_fn = NULL; 4518 4519 + /* tell userspace to handle 'inactive' */ 4520 + sysfs_notify_dirent(mddev->sysfs_state); 4521 4522 + list_for_each_entry(rdev, &mddev->disks, same_set) 4523 + if (rdev->raid_disk >= 0) { 4524 + char nm[20]; 4525 + sprintf(nm, "rd%d", rdev->raid_disk); 4526 + sysfs_remove_link(&mddev->kobj, nm); 4527 + } 4528 4529 + set_capacity(disk, 0); 4530 + revalidate_disk(disk); 4531 4532 + if (mddev->ro) 4533 + mddev->ro = 0; 4534 + 4535 err = 0; 4536 } 4537 mutex_unlock(&mddev->open_mutex); 4538 if (err) 4539 return err; ··· 4586 4587 export_array(mddev); 4588 4589 + md_clean(mddev); 4590 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4591 if (mddev->hold_active == UNTIL_STOP) 4592 mddev->hold_active = 0; 4593 4594 + } 4595 err = 0; 4596 blk_integrity_unregister(disk); 4597 md_new_event(mddev); ··· 5349 if (mddev->pers->check_reshape == NULL) 5350 return -EINVAL; 5351 if (raid_disks <= 0 || 5352 + (mddev->max_disks && raid_disks >= mddev->max_disks)) 5353 return -EINVAL; 5354 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5355 return -EBUSY; ··· 5486 5487 geo->heads = 2; 5488 geo->sectors = 4; 5489 + geo->cylinders = mddev->array_sectors / 8; 5490 return 0; 5491 } 5492 ··· 5496 int err = 0; 5497 void __user *argp = (void __user *)arg; 5498 mddev_t *mddev = NULL; 5499 + int ro; 5500 5501 if (!capable(CAP_SYS_ADMIN)) 5502 return -EACCES; ··· 5628 goto done_unlock; 5629 5630 case STOP_ARRAY_RO: 5631 + err = md_set_readonly(mddev, 1); 5632 goto done_unlock; 5633 5634 + case BLKROSET: 5635 + if (get_user(ro, (int __user *)(arg))) { 5636 + err = -EFAULT; 5637 + goto done_unlock; 5638 + } 5639 + err = -EINVAL; 5640 + 5641 + /* if the bdev is going readonly the value of mddev->ro 5642 + * does not matter, no writes are coming 5643 + */ 5644 + if (ro) 5645 + goto done_unlock; 5646 + 5647 + /* are we are already prepared for writes? 
*/ 5648 + if (mddev->ro != 1) 5649 + goto done_unlock; 5650 + 5651 + /* transitioning to readauto need only happen for 5652 + * arrays that call md_write_start 5653 + */ 5654 + if (mddev->pers) { 5655 + err = restart_array(mddev); 5656 + if (err == 0) { 5657 + mddev->ro = 2; 5658 + set_disk_ro(mddev->gendisk, 0); 5659 + } 5660 + } 5661 + goto done_unlock; 5662 } 5663 5664 /* ··· 5751 atomic_inc(&mddev->openers); 5752 mutex_unlock(&mddev->open_mutex); 5753 5754 out: 5755 return err; 5756 } ··· 5766 5767 return 0; 5768 } 5769 static const struct block_device_operations md_fops = 5770 { 5771 .owner = THIS_MODULE, ··· 5791 .compat_ioctl = md_compat_ioctl, 5792 #endif 5793 .getgeo = md_getgeo, 5794 }; 5795 5796 static int md_thread(void * arg) ··· 5906 mddev->pers->error_handler(mddev,rdev); 5907 if (mddev->degraded) 5908 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5909 + sysfs_notify_dirent(rdev->sysfs_state); 5910 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5911 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5912 md_wakeup_thread(mddev->thread); ··· 6898 if (mddev->flags) 6899 md_update_sb(mddev, 0); 6900 6901 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 6902 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 6903 /* resync/recovery still happening */ ··· 7039 * appears to still be in use. Hence 7040 * the '100'. 7041 */ 7042 + md_set_readonly(mddev, 100); 7043 mddev_unlock(mddev); 7044 } 7045 /*
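Among the md.c changes above, the even/odd event-count convention is replaced by an explicit can_decrease_events flag: if the last event bump was a clean->dirty transition that was never written to the spares, the next dirty->clean transition may roll the counter back instead of writing again. A small standalone sketch of that decision (simplified, with an illustrative struct in place of mddev_t):

#include <stdio.h>

struct array_state {
	unsigned long long events;
	int can_decrease_events;  /* last bump skipped the spares */
	int in_sync;
};

/* nospares: this update need not be written to spare devices */
static void update_events(struct array_state *a, int nospares)
{
	if (nospares && a->in_sync && a->can_decrease_events && a->events != 1) {
		a->events--;                 /* roll back the earlier clean->dirty bump */
		a->can_decrease_events = 0;
	} else {
		a->events++;
		a->can_decrease_events = nospares;
	}
}

int main(void)
{
	struct array_state a = { .events = 10, .in_sync = 0 };
	update_events(&a, 1);   /* going dirty; spares not updated */
	a.in_sync = 1;
	update_events(&a, 1);   /* clean again: counter rolls back to 10 */
	printf("events=%llu can_decrease=%d\n", a.events, a.can_decrease_events);
	return 0;
}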
+8 -8
drivers/md/md.h
··· 74 #define Blocked 8 /* An error occured on an externally 75 * managed array, don't allow writes 76 * until it is cleared */ 77 - #define StateChanged 9 /* Faulty or Blocked has changed during 78 - * interrupt, so it needs to be 79 - * notified by the thread */ 80 wait_queue_head_t blocked_wait; 81 82 int desc_nr; /* descriptor index in the superblock */ ··· 150 int external_size; /* size managed 151 * externally */ 152 __u64 events; 153 154 char uuid[16]; 155 ··· 243 atomic_t active; /* general refcount */ 244 atomic_t openers; /* number of active opens */ 245 246 - int changed; /* true if we might need to reread partition info */ 247 int degraded; /* whether md should consider 248 * adding a spare 249 */ ··· 281 atomic_t writes_pending; 282 struct request_queue *queue; /* for plugging ... */ 283 284 - atomic_t write_behind; /* outstanding async IO */ 285 - unsigned int max_write_behind; /* 0 = sync */ 286 - 287 struct bitmap *bitmap; /* the bitmap for the device */ 288 struct { 289 struct file *file; /* the bitmap file */ ··· 304 atomic_t max_corr_read_errors; /* max read retries */ 305 struct list_head all_mddevs; 306 307 /* Generic barrier handling. 308 * If there is a pending barrier request, all other 309 * writes are blocked while the devices are flushed. ··· 336 int level; 337 struct list_head list; 338 struct module *owner; 339 - int (*make_request)(struct request_queue *q, struct bio *bio); 340 int (*run)(mddev_t *mddev); 341 int (*stop)(mddev_t *mddev); 342 void (*status)(struct seq_file *seq, mddev_t *mddev);
··· 74 #define Blocked 8 /* An error occured on an externally 75 * managed array, don't allow writes 76 * until it is cleared */ 77 wait_queue_head_t blocked_wait; 78 79 int desc_nr; /* descriptor index in the superblock */ ··· 153 int external_size; /* size managed 154 * externally */ 155 __u64 events; 156 + /* If the last 'event' was simply a clean->dirty transition, and 157 + * we didn't write it to the spares, then it is safe and simple 158 + * to just decrement the event count on a dirty->clean transition. 159 + * So we record that possibility here. 160 + */ 161 + int can_decrease_events; 162 163 char uuid[16]; 164 ··· 240 atomic_t active; /* general refcount */ 241 atomic_t openers; /* number of active opens */ 242 243 int degraded; /* whether md should consider 244 * adding a spare 245 */ ··· 279 atomic_t writes_pending; 280 struct request_queue *queue; /* for plugging ... */ 281 282 struct bitmap *bitmap; /* the bitmap for the device */ 283 struct { 284 struct file *file; /* the bitmap file */ ··· 305 atomic_t max_corr_read_errors; /* max read retries */ 306 struct list_head all_mddevs; 307 308 + struct attribute_group *to_remove; 309 /* Generic barrier handling. 310 * If there is a pending barrier request, all other 311 * writes are blocked while the devices are flushed. ··· 336 int level; 337 struct list_head list; 338 struct module *owner; 339 + int (*make_request)(mddev_t *mddev, struct bio *bio); 340 int (*run)(mddev_t *mddev); 341 int (*stop)(mddev_t *mddev); 342 void (*status)(struct seq_file *seq, mddev_t *mddev);
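md.h above adds the to_remove pointer that mddev_unlock (in the md.c hunk) uses to defer sysfs group removal: under reconfig_mutex only the group to drop is recorded, and the removal itself runs at unlock time under open_mutex, so a reader of those attributes can never deadlock against reconfig_mutex. A hedged pthread sketch of that defer-then-remove pattern; the lock names follow the patch, everything else is invented for illustration:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t reconfig_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t open_mutex = PTHREAD_MUTEX_INITIALIZER;
static const char *to_remove;   /* attribute group queued for removal, or NULL */

static void mark_group_for_removal(const char *group)
{
	/* called with reconfig_mutex held; the removal itself is deferred */
	if (to_remove == NULL)
		to_remove = group;
}

static void device_unlock(void)
{
	if (to_remove) {
		const char *group = to_remove;
		to_remove = NULL;
		pthread_mutex_lock(&open_mutex);        /* allowed nesting order */
		pthread_mutex_unlock(&reconfig_mutex);  /* drop the config lock first */
		printf("removing attribute group '%s'\n", group);
		pthread_mutex_unlock(&open_mutex);
	} else {
		pthread_mutex_unlock(&reconfig_mutex);
	}
}

int main(void)
{
	pthread_mutex_lock(&reconfig_mutex);
	mark_group_for_removal("md_redundancy_group");
	device_unlock();
	return 0;
}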
+2 -11
drivers/md/multipath.c
··· 85 static void multipath_end_request(struct bio *bio, int error) 86 { 87 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 88 - struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); 89 multipath_conf_t *conf = mp_bh->mddev->private; 90 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; 91 ··· 136 } 137 138 139 - static int multipath_make_request (struct request_queue *q, struct bio * bio) 140 { 141 - mddev_t *mddev = q->queuedata; 142 multipath_conf_t *conf = mddev->private; 143 struct multipath_bh * mp_bh; 144 struct multipath_info *multipath; 145 - const int rw = bio_data_dir(bio); 146 - int cpu; 147 148 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 149 md_barrier_request(mddev, bio); ··· 151 152 mp_bh->master_bio = bio; 153 mp_bh->mddev = mddev; 154 - 155 - cpu = part_stat_lock(); 156 - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 157 - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 158 - bio_sectors(bio)); 159 - part_stat_unlock(); 160 161 mp_bh->path = multipath_map(conf); 162 if (mp_bh->path < 0) {
··· 85 static void multipath_end_request(struct bio *bio, int error) 86 { 87 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 88 + struct multipath_bh *mp_bh = bio->bi_private; 89 multipath_conf_t *conf = mp_bh->mddev->private; 90 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; 91 ··· 136 } 137 138 139 + static int multipath_make_request(mddev_t *mddev, struct bio * bio) 140 { 141 multipath_conf_t *conf = mddev->private; 142 struct multipath_bh * mp_bh; 143 struct multipath_info *multipath; 144 145 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 146 md_barrier_request(mddev, bio); ··· 154 155 mp_bh->master_bio = bio; 156 mp_bh->mddev = mddev; 157 158 mp_bh->path = multipath_map(conf); 159 if (mp_bh->path < 0) {
+190 -61
drivers/md/raid0.c
··· 23 #include <linux/slab.h> 24 #include "md.h" 25 #include "raid0.h" 26 27 static void raid0_unplug(struct request_queue *q) 28 { 29 mddev_t *mddev = q->queuedata; 30 raid0_conf_t *conf = mddev->private; 31 mdk_rdev_t **devlist = conf->devlist; 32 int i; 33 34 - for (i=0; i<mddev->raid_disks; i++) { 35 struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev); 36 37 blk_unplug(r_queue); ··· 45 mddev_t *mddev = data; 46 raid0_conf_t *conf = mddev->private; 47 mdk_rdev_t **devlist = conf->devlist; 48 int i, ret = 0; 49 50 if (mddev_congested(mddev, bits)) 51 return 1; 52 53 - for (i = 0; i < mddev->raid_disks && !ret ; i++) { 54 struct request_queue *q = bdev_get_queue(devlist[i]->bdev); 55 56 ret |= bdi_congested(&q->backing_dev_info, bits); ··· 69 sector_t zone_start = 0; 70 char b[BDEVNAME_SIZE]; 71 raid0_conf_t *conf = mddev->private; 72 printk(KERN_INFO "******* %s configuration *********\n", 73 mdname(mddev)); 74 h = 0; 75 for (j = 0; j < conf->nr_strip_zones; j++) { 76 printk(KERN_INFO "zone%d=[", j); 77 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 78 - printk("%s/", 79 - bdevname(conf->devlist[j*mddev->raid_disks 80 + k]->bdev, b)); 81 - printk("]\n"); 82 83 zone_size = conf->strip_zone[j].zone_end - zone_start; 84 printk(KERN_INFO " zone offset=%llukb " ··· 92 printk(KERN_INFO "**********************************\n\n"); 93 } 94 95 - static int create_strip_zones(mddev_t *mddev) 96 { 97 int i, c, err; 98 sector_t curr_zone_end, sectors; ··· 105 if (!conf) 106 return -ENOMEM; 107 list_for_each_entry(rdev1, &mddev->disks, same_set) { 108 - printk(KERN_INFO "raid0: looking at %s\n", 109 - bdevname(rdev1->bdev,b)); 110 c = 0; 111 112 /* round size to chunk_size */ ··· 116 rdev1->sectors = sectors * mddev->chunk_sectors; 117 118 list_for_each_entry(rdev2, &mddev->disks, same_set) { 119 - printk(KERN_INFO "raid0: comparing %s(%llu)", 120 bdevname(rdev1->bdev,b), 121 (unsigned long long)rdev1->sectors); 122 - printk(KERN_INFO " with %s(%llu)\n", 123 bdevname(rdev2->bdev,b), 124 (unsigned long long)rdev2->sectors); 125 if (rdev2 == rdev1) { 126 - printk(KERN_INFO "raid0: END\n"); 127 break; 128 } 129 if (rdev2->sectors == rdev1->sectors) { ··· 133 * Not unique, don't count it as a new 134 * group 135 */ 136 - printk(KERN_INFO "raid0: EQUAL\n"); 137 c = 1; 138 break; 139 } 140 - printk(KERN_INFO "raid0: NOT EQUAL\n"); 141 } 142 if (!c) { 143 - printk(KERN_INFO "raid0: ==> UNIQUE\n"); 144 conf->nr_strip_zones++; 145 - printk(KERN_INFO "raid0: %d zones\n", 146 - conf->nr_strip_zones); 147 } 148 } 149 - printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones); 150 err = -ENOMEM; 151 conf->strip_zone = kzalloc(sizeof(struct strip_zone)* 152 conf->nr_strip_zones, GFP_KERNEL); ··· 173 list_for_each_entry(rdev1, &mddev->disks, same_set) { 174 int j = rdev1->raid_disk; 175 176 if (j < 0 || j >= mddev->raid_disks) { 177 - printk(KERN_ERR "raid0: bad disk number %d - " 178 - "aborting!\n", j); 179 goto abort; 180 } 181 if (dev[j]) { 182 - printk(KERN_ERR "raid0: multiple devices for %d - " 183 - "aborting!\n", j); 184 goto abort; 185 } 186 dev[j] = rdev1; ··· 206 cnt++; 207 } 208 if (cnt != mddev->raid_disks) { 209 - printk(KERN_ERR "raid0: too few disks (%d of %d) - " 210 - "aborting!\n", cnt, mddev->raid_disks); 211 goto abort; 212 } 213 zone->nb_dev = cnt; ··· 223 zone = conf->strip_zone + i; 224 dev = conf->devlist + i * mddev->raid_disks; 225 226 - printk(KERN_INFO "raid0: zone %d\n", i); 227 zone->dev_start = smallest->sectors; 228 smallest = NULL; 229 c = 0; 230 231 for 
(j=0; j<cnt; j++) { 232 rdev = conf->devlist[j]; 233 - printk(KERN_INFO "raid0: checking %s ...", 234 - bdevname(rdev->bdev, b)); 235 if (rdev->sectors <= zone->dev_start) { 236 - printk(KERN_INFO " nope.\n"); 237 continue; 238 } 239 - printk(KERN_INFO " contained as device %d\n", c); 240 dev[c] = rdev; 241 c++; 242 if (!smallest || rdev->sectors < smallest->sectors) { 243 smallest = rdev; 244 - printk(KERN_INFO " (%llu) is smallest!.\n", 245 - (unsigned long long)rdev->sectors); 246 } 247 } 248 249 zone->nb_dev = c; 250 sectors = (smallest->sectors - zone->dev_start) * c; 251 - printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n", 252 - zone->nb_dev, (unsigned long long)sectors); 253 254 curr_zone_end += sectors; 255 zone->zone_end = curr_zone_end; 256 257 - printk(KERN_INFO "raid0: current zone start: %llu\n", 258 - (unsigned long long)smallest->sectors); 259 } 260 mddev->queue->unplug_fn = raid0_unplug; 261 mddev->queue->backing_dev_info.congested_fn = raid0_congested; ··· 271 * chunk size is a multiple of that sector size 272 */ 273 if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { 274 - printk(KERN_ERR "%s chunk_size of %d not valid\n", 275 mdname(mddev), 276 mddev->chunk_sectors << 9); 277 goto abort; ··· 281 blk_queue_io_opt(mddev->queue, 282 (mddev->chunk_sectors << 9) * mddev->raid_disks); 283 284 - printk(KERN_INFO "raid0: done.\n"); 285 - mddev->private = conf; 286 return 0; 287 abort: 288 kfree(conf->strip_zone); 289 kfree(conf->devlist); 290 kfree(conf); 291 - mddev->private = NULL; 292 return err; 293 } 294 ··· 340 341 static int raid0_run(mddev_t *mddev) 342 { 343 int ret; 344 345 if (mddev->chunk_sectors == 0) { 346 - printk(KERN_ERR "md/raid0: chunk size must be set.\n"); 347 return -EINVAL; 348 } 349 if (md_check_no_bitmap(mddev)) ··· 353 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); 354 mddev->queue->queue_lock = &mddev->queue->__queue_lock; 355 356 - ret = create_strip_zones(mddev); 357 - if (ret < 0) 358 - return ret; 359 360 /* calculate array device size */ 361 md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); 362 363 - printk(KERN_INFO "raid0 : md_size is %llu sectors.\n", 364 - (unsigned long long)mddev->array_sectors); 365 /* calculate the max read-ahead size. 366 * For read-ahead of large files to be effective, we need to 367 * readahead at least twice a whole stripe. i.e. 
number of devices ··· 437 unsigned int sect_in_chunk; 438 sector_t chunk; 439 raid0_conf_t *conf = mddev->private; 440 unsigned int chunk_sects = mddev->chunk_sectors; 441 442 if (is_power_of_2(chunk_sects)) { ··· 460 * + the position in the chunk 461 */ 462 *sector_offset = (chunk * chunk_sects) + sect_in_chunk; 463 - return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks 464 + sector_div(sector, zone->nb_dev)]; 465 } 466 ··· 480 } 481 } 482 483 - static int raid0_make_request(struct request_queue *q, struct bio *bio) 484 { 485 - mddev_t *mddev = q->queuedata; 486 unsigned int chunk_sects; 487 sector_t sector_offset; 488 struct strip_zone *zone; 489 mdk_rdev_t *tmp_dev; 490 - const int rw = bio_data_dir(bio); 491 - int cpu; 492 493 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 494 md_barrier_request(mddev, bio); 495 return 0; 496 } 497 - 498 - cpu = part_stat_lock(); 499 - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 500 - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 501 - bio_sectors(bio)); 502 - part_stat_unlock(); 503 504 chunk_sects = mddev->chunk_sectors; 505 if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { ··· 509 else 510 bp = bio_split(bio, chunk_sects - 511 sector_div(sector, chunk_sects)); 512 - if (raid0_make_request(q, &bp->bio1)) 513 generic_make_request(&bp->bio1); 514 - if (raid0_make_request(q, &bp->bio2)) 515 generic_make_request(&bp->bio2); 516 517 bio_pair_release(bp); ··· 531 return 1; 532 533 bad_map: 534 - printk("raid0_make_request bug: can't convert block across chunks" 535 - " or bigger than %dk %llu %d\n", chunk_sects / 2, 536 - (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 537 538 bio_io_error(bio); 539 return 0; ··· 547 int j, k, h; 548 char b[BDEVNAME_SIZE]; 549 raid0_conf_t *conf = mddev->private; 550 551 sector_t zone_size; 552 sector_t zone_start = 0; ··· 558 seq_printf(seq, "=["); 559 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 560 seq_printf(seq, "%s/", bdevname( 561 - conf->devlist[j*mddev->raid_disks + k] 562 ->bdev, b)); 563 564 zone_size = conf->strip_zone[j].zone_end - zone_start; ··· 573 return; 574 } 575 576 static struct mdk_personality raid0_personality= 577 { 578 .name = "raid0", ··· 681 .stop = raid0_stop, 682 .status = raid0_status, 683 .size = raid0_size, 684 }; 685 686 static int __init raid0_init (void)
··· 23 #include <linux/slab.h> 24 #include "md.h" 25 #include "raid0.h" 26 + #include "raid5.h" 27 28 static void raid0_unplug(struct request_queue *q) 29 { 30 mddev_t *mddev = q->queuedata; 31 raid0_conf_t *conf = mddev->private; 32 mdk_rdev_t **devlist = conf->devlist; 33 + int raid_disks = conf->strip_zone[0].nb_dev; 34 int i; 35 36 + for (i=0; i < raid_disks; i++) { 37 struct request_queue *r_queue = bdev_get_queue(devlist[i]->bdev); 38 39 blk_unplug(r_queue); ··· 43 mddev_t *mddev = data; 44 raid0_conf_t *conf = mddev->private; 45 mdk_rdev_t **devlist = conf->devlist; 46 + int raid_disks = conf->strip_zone[0].nb_dev; 47 int i, ret = 0; 48 49 if (mddev_congested(mddev, bits)) 50 return 1; 51 52 + for (i = 0; i < raid_disks && !ret ; i++) { 53 struct request_queue *q = bdev_get_queue(devlist[i]->bdev); 54 55 ret |= bdi_congested(&q->backing_dev_info, bits); ··· 66 sector_t zone_start = 0; 67 char b[BDEVNAME_SIZE]; 68 raid0_conf_t *conf = mddev->private; 69 + int raid_disks = conf->strip_zone[0].nb_dev; 70 printk(KERN_INFO "******* %s configuration *********\n", 71 mdname(mddev)); 72 h = 0; 73 for (j = 0; j < conf->nr_strip_zones; j++) { 74 printk(KERN_INFO "zone%d=[", j); 75 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 76 + printk(KERN_CONT "%s/", 77 + bdevname(conf->devlist[j*raid_disks 78 + k]->bdev, b)); 79 + printk(KERN_CONT "]\n"); 80 81 zone_size = conf->strip_zone[j].zone_end - zone_start; 82 printk(KERN_INFO " zone offset=%llukb " ··· 88 printk(KERN_INFO "**********************************\n\n"); 89 } 90 91 + static int create_strip_zones(mddev_t *mddev, raid0_conf_t **private_conf) 92 { 93 int i, c, err; 94 sector_t curr_zone_end, sectors; ··· 101 if (!conf) 102 return -ENOMEM; 103 list_for_each_entry(rdev1, &mddev->disks, same_set) { 104 + printk(KERN_INFO "md/raid0:%s: looking at %s\n", 105 + mdname(mddev), 106 + bdevname(rdev1->bdev, b)); 107 c = 0; 108 109 /* round size to chunk_size */ ··· 111 rdev1->sectors = sectors * mddev->chunk_sectors; 112 113 list_for_each_entry(rdev2, &mddev->disks, same_set) { 114 + printk(KERN_INFO "md/raid0:%s: comparing %s(%llu)", 115 + mdname(mddev), 116 bdevname(rdev1->bdev,b), 117 (unsigned long long)rdev1->sectors); 118 + printk(KERN_CONT " with %s(%llu)\n", 119 bdevname(rdev2->bdev,b), 120 (unsigned long long)rdev2->sectors); 121 if (rdev2 == rdev1) { 122 + printk(KERN_INFO "md/raid0:%s: END\n", 123 + mdname(mddev)); 124 break; 125 } 126 if (rdev2->sectors == rdev1->sectors) { ··· 126 * Not unique, don't count it as a new 127 * group 128 */ 129 + printk(KERN_INFO "md/raid0:%s: EQUAL\n", 130 + mdname(mddev)); 131 c = 1; 132 break; 133 } 134 + printk(KERN_INFO "md/raid0:%s: NOT EQUAL\n", 135 + mdname(mddev)); 136 } 137 if (!c) { 138 + printk(KERN_INFO "md/raid0:%s: ==> UNIQUE\n", 139 + mdname(mddev)); 140 conf->nr_strip_zones++; 141 + printk(KERN_INFO "md/raid0:%s: %d zones\n", 142 + mdname(mddev), conf->nr_strip_zones); 143 } 144 } 145 + printk(KERN_INFO "md/raid0:%s: FINAL %d zones\n", 146 + mdname(mddev), conf->nr_strip_zones); 147 err = -ENOMEM; 148 conf->strip_zone = kzalloc(sizeof(struct strip_zone)* 149 conf->nr_strip_zones, GFP_KERNEL); ··· 162 list_for_each_entry(rdev1, &mddev->disks, same_set) { 163 int j = rdev1->raid_disk; 164 165 + if (mddev->level == 10) 166 + /* taking over a raid10-n2 array */ 167 + j /= 2; 168 + 169 if (j < 0 || j >= mddev->raid_disks) { 170 + printk(KERN_ERR "md/raid0:%s: bad disk number %d - " 171 + "aborting!\n", mdname(mddev), j); 172 goto abort; 173 } 174 if (dev[j]) { 175 + printk(KERN_ERR 
"md/raid0:%s: multiple devices for %d - " 176 + "aborting!\n", mdname(mddev), j); 177 goto abort; 178 } 179 dev[j] = rdev1; ··· 191 cnt++; 192 } 193 if (cnt != mddev->raid_disks) { 194 + printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " 195 + "aborting!\n", mdname(mddev), cnt, mddev->raid_disks); 196 goto abort; 197 } 198 zone->nb_dev = cnt; ··· 208 zone = conf->strip_zone + i; 209 dev = conf->devlist + i * mddev->raid_disks; 210 211 + printk(KERN_INFO "md/raid0:%s: zone %d\n", 212 + mdname(mddev), i); 213 zone->dev_start = smallest->sectors; 214 smallest = NULL; 215 c = 0; 216 217 for (j=0; j<cnt; j++) { 218 rdev = conf->devlist[j]; 219 + printk(KERN_INFO "md/raid0:%s: checking %s ...", 220 + mdname(mddev), 221 + bdevname(rdev->bdev, b)); 222 if (rdev->sectors <= zone->dev_start) { 223 + printk(KERN_CONT " nope.\n"); 224 continue; 225 } 226 + printk(KERN_CONT " contained as device %d\n", c); 227 dev[c] = rdev; 228 c++; 229 if (!smallest || rdev->sectors < smallest->sectors) { 230 smallest = rdev; 231 + printk(KERN_INFO "md/raid0:%s: (%llu) is smallest!.\n", 232 + mdname(mddev), 233 + (unsigned long long)rdev->sectors); 234 } 235 } 236 237 zone->nb_dev = c; 238 sectors = (smallest->sectors - zone->dev_start) * c; 239 + printk(KERN_INFO "md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n", 240 + mdname(mddev), 241 + zone->nb_dev, (unsigned long long)sectors); 242 243 curr_zone_end += sectors; 244 zone->zone_end = curr_zone_end; 245 246 + printk(KERN_INFO "md/raid0:%s: current zone start: %llu\n", 247 + mdname(mddev), 248 + (unsigned long long)smallest->sectors); 249 } 250 mddev->queue->unplug_fn = raid0_unplug; 251 mddev->queue->backing_dev_info.congested_fn = raid0_congested; ··· 251 * chunk size is a multiple of that sector size 252 */ 253 if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { 254 + printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n", 255 mdname(mddev), 256 mddev->chunk_sectors << 9); 257 goto abort; ··· 261 blk_queue_io_opt(mddev->queue, 262 (mddev->chunk_sectors << 9) * mddev->raid_disks); 263 264 + printk(KERN_INFO "md/raid0:%s: done.\n", mdname(mddev)); 265 + *private_conf = conf; 266 + 267 return 0; 268 abort: 269 kfree(conf->strip_zone); 270 kfree(conf->devlist); 271 kfree(conf); 272 + *private_conf = NULL; 273 return err; 274 } 275 ··· 319 320 static int raid0_run(mddev_t *mddev) 321 { 322 + raid0_conf_t *conf; 323 int ret; 324 325 if (mddev->chunk_sectors == 0) { 326 + printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n", 327 + mdname(mddev)); 328 return -EINVAL; 329 } 330 if (md_check_no_bitmap(mddev)) ··· 330 blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); 331 mddev->queue->queue_lock = &mddev->queue->__queue_lock; 332 333 + /* if private is not null, we are here after takeover */ 334 + if (mddev->private == NULL) { 335 + ret = create_strip_zones(mddev, &conf); 336 + if (ret < 0) 337 + return ret; 338 + mddev->private = conf; 339 + } 340 + conf = mddev->private; 341 + if (conf->scale_raid_disks) { 342 + int i; 343 + for (i=0; i < conf->strip_zone[0].nb_dev; i++) 344 + conf->devlist[i]->raid_disk /= conf->scale_raid_disks; 345 + /* FIXME update sysfs rd links */ 346 + } 347 348 /* calculate array device size */ 349 md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); 350 351 + printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n", 352 + mdname(mddev), 353 + (unsigned long long)mddev->array_sectors); 354 /* calculate the max read-ahead size. 
355 * For read-ahead of large files to be effective, we need to 356 * readahead at least twice a whole stripe. i.e. number of devices ··· 402 unsigned int sect_in_chunk; 403 sector_t chunk; 404 raid0_conf_t *conf = mddev->private; 405 + int raid_disks = conf->strip_zone[0].nb_dev; 406 unsigned int chunk_sects = mddev->chunk_sectors; 407 408 if (is_power_of_2(chunk_sects)) { ··· 424 * + the position in the chunk 425 */ 426 *sector_offset = (chunk * chunk_sects) + sect_in_chunk; 427 + return conf->devlist[(zone - conf->strip_zone)*raid_disks 428 + sector_div(sector, zone->nb_dev)]; 429 } 430 ··· 444 } 445 } 446 447 + static int raid0_make_request(mddev_t *mddev, struct bio *bio) 448 { 449 unsigned int chunk_sects; 450 sector_t sector_offset; 451 struct strip_zone *zone; 452 mdk_rdev_t *tmp_dev; 453 454 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 455 md_barrier_request(mddev, bio); 456 return 0; 457 } 458 459 chunk_sects = mddev->chunk_sectors; 460 if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) { ··· 482 else 483 bp = bio_split(bio, chunk_sects - 484 sector_div(sector, chunk_sects)); 485 + if (raid0_make_request(mddev, &bp->bio1)) 486 generic_make_request(&bp->bio1); 487 + if (raid0_make_request(mddev, &bp->bio2)) 488 generic_make_request(&bp->bio2); 489 490 bio_pair_release(bp); ··· 504 return 1; 505 506 bad_map: 507 + printk("md/raid0:%s: make_request bug: can't convert block across chunks" 508 + " or bigger than %dk %llu %d\n", 509 + mdname(mddev), chunk_sects / 2, 510 + (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 511 512 bio_io_error(bio); 513 return 0; ··· 519 int j, k, h; 520 char b[BDEVNAME_SIZE]; 521 raid0_conf_t *conf = mddev->private; 522 + int raid_disks = conf->strip_zone[0].nb_dev; 523 524 sector_t zone_size; 525 sector_t zone_start = 0; ··· 529 seq_printf(seq, "=["); 530 for (k = 0; k < conf->strip_zone[j].nb_dev; k++) 531 seq_printf(seq, "%s/", bdevname( 532 + conf->devlist[j*raid_disks + k] 533 ->bdev, b)); 534 535 zone_size = conf->strip_zone[j].zone_end - zone_start; ··· 544 return; 545 } 546 547 + static void *raid0_takeover_raid5(mddev_t *mddev) 548 + { 549 + mdk_rdev_t *rdev; 550 + raid0_conf_t *priv_conf; 551 + 552 + if (mddev->degraded != 1) { 553 + printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! 
Degraded disks: %d\n", 554 + mdname(mddev), 555 + mddev->degraded); 556 + return ERR_PTR(-EINVAL); 557 + } 558 + 559 + list_for_each_entry(rdev, &mddev->disks, same_set) { 560 + /* check slot number for a disk */ 561 + if (rdev->raid_disk == mddev->raid_disks-1) { 562 + printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", 563 + mdname(mddev)); 564 + return ERR_PTR(-EINVAL); 565 + } 566 + } 567 + 568 + /* Set new parameters */ 569 + mddev->new_level = 0; 570 + mddev->new_chunk_sectors = mddev->chunk_sectors; 571 + mddev->raid_disks--; 572 + mddev->delta_disks = -1; 573 + /* make sure it will be not marked as dirty */ 574 + mddev->recovery_cp = MaxSector; 575 + 576 + create_strip_zones(mddev, &priv_conf); 577 + return priv_conf; 578 + } 579 + 580 + static void *raid0_takeover_raid10(mddev_t *mddev) 581 + { 582 + raid0_conf_t *priv_conf; 583 + 584 + /* Check layout: 585 + * - far_copies must be 1 586 + * - near_copies must be 2 587 + * - disks number must be even 588 + * - all mirrors must be already degraded 589 + */ 590 + if (mddev->layout != ((1 << 8) + 2)) { 591 + printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover layout: 0x%x\n", 592 + mdname(mddev), 593 + mddev->layout); 594 + return ERR_PTR(-EINVAL); 595 + } 596 + if (mddev->raid_disks & 1) { 597 + printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n", 598 + mdname(mddev)); 599 + return ERR_PTR(-EINVAL); 600 + } 601 + if (mddev->degraded != (mddev->raid_disks>>1)) { 602 + printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n", 603 + mdname(mddev)); 604 + return ERR_PTR(-EINVAL); 605 + } 606 + 607 + /* Set new parameters */ 608 + mddev->new_level = 0; 609 + mddev->new_chunk_sectors = mddev->chunk_sectors; 610 + mddev->delta_disks = - mddev->raid_disks / 2; 611 + mddev->raid_disks += mddev->delta_disks; 612 + mddev->degraded = 0; 613 + /* make sure it will be not marked as dirty */ 614 + mddev->recovery_cp = MaxSector; 615 + 616 + create_strip_zones(mddev, &priv_conf); 617 + priv_conf->scale_raid_disks = 2; 618 + return priv_conf; 619 + } 620 + 621 + static void *raid0_takeover(mddev_t *mddev) 622 + { 623 + /* raid0 can take over: 624 + * raid5 - providing it is Raid4 layout and one disk is faulty 625 + * raid10 - assuming we have all necessary active disks 626 + */ 627 + if (mddev->level == 5) { 628 + if (mddev->layout == ALGORITHM_PARITY_N) 629 + return raid0_takeover_raid5(mddev); 630 + 631 + printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n", 632 + mdname(mddev), ALGORITHM_PARITY_N); 633 + } 634 + 635 + if (mddev->level == 10) 636 + return raid0_takeover_raid10(mddev); 637 + 638 + return ERR_PTR(-EINVAL); 639 + } 640 + 641 + static void raid0_quiesce(mddev_t *mddev, int state) 642 + { 643 + } 644 + 645 static struct mdk_personality raid0_personality= 646 { 647 .name = "raid0", ··· 554 .stop = raid0_stop, 555 .status = raid0_status, 556 .size = raid0_size, 557 + .takeover = raid0_takeover, 558 + .quiesce = raid0_quiesce, 559 }; 560 561 static int __init raid0_init (void)
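The takeover helpers above only rewrite the mddev geometry and then let create_strip_zones() rebuild the zone table from the surviving members. A standalone sketch of that disk-count arithmetic (made-up struct and field names; nothing below is kernel code):

/*
 * Sketch of the bookkeeping done by raid0_takeover_raid5() and
 * raid0_takeover_raid10() above, modelled on plain integers.
 */
#include <stdio.h>

struct geom {
    int level;
    int raid_disks;
    int delta_disks;
    int degraded;
};

/* raid4/raid5 with the parity disk missing: drop one member, keep chunk size */
static struct geom takeover_from_raid45(struct geom g)
{
    g.level = 0;
    g.raid_disks -= 1;      /* the absent parity disk is simply dropped */
    g.delta_disks = -1;
    g.degraded = 0;
    return g;
}

/* raid10 near-2 with one leg of every mirror failed: keep one copy per pair */
static struct geom takeover_from_raid10_n2(struct geom g)
{
    g.level = 0;
    g.delta_disks = -g.raid_disks / 2;
    g.raid_disks += g.delta_disks;
    g.degraded = 0;
    return g;
}

int main(void)
{
    struct geom a = takeover_from_raid45((struct geom){ .level = 5, .raid_disks = 4, .degraded = 1 });
    struct geom b = takeover_from_raid10_n2((struct geom){ .level = 10, .raid_disks = 6, .degraded = 3 });

    printf("raid5, 4 disks, parity missing  -> raid0 with %d disks (delta %d)\n",
           a.raid_disks, a.delta_disks);
    printf("raid10-n2, 6 disks, half failed -> raid0 with %d disks (delta %d)\n",
           b.raid_disks, b.delta_disks);
    return 0;
}

Both real paths also force recovery_cp to MaxSector so the resulting raid0 is not treated as dirty, and the raid10 path sets scale_raid_disks = 2 so raid0_run() can later compact the inherited slot numbers.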
+3
drivers/md/raid0.h
··· 13 struct strip_zone *strip_zone; 14 mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ 15 int nr_strip_zones; 16 }; 17 18 typedef struct raid0_private_data raid0_conf_t;
··· 13 struct strip_zone *strip_zone; 14 mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ 15 int nr_strip_zones; 16 + int scale_raid_disks; /* divide rdev->raid_disk by this in run() 17 + * to handle conversion from raid10 18 + */ 19 }; 20 21 typedef struct raid0_private_data raid0_conf_t;
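The new scale_raid_disks field exists only for the raid10-n2 takeover case: the surviving member of each mirror pair arrives with an even raid_disk slot, and raid0_run() divides every slot by this factor to get a dense raid0 numbering. A tiny illustration of that renumbering (the array contents are assumptions, not data from the patch):

#include <stdio.h>

int main(void)
{
    /* slots inherited from a 6-disk raid10-n2 whose odd legs have failed */
    int raid_disk[] = { 0, 2, 4 };
    int scale_raid_disks = 2;
    size_t i;

    for (i = 0; i < sizeof(raid_disk) / sizeof(raid_disk[0]); i++) {
        raid_disk[i] /= scale_raid_disks;
        printf("surviving member %zu -> raid0 slot %d\n", i, raid_disk[i]);
    }
    return 0;
}

The FIXME in raid0_run() notes that the matching sysfs rd%d links are not moved yet.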
+59 -55
drivers/md/raid1.c
··· 263 static void raid1_end_read_request(struct bio *bio, int error) 264 { 265 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 266 - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 267 int mirror; 268 conf_t *conf = r1_bio->mddev->private; 269 ··· 297 */ 298 char b[BDEVNAME_SIZE]; 299 if (printk_ratelimit()) 300 - printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", 301 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); 302 reschedule_retry(r1_bio); 303 } ··· 309 static void raid1_end_write_request(struct bio *bio, int error) 310 { 311 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 312 - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 313 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 314 conf_t *conf = r1_bio->mddev->private; 315 struct bio *to_put = NULL; ··· 419 */ 420 static int read_balance(conf_t *conf, r1bio_t *r1_bio) 421 { 422 - const unsigned long this_sector = r1_bio->sector; 423 int new_disk = conf->last_used, disk = new_disk; 424 int wonly_disk = -1; 425 const int sectors = r1_bio->sectors; ··· 435 retry: 436 if (conf->mddev->recovery_cp < MaxSector && 437 (this_sector + sectors >= conf->next_resync)) { 438 - /* Choose the first operation device, for consistancy */ 439 new_disk = 0; 440 441 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); ··· 775 return NULL; 776 } 777 778 - static int make_request(struct request_queue *q, struct bio * bio) 779 { 780 - mddev_t *mddev = q->queuedata; 781 conf_t *conf = mddev->private; 782 mirror_info_t *mirror; 783 r1bio_t *r1_bio; ··· 788 struct page **behind_pages = NULL; 789 const int rw = bio_data_dir(bio); 790 const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); 791 - int cpu; 792 bool do_barriers; 793 mdk_rdev_t *blocked_rdev; 794 ··· 833 834 bitmap = mddev->bitmap; 835 836 - cpu = part_stat_lock(); 837 - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 838 - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 839 - bio_sectors(bio)); 840 - part_stat_unlock(); 841 - 842 /* 843 * make_request() can abort the operation when READA is being 844 * used and no empty request is available. ··· 859 } 860 mirror = conf->mirrors + rdisk; 861 862 r1_bio->read_disk = rdisk; 863 864 read_bio = bio_clone(bio, GFP_NOIO); ··· 914 if (test_bit(Faulty, &rdev->flags)) { 915 rdev_dec_pending(rdev, mddev); 916 r1_bio->bios[i] = NULL; 917 - } else 918 r1_bio->bios[i] = bio; 919 - targets++; 920 } else 921 r1_bio->bios[i] = NULL; 922 } ··· 945 set_bit(R1BIO_Degraded, &r1_bio->state); 946 } 947 948 - /* do behind I/O ? 
*/ 949 if (bitmap && 950 (atomic_read(&bitmap->behind_writes) 951 < mddev->bitmap_info.max_write_behind) && 952 (behind_pages = alloc_behind_pages(bio)) != NULL) 953 set_bit(R1BIO_BehindIO, &r1_bio->state); 954 ··· 1077 } else 1078 set_bit(Faulty, &rdev->flags); 1079 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1080 - printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n" 1081 - "raid1: Operation continuing on %d devices.\n", 1082 - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1083 } 1084 1085 static void print_conf(conf_t *conf) 1086 { 1087 int i; 1088 1089 - printk("RAID1 conf printout:\n"); 1090 if (!conf) { 1091 - printk("(!conf)\n"); 1092 return; 1093 } 1094 - printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1095 conf->raid_disks); 1096 1097 rcu_read_lock(); ··· 1100 char b[BDEVNAME_SIZE]; 1101 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 1102 if (rdev) 1103 - printk(" disk %d, wo:%d, o:%d, dev:%s\n", 1104 i, !test_bit(In_sync, &rdev->flags), 1105 !test_bit(Faulty, &rdev->flags), 1106 bdevname(rdev->bdev,b)); ··· 1231 1232 static void end_sync_read(struct bio *bio, int error) 1233 { 1234 - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1235 int i; 1236 1237 for (i=r1_bio->mddev->raid_disks; i--; ) ··· 1254 static void end_sync_write(struct bio *bio, int error) 1255 { 1256 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1257 - r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1258 mddev_t *mddev = r1_bio->mddev; 1259 conf_t *conf = mddev->private; 1260 int i; ··· 1461 char b[BDEVNAME_SIZE]; 1462 /* Cannot read from anywhere, array is toast */ 1463 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1464 - printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" 1465 " for block %llu\n", 1466 - bdevname(bio->bi_bdev,b), 1467 (unsigned long long)r1_bio->sector); 1468 md_done_sync(mddev, r1_bio->sectors, 0); 1469 put_buf(r1_bio); ··· 1586 else { 1587 atomic_add(s, &rdev->corrected_errors); 1588 printk(KERN_INFO 1589 - "raid1:%s: read error corrected " 1590 "(%d sectors at %llu on %s)\n", 1591 mdname(mddev), s, 1592 (unsigned long long)(sect + ··· 1691 1692 bio = r1_bio->bios[r1_bio->read_disk]; 1693 if ((disk=read_balance(conf, r1_bio)) == -1) { 1694 - printk(KERN_ALERT "raid1: %s: unrecoverable I/O" 1695 " read error for block %llu\n", 1696 bdevname(bio->bi_bdev,b), 1697 (unsigned long long)r1_bio->sector); 1698 raid_end_bio_io(r1_bio); ··· 1707 r1_bio->bios[r1_bio->read_disk] = bio; 1708 rdev = conf->mirrors[disk].rdev; 1709 if (printk_ratelimit()) 1710 - printk(KERN_ERR "raid1: %s: redirecting sector %llu to" 1711 - " another mirror\n", 1712 - bdevname(rdev->bdev,b), 1713 - (unsigned long long)r1_bio->sector); 1714 bio->bi_sector = r1_bio->sector + rdev->data_offset; 1715 bio->bi_bdev = rdev->bdev; 1716 bio->bi_end_io = raid1_end_read_request; ··· 1766 int still_degraded = 0; 1767 1768 if (!conf->r1buf_pool) 1769 - { 1770 - /* 1771 - printk("sync start - bitmap %p\n", mddev->bitmap); 1772 - */ 1773 if (init_resync(conf)) 1774 return 0; 1775 - } 1776 1777 max_sector = mddev->dev_sectors; 1778 if (sector_nr >= max_sector) { ··· 2048 2049 err = -EIO; 2050 if (conf->last_used < 0) { 2051 - printk(KERN_ERR "raid1: no operational mirrors for %s\n", 2052 mdname(mddev)); 2053 goto abort; 2054 } ··· 2056 conf->thread = md_register_thread(raid1d, mddev, NULL); 2057 if (!conf->thread) { 2058 printk(KERN_ERR 2059 - "raid1: couldn't allocate thread for %s\n", 2060 mdname(mddev)); 2061 goto abort; 2062 } ··· 2082 mdk_rdev_t 
*rdev; 2083 2084 if (mddev->level != 1) { 2085 - printk("raid1: %s: raid level not set to mirroring (%d)\n", 2086 mdname(mddev), mddev->level); 2087 return -EIO; 2088 } 2089 if (mddev->reshape_position != MaxSector) { 2090 - printk("raid1: %s: reshape_position set but not supported\n", 2091 mdname(mddev)); 2092 return -EIO; 2093 } ··· 2130 mddev->recovery_cp = MaxSector; 2131 2132 if (mddev->recovery_cp != MaxSector) 2133 - printk(KERN_NOTICE "raid1: %s is not clean" 2134 " -- starting background reconstruction\n", 2135 mdname(mddev)); 2136 printk(KERN_INFO 2137 - "raid1: raid set %s active with %d out of %d mirrors\n", 2138 mdname(mddev), mddev->raid_disks - mddev->degraded, 2139 mddev->raid_disks); 2140 ··· 2158 { 2159 conf_t *conf = mddev->private; 2160 struct bitmap *bitmap = mddev->bitmap; 2161 - int behind_wait = 0; 2162 2163 /* wait for behind writes to complete */ 2164 - while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { 2165 - behind_wait++; 2166 - printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); 2167 - set_current_state(TASK_UNINTERRUPTIBLE); 2168 - schedule_timeout(HZ); /* wait a second */ 2169 /* need to kick something here to make sure I/O goes? */ 2170 } 2171 2172 raise_barrier(conf); ··· 2196 if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) 2197 return -EINVAL; 2198 set_capacity(mddev->gendisk, mddev->array_sectors); 2199 - mddev->changed = 1; 2200 revalidate_disk(mddev->gendisk); 2201 if (sectors > mddev->dev_sectors && 2202 mddev->recovery_cp == MaxSector) { ··· 2290 if (sysfs_create_link(&mddev->kobj, 2291 &rdev->kobj, nm)) 2292 printk(KERN_WARNING 2293 - "md/raid1: cannot register " 2294 - "%s for %s\n", 2295 - nm, mdname(mddev)); 2296 } 2297 if (rdev) 2298 newmirrors[d2++].rdev = rdev;
··· 263 static void raid1_end_read_request(struct bio *bio, int error) 264 { 265 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 266 + r1bio_t *r1_bio = bio->bi_private; 267 int mirror; 268 conf_t *conf = r1_bio->mddev->private; 269 ··· 297 */ 298 char b[BDEVNAME_SIZE]; 299 if (printk_ratelimit()) 300 + printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n", 301 + mdname(conf->mddev), 302 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); 303 reschedule_retry(r1_bio); 304 } ··· 308 static void raid1_end_write_request(struct bio *bio, int error) 309 { 310 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 311 + r1bio_t *r1_bio = bio->bi_private; 312 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 313 conf_t *conf = r1_bio->mddev->private; 314 struct bio *to_put = NULL; ··· 418 */ 419 static int read_balance(conf_t *conf, r1bio_t *r1_bio) 420 { 421 + const sector_t this_sector = r1_bio->sector; 422 int new_disk = conf->last_used, disk = new_disk; 423 int wonly_disk = -1; 424 const int sectors = r1_bio->sectors; ··· 434 retry: 435 if (conf->mddev->recovery_cp < MaxSector && 436 (this_sector + sectors >= conf->next_resync)) { 437 + /* Choose the first operational device, for consistancy */ 438 new_disk = 0; 439 440 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); ··· 774 return NULL; 775 } 776 777 + static int make_request(mddev_t *mddev, struct bio * bio) 778 { 779 conf_t *conf = mddev->private; 780 mirror_info_t *mirror; 781 r1bio_t *r1_bio; ··· 788 struct page **behind_pages = NULL; 789 const int rw = bio_data_dir(bio); 790 const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); 791 bool do_barriers; 792 mdk_rdev_t *blocked_rdev; 793 ··· 834 835 bitmap = mddev->bitmap; 836 837 /* 838 * make_request() can abort the operation when READA is being 839 * used and no empty request is available. ··· 866 } 867 mirror = conf->mirrors + rdisk; 868 869 + if (test_bit(WriteMostly, &mirror->rdev->flags) && 870 + bitmap) { 871 + /* Reading from a write-mostly device must 872 + * take care not to over-take any writes 873 + * that are 'behind' 874 + */ 875 + wait_event(bitmap->behind_wait, 876 + atomic_read(&bitmap->behind_writes) == 0); 877 + } 878 r1_bio->read_disk = rdisk; 879 880 read_bio = bio_clone(bio, GFP_NOIO); ··· 912 if (test_bit(Faulty, &rdev->flags)) { 913 rdev_dec_pending(rdev, mddev); 914 r1_bio->bios[i] = NULL; 915 + } else { 916 r1_bio->bios[i] = bio; 917 + targets++; 918 + } 919 } else 920 r1_bio->bios[i] = NULL; 921 } ··· 942 set_bit(R1BIO_Degraded, &r1_bio->state); 943 } 944 945 + /* do behind I/O ? 
946 + * Not if there are too many, or cannot allocate memory, 947 + * or a reader on WriteMostly is waiting for behind writes 948 + * to flush */ 949 if (bitmap && 950 (atomic_read(&bitmap->behind_writes) 951 < mddev->bitmap_info.max_write_behind) && 952 + !waitqueue_active(&bitmap->behind_wait) && 953 (behind_pages = alloc_behind_pages(bio)) != NULL) 954 set_bit(R1BIO_BehindIO, &r1_bio->state); 955 ··· 1070 } else 1071 set_bit(Faulty, &rdev->flags); 1072 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1073 + printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" 1074 + KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", 1075 + mdname(mddev), bdevname(rdev->bdev, b), 1076 + mdname(mddev), conf->raid_disks - mddev->degraded); 1077 } 1078 1079 static void print_conf(conf_t *conf) 1080 { 1081 int i; 1082 1083 + printk(KERN_DEBUG "RAID1 conf printout:\n"); 1084 if (!conf) { 1085 + printk(KERN_DEBUG "(!conf)\n"); 1086 return; 1087 } 1088 + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1089 conf->raid_disks); 1090 1091 rcu_read_lock(); ··· 1092 char b[BDEVNAME_SIZE]; 1093 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 1094 if (rdev) 1095 + printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", 1096 i, !test_bit(In_sync, &rdev->flags), 1097 !test_bit(Faulty, &rdev->flags), 1098 bdevname(rdev->bdev,b)); ··· 1223 1224 static void end_sync_read(struct bio *bio, int error) 1225 { 1226 + r1bio_t *r1_bio = bio->bi_private; 1227 int i; 1228 1229 for (i=r1_bio->mddev->raid_disks; i--; ) ··· 1246 static void end_sync_write(struct bio *bio, int error) 1247 { 1248 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1249 + r1bio_t *r1_bio = bio->bi_private; 1250 mddev_t *mddev = r1_bio->mddev; 1251 conf_t *conf = mddev->private; 1252 int i; ··· 1453 char b[BDEVNAME_SIZE]; 1454 /* Cannot read from anywhere, array is toast */ 1455 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1456 + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" 1457 " for block %llu\n", 1458 + mdname(mddev), 1459 + bdevname(bio->bi_bdev, b), 1460 (unsigned long long)r1_bio->sector); 1461 md_done_sync(mddev, r1_bio->sectors, 0); 1462 put_buf(r1_bio); ··· 1577 else { 1578 atomic_add(s, &rdev->corrected_errors); 1579 printk(KERN_INFO 1580 + "md/raid1:%s: read error corrected " 1581 "(%d sectors at %llu on %s)\n", 1582 mdname(mddev), s, 1583 (unsigned long long)(sect + ··· 1682 1683 bio = r1_bio->bios[r1_bio->read_disk]; 1684 if ((disk=read_balance(conf, r1_bio)) == -1) { 1685 + printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O" 1686 " read error for block %llu\n", 1687 + mdname(mddev), 1688 bdevname(bio->bi_bdev,b), 1689 (unsigned long long)r1_bio->sector); 1690 raid_end_bio_io(r1_bio); ··· 1697 r1_bio->bios[r1_bio->read_disk] = bio; 1698 rdev = conf->mirrors[disk].rdev; 1699 if (printk_ratelimit()) 1700 + printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to" 1701 + " other mirror: %s\n", 1702 + mdname(mddev), 1703 + (unsigned long long)r1_bio->sector, 1704 + bdevname(rdev->bdev,b)); 1705 bio->bi_sector = r1_bio->sector + rdev->data_offset; 1706 bio->bi_bdev = rdev->bdev; 1707 bio->bi_end_io = raid1_end_read_request; ··· 1755 int still_degraded = 0; 1756 1757 if (!conf->r1buf_pool) 1758 if (init_resync(conf)) 1759 return 0; 1760 1761 max_sector = mddev->dev_sectors; 1762 if (sector_nr >= max_sector) { ··· 2042 2043 err = -EIO; 2044 if (conf->last_used < 0) { 2045 + printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", 2046 
mdname(mddev)); 2047 goto abort; 2048 } ··· 2050 conf->thread = md_register_thread(raid1d, mddev, NULL); 2051 if (!conf->thread) { 2052 printk(KERN_ERR 2053 + "md/raid1:%s: couldn't allocate thread\n", 2054 mdname(mddev)); 2055 goto abort; 2056 } ··· 2076 mdk_rdev_t *rdev; 2077 2078 if (mddev->level != 1) { 2079 + printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", 2080 mdname(mddev), mddev->level); 2081 return -EIO; 2082 } 2083 if (mddev->reshape_position != MaxSector) { 2084 + printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n", 2085 mdname(mddev)); 2086 return -EIO; 2087 } ··· 2124 mddev->recovery_cp = MaxSector; 2125 2126 if (mddev->recovery_cp != MaxSector) 2127 + printk(KERN_NOTICE "md/raid1:%s: not clean" 2128 " -- starting background reconstruction\n", 2129 mdname(mddev)); 2130 printk(KERN_INFO 2131 + "md/raid1:%s: active with %d out of %d mirrors\n", 2132 mdname(mddev), mddev->raid_disks - mddev->degraded, 2133 mddev->raid_disks); 2134 ··· 2152 { 2153 conf_t *conf = mddev->private; 2154 struct bitmap *bitmap = mddev->bitmap; 2155 2156 /* wait for behind writes to complete */ 2157 + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { 2158 + printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", 2159 + mdname(mddev)); 2160 /* need to kick something here to make sure I/O goes? */ 2161 + wait_event(bitmap->behind_wait, 2162 + atomic_read(&bitmap->behind_writes) == 0); 2163 } 2164 2165 raise_barrier(conf); ··· 2191 if (mddev->array_sectors > raid1_size(mddev, sectors, 0)) 2192 return -EINVAL; 2193 set_capacity(mddev->gendisk, mddev->array_sectors); 2194 revalidate_disk(mddev->gendisk); 2195 if (sectors > mddev->dev_sectors && 2196 mddev->recovery_cp == MaxSector) { ··· 2286 if (sysfs_create_link(&mddev->kobj, 2287 &rdev->kobj, nm)) 2288 printk(KERN_WARNING 2289 + "md/raid1:%s: cannot register " 2290 + "%s\n", 2291 + mdname(mddev), nm); 2292 } 2293 if (rdev) 2294 newmirrors[d2++].rdev = rdev;
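The raid1 changes above do three related things with the new bitmap->behind_wait queue: raid1_stop() now sleeps on it instead of polling once a second, a read aimed at a WriteMostly member waits until behind_writes drains so it cannot overtake an in-flight behind write, and no new behind write is queued while such a reader is waiting (the !waitqueue_active() check). A minimal userspace analogue of that handshake, using pthreads rather than kernel waitqueues (every identifier below is local to the sketch):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t behind_wait = PTHREAD_COND_INITIALIZER;
static int behind_writes;    /* stands in for atomic_t bitmap->behind_writes */
static int readers_waiting;  /* stands in for waitqueue_active(&behind_wait) */

static int start_behind_write(void)
{
    int ok;
    pthread_mutex_lock(&lock);
    /* refuse new behind I/O while a reader is waiting for the queue to drain */
    ok = (readers_waiting == 0);
    if (ok)
        behind_writes++;
    pthread_mutex_unlock(&lock);
    return ok;
}

static void end_behind_write(void)
{
    pthread_mutex_lock(&lock);
    if (--behind_writes == 0)
        pthread_cond_broadcast(&behind_wait);
    pthread_mutex_unlock(&lock);
}

static void read_from_write_mostly(void)
{
    pthread_mutex_lock(&lock);
    readers_waiting++;
    while (behind_writes > 0)
        pthread_cond_wait(&behind_wait, &lock);
    readers_waiting--;
    pthread_mutex_unlock(&lock);
    /* safe to read: nothing we could overtake is still in flight */
}

int main(void)
{
    if (start_behind_write())
        end_behind_write();
    read_from_write_mostly();
    printf("reader proceeded with %d behind writes outstanding\n", behind_writes);
    return 0;
}

Compile with -lpthread. The real code additionally caps behind writes at mddev->bitmap_info.max_write_behind before allocating behind pages.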
+198 -102
drivers/md/raid10.c
··· 24 #include <linux/seq_file.h> 25 #include "md.h" 26 #include "raid10.h" 27 #include "bitmap.h" 28 29 /* ··· 256 static void raid10_end_read_request(struct bio *bio, int error) 257 { 258 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 259 - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 260 int slot, dev; 261 conf_t *conf = r10_bio->mddev->private; 262 ··· 286 */ 287 char b[BDEVNAME_SIZE]; 288 if (printk_ratelimit()) 289 - printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n", 290 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); 291 reschedule_retry(r10_bio); 292 } ··· 298 static void raid10_end_write_request(struct bio *bio, int error) 299 { 300 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 301 - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 302 int slot, dev; 303 conf_t *conf = r10_bio->mddev->private; 304 ··· 496 */ 497 static int read_balance(conf_t *conf, r10bio_t *r10_bio) 498 { 499 - const unsigned long this_sector = r10_bio->sector; 500 int disk, slot, nslot; 501 const int sectors = r10_bio->sectors; 502 sector_t new_distance, current_distance; ··· 603 int i; 604 605 rcu_read_lock(); 606 - for (i=0; i<mddev->raid_disks; i++) { 607 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 608 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 609 struct request_queue *r_queue = bdev_get_queue(rdev->bdev); ··· 637 if (mddev_congested(mddev, bits)) 638 return 1; 639 rcu_read_lock(); 640 - for (i = 0; i < mddev->raid_disks && ret == 0; i++) { 641 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 642 if (rdev && !test_bit(Faulty, &rdev->flags)) { 643 struct request_queue *q = bdev_get_queue(rdev->bdev); ··· 790 spin_unlock_irq(&conf->resync_lock); 791 } 792 793 - static int make_request(struct request_queue *q, struct bio * bio) 794 { 795 - mddev_t *mddev = q->queuedata; 796 conf_t *conf = mddev->private; 797 mirror_info_t *mirror; 798 r10bio_t *r10_bio; 799 struct bio *read_bio; 800 - int cpu; 801 int i; 802 int chunk_sects = conf->chunk_mask + 1; 803 const int rw = bio_data_dir(bio); ··· 825 */ 826 bp = bio_split(bio, 827 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); 828 - if (make_request(q, &bp->bio1)) 829 generic_make_request(&bp->bio1); 830 - if (make_request(q, &bp->bio2)) 831 generic_make_request(&bp->bio2); 832 833 bio_pair_release(bp); 834 return 0; 835 bad_map: 836 - printk("raid10_make_request bug: can't convert block across chunks" 837 - " or bigger than %dk %llu %d\n", chunk_sects/2, 838 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 839 840 bio_io_error(bio); ··· 849 * Continue immediately if no resync is active currently. 
850 */ 851 wait_barrier(conf); 852 - 853 - cpu = part_stat_lock(); 854 - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 855 - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 856 - bio_sectors(bio)); 857 - part_stat_unlock(); 858 859 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 860 ··· 1033 } 1034 set_bit(Faulty, &rdev->flags); 1035 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1036 - printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n" 1037 - "raid10: Operation continuing on %d devices.\n", 1038 - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1039 } 1040 1041 static void print_conf(conf_t *conf) ··· 1044 int i; 1045 mirror_info_t *tmp; 1046 1047 - printk("RAID10 conf printout:\n"); 1048 if (!conf) { 1049 - printk("(!conf)\n"); 1050 return; 1051 } 1052 - printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1053 conf->raid_disks); 1054 1055 for (i = 0; i < conf->raid_disks; i++) { 1056 char b[BDEVNAME_SIZE]; 1057 tmp = conf->mirrors + i; 1058 if (tmp->rdev) 1059 - printk(" disk %d, wo:%d, o:%d, dev:%s\n", 1060 i, !test_bit(In_sync, &tmp->rdev->flags), 1061 !test_bit(Faulty, &tmp->rdev->flags), 1062 bdevname(tmp->rdev->bdev,b)); ··· 1127 int mirror; 1128 mirror_info_t *p; 1129 int first = 0; 1130 - int last = mddev->raid_disks - 1; 1131 1132 if (mddev->recovery_cp < MaxSector) 1133 /* only hot-add to in-sync arrays, as recovery is ··· 1219 1220 static void end_sync_read(struct bio *bio, int error) 1221 { 1222 - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1223 conf_t *conf = r10_bio->mddev->private; 1224 int i,d; 1225 ··· 1256 static void end_sync_write(struct bio *bio, int error) 1257 { 1258 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1259 - r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private); 1260 mddev_t *mddev = r10_bio->mddev; 1261 conf_t *conf = mddev->private; 1262 int i,d; ··· 1505 if (cur_read_error_count > max_read_errors) { 1506 rcu_read_unlock(); 1507 printk(KERN_NOTICE 1508 - "raid10: %s: Raid device exceeded " 1509 "read_error threshold " 1510 "[cur %d:max %d]\n", 1511 b, cur_read_error_count, max_read_errors); 1512 printk(KERN_NOTICE 1513 - "raid10: %s: Failing raid " 1514 - "device\n", b); 1515 md_error(mddev, conf->mirrors[d].rdev); 1516 return; 1517 } ··· 1582 == 0) { 1583 /* Well, this device is dead */ 1584 printk(KERN_NOTICE 1585 - "raid10:%s: read correction " 1586 "write failed" 1587 " (%d sectors at %llu on %s)\n", 1588 mdname(mddev), s, 1589 (unsigned long long)(sect+ 1590 rdev->data_offset), 1591 bdevname(rdev->bdev, b)); 1592 - printk(KERN_NOTICE "raid10:%s: failing " 1593 "drive\n", 1594 bdevname(rdev->bdev, b)); 1595 md_error(mddev, rdev); 1596 } ··· 1619 READ) == 0) { 1620 /* Well, this device is dead */ 1621 printk(KERN_NOTICE 1622 - "raid10:%s: unable to read back " 1623 "corrected sectors" 1624 " (%d sectors at %llu on %s)\n", 1625 mdname(mddev), s, 1626 (unsigned long long)(sect+ 1627 rdev->data_offset), 1628 bdevname(rdev->bdev, b)); 1629 - printk(KERN_NOTICE "raid10:%s: failing drive\n", 1630 bdevname(rdev->bdev, b)); 1631 1632 md_error(mddev, rdev); 1633 } else { 1634 printk(KERN_INFO 1635 - "raid10:%s: read error corrected" 1636 " (%d sectors at %llu on %s)\n", 1637 mdname(mddev), s, 1638 (unsigned long long)(sect+ ··· 1708 mddev->ro ? 
IO_BLOCKED : NULL; 1709 mirror = read_balance(conf, r10_bio); 1710 if (mirror == -1) { 1711 - printk(KERN_ALERT "raid10: %s: unrecoverable I/O" 1712 " read error for block %llu\n", 1713 bdevname(bio->bi_bdev,b), 1714 (unsigned long long)r10_bio->sector); 1715 raid_end_bio_io(r10_bio); ··· 1720 bio_put(bio); 1721 rdev = conf->mirrors[mirror].rdev; 1722 if (printk_ratelimit()) 1723 - printk(KERN_ERR "raid10: %s: redirecting sector %llu to" 1724 " another mirror\n", 1725 bdevname(rdev->bdev,b), 1726 (unsigned long long)r10_bio->sector); 1727 bio = bio_clone(r10_bio->master_bio, GFP_NOIO); ··· 1980 r10_bio = rb2; 1981 if (!test_and_set_bit(MD_RECOVERY_INTR, 1982 &mddev->recovery)) 1983 - printk(KERN_INFO "raid10: %s: insufficient working devices for recovery.\n", 1984 mdname(mddev)); 1985 break; 1986 } ··· 2141 conf_t *conf = mddev->private; 2142 2143 if (!raid_disks) 2144 - raid_disks = mddev->raid_disks; 2145 if (!sectors) 2146 - sectors = mddev->dev_sectors; 2147 2148 size = sectors >> conf->chunk_shift; 2149 sector_div(size, conf->far_copies); ··· 2153 return size << conf->chunk_shift; 2154 } 2155 2156 - static int run(mddev_t *mddev) 2157 { 2158 - conf_t *conf; 2159 - int i, disk_idx, chunk_size; 2160 - mirror_info_t *disk; 2161 - mdk_rdev_t *rdev; 2162 int nc, fc, fo; 2163 sector_t stride, size; 2164 2165 if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || 2166 !is_power_of_2(mddev->chunk_sectors)) { 2167 - printk(KERN_ERR "md/raid10: chunk size must be " 2168 - "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE); 2169 - return -EINVAL; 2170 } 2171 2172 nc = mddev->layout & 255; 2173 fc = (mddev->layout >> 8) & 255; 2174 fo = mddev->layout & (1<<16); 2175 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || 2176 (mddev->layout >> 17)) { 2177 - printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n", 2178 mdname(mddev), mddev->layout); 2179 goto out; 2180 } 2181 - /* 2182 - * copy the already verified devices into our private RAID10 2183 - * bookkeeping area. 
[whatever we allocate in run(), 2184 - * should be freed in stop()] 2185 - */ 2186 conf = kzalloc(sizeof(conf_t), GFP_KERNEL); 2187 - mddev->private = conf; 2188 - if (!conf) { 2189 - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", 2190 - mdname(mddev)); 2191 goto out; 2192 - } 2193 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 2194 - GFP_KERNEL); 2195 - if (!conf->mirrors) { 2196 - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", 2197 - mdname(mddev)); 2198 - goto out_free_conf; 2199 - } 2200 2201 conf->tmppage = alloc_page(GFP_KERNEL); 2202 if (!conf->tmppage) 2203 - goto out_free_conf; 2204 2205 conf->raid_disks = mddev->raid_disks; 2206 conf->near_copies = nc; 2207 conf->far_copies = fc; 2208 conf->copies = nc*fc; 2209 conf->far_offset = fo; 2210 - conf->chunk_mask = mddev->chunk_sectors - 1; 2211 - conf->chunk_shift = ffz(~mddev->chunk_sectors); 2212 size = mddev->dev_sectors >> conf->chunk_shift; 2213 sector_div(size, fc); 2214 size = size * conf->raid_disks; ··· 2221 */ 2222 stride += conf->raid_disks - 1; 2223 sector_div(stride, conf->raid_disks); 2224 - mddev->dev_sectors = stride << conf->chunk_shift; 2225 2226 if (fo) 2227 stride = 1; ··· 2230 sector_div(stride, fc); 2231 conf->stride = stride << conf->chunk_shift; 2232 2233 - conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 2234 - r10bio_pool_free, conf); 2235 - if (!conf->r10bio_pool) { 2236 - printk(KERN_ERR "raid10: couldn't allocate memory for %s\n", 2237 - mdname(mddev)); 2238 - goto out_free_conf; 2239 - } 2240 2241 - conf->mddev = mddev; 2242 spin_lock_init(&conf->device_lock); 2243 mddev->queue->queue_lock = &conf->device_lock; 2244 2245 chunk_size = mddev->chunk_sectors << 9; 2246 blk_queue_io_min(mddev->queue, chunk_size); ··· 2297 2298 list_for_each_entry(rdev, &mddev->disks, same_set) { 2299 disk_idx = rdev->raid_disk; 2300 - if (disk_idx >= mddev->raid_disks 2301 || disk_idx < 0) 2302 continue; 2303 disk = conf->mirrors + disk_idx; 2304 2305 disk->rdev = rdev; ··· 2322 2323 disk->head_position = 0; 2324 } 2325 - INIT_LIST_HEAD(&conf->retry_list); 2326 - 2327 - spin_lock_init(&conf->resync_lock); 2328 - init_waitqueue_head(&conf->wait_barrier); 2329 - 2330 /* need to check that every block has at least one working mirror */ 2331 if (!enough(conf)) { 2332 - printk(KERN_ERR "raid10: not enough operational mirrors for %s\n", 2333 mdname(mddev)); 2334 goto out_free_conf; 2335 } ··· 2343 } 2344 } 2345 2346 - 2347 - mddev->thread = md_register_thread(raid10d, mddev, NULL); 2348 - if (!mddev->thread) { 2349 - printk(KERN_ERR 2350 - "raid10: couldn't allocate thread for %s\n", 2351 - mdname(mddev)); 2352 - goto out_free_conf; 2353 - } 2354 - 2355 if (mddev->recovery_cp != MaxSector) 2356 - printk(KERN_NOTICE "raid10: %s is not clean" 2357 " -- starting background reconstruction\n", 2358 mdname(mddev)); 2359 printk(KERN_INFO 2360 - "raid10: raid set %s active with %d out of %d devices\n", 2361 - mdname(mddev), mddev->raid_disks - mddev->degraded, 2362 - mddev->raid_disks); 2363 /* 2364 * Ok, everything is just fine now 2365 */ 2366 - md_set_array_sectors(mddev, raid10_size(mddev, 0, 0)); 2367 - mddev->resync_max_sectors = raid10_size(mddev, 0, 0); 2368 2369 mddev->queue->unplug_fn = raid10_unplug; 2370 mddev->queue->backing_dev_info.congested_fn = raid10_congested; ··· 2375 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 2376 } 2377 2378 - if (conf->near_copies < mddev->raid_disks) 2379 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 2380 
md_integrity_register(mddev); 2381 return 0; ··· 2387 kfree(conf->mirrors); 2388 kfree(conf); 2389 mddev->private = NULL; 2390 out: 2391 return -EIO; 2392 } ··· 2424 } 2425 } 2426 2427 static struct mdk_personality raid10_personality = 2428 { 2429 .name = "raid10", ··· 2495 .sync_request = sync_request, 2496 .quiesce = raid10_quiesce, 2497 .size = raid10_size, 2498 }; 2499 2500 static int __init raid_init(void)
··· 24 #include <linux/seq_file.h> 25 #include "md.h" 26 #include "raid10.h" 27 + #include "raid0.h" 28 #include "bitmap.h" 29 30 /* ··· 255 static void raid10_end_read_request(struct bio *bio, int error) 256 { 257 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 258 + r10bio_t *r10_bio = bio->bi_private; 259 int slot, dev; 260 conf_t *conf = r10_bio->mddev->private; 261 ··· 285 */ 286 char b[BDEVNAME_SIZE]; 287 if (printk_ratelimit()) 288 + printk(KERN_ERR "md/raid10:%s: %s: rescheduling sector %llu\n", 289 + mdname(conf->mddev), 290 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector); 291 reschedule_retry(r10_bio); 292 } ··· 296 static void raid10_end_write_request(struct bio *bio, int error) 297 { 298 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 299 + r10bio_t *r10_bio = bio->bi_private; 300 int slot, dev; 301 conf_t *conf = r10_bio->mddev->private; 302 ··· 494 */ 495 static int read_balance(conf_t *conf, r10bio_t *r10_bio) 496 { 497 + const sector_t this_sector = r10_bio->sector; 498 int disk, slot, nslot; 499 const int sectors = r10_bio->sectors; 500 sector_t new_distance, current_distance; ··· 601 int i; 602 603 rcu_read_lock(); 604 + for (i=0; i < conf->raid_disks; i++) { 605 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 606 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 607 struct request_queue *r_queue = bdev_get_queue(rdev->bdev); ··· 635 if (mddev_congested(mddev, bits)) 636 return 1; 637 rcu_read_lock(); 638 + for (i = 0; i < conf->raid_disks && ret == 0; i++) { 639 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 640 if (rdev && !test_bit(Faulty, &rdev->flags)) { 641 struct request_queue *q = bdev_get_queue(rdev->bdev); ··· 788 spin_unlock_irq(&conf->resync_lock); 789 } 790 791 + static int make_request(mddev_t *mddev, struct bio * bio) 792 { 793 conf_t *conf = mddev->private; 794 mirror_info_t *mirror; 795 r10bio_t *r10_bio; 796 struct bio *read_bio; 797 int i; 798 int chunk_sects = conf->chunk_mask + 1; 799 const int rw = bio_data_dir(bio); ··· 825 */ 826 bp = bio_split(bio, 827 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); 828 + if (make_request(mddev, &bp->bio1)) 829 generic_make_request(&bp->bio1); 830 + if (make_request(mddev, &bp->bio2)) 831 generic_make_request(&bp->bio2); 832 833 bio_pair_release(bp); 834 return 0; 835 bad_map: 836 + printk("md/raid10:%s: make_request bug: can't convert block across chunks" 837 + " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, 838 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 839 840 bio_io_error(bio); ··· 849 * Continue immediately if no resync is active currently. 
850 */ 851 wait_barrier(conf); 852 853 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 854 ··· 1039 } 1040 set_bit(Faulty, &rdev->flags); 1041 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1042 + printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" 1043 + KERN_ALERT "md/raid10:%s: Operation continuing on %d devices.\n", 1044 + mdname(mddev), bdevname(rdev->bdev, b), 1045 + mdname(mddev), conf->raid_disks - mddev->degraded); 1046 } 1047 1048 static void print_conf(conf_t *conf) ··· 1049 int i; 1050 mirror_info_t *tmp; 1051 1052 + printk(KERN_DEBUG "RAID10 conf printout:\n"); 1053 if (!conf) { 1054 + printk(KERN_DEBUG "(!conf)\n"); 1055 return; 1056 } 1057 + printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 1058 conf->raid_disks); 1059 1060 for (i = 0; i < conf->raid_disks; i++) { 1061 char b[BDEVNAME_SIZE]; 1062 tmp = conf->mirrors + i; 1063 if (tmp->rdev) 1064 + printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", 1065 i, !test_bit(In_sync, &tmp->rdev->flags), 1066 !test_bit(Faulty, &tmp->rdev->flags), 1067 bdevname(tmp->rdev->bdev,b)); ··· 1132 int mirror; 1133 mirror_info_t *p; 1134 int first = 0; 1135 + int last = conf->raid_disks - 1; 1136 1137 if (mddev->recovery_cp < MaxSector) 1138 /* only hot-add to in-sync arrays, as recovery is ··· 1224 1225 static void end_sync_read(struct bio *bio, int error) 1226 { 1227 + r10bio_t *r10_bio = bio->bi_private; 1228 conf_t *conf = r10_bio->mddev->private; 1229 int i,d; 1230 ··· 1261 static void end_sync_write(struct bio *bio, int error) 1262 { 1263 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1264 + r10bio_t *r10_bio = bio->bi_private; 1265 mddev_t *mddev = r10_bio->mddev; 1266 conf_t *conf = mddev->private; 1267 int i,d; ··· 1510 if (cur_read_error_count > max_read_errors) { 1511 rcu_read_unlock(); 1512 printk(KERN_NOTICE 1513 + "md/raid10:%s: %s: Raid device exceeded " 1514 "read_error threshold " 1515 "[cur %d:max %d]\n", 1516 + mdname(mddev), 1517 b, cur_read_error_count, max_read_errors); 1518 printk(KERN_NOTICE 1519 + "md/raid10:%s: %s: Failing raid " 1520 + "device\n", mdname(mddev), b); 1521 md_error(mddev, conf->mirrors[d].rdev); 1522 return; 1523 } ··· 1586 == 0) { 1587 /* Well, this device is dead */ 1588 printk(KERN_NOTICE 1589 + "md/raid10:%s: read correction " 1590 "write failed" 1591 " (%d sectors at %llu on %s)\n", 1592 mdname(mddev), s, 1593 (unsigned long long)(sect+ 1594 rdev->data_offset), 1595 bdevname(rdev->bdev, b)); 1596 + printk(KERN_NOTICE "md/raid10:%s: %s: failing " 1597 "drive\n", 1598 + mdname(mddev), 1599 bdevname(rdev->bdev, b)); 1600 md_error(mddev, rdev); 1601 } ··· 1622 READ) == 0) { 1623 /* Well, this device is dead */ 1624 printk(KERN_NOTICE 1625 + "md/raid10:%s: unable to read back " 1626 "corrected sectors" 1627 " (%d sectors at %llu on %s)\n", 1628 mdname(mddev), s, 1629 (unsigned long long)(sect+ 1630 rdev->data_offset), 1631 bdevname(rdev->bdev, b)); 1632 + printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n", 1633 + mdname(mddev), 1634 bdevname(rdev->bdev, b)); 1635 1636 md_error(mddev, rdev); 1637 } else { 1638 printk(KERN_INFO 1639 + "md/raid10:%s: read error corrected" 1640 " (%d sectors at %llu on %s)\n", 1641 mdname(mddev), s, 1642 (unsigned long long)(sect+ ··· 1710 mddev->ro ? 
IO_BLOCKED : NULL; 1711 mirror = read_balance(conf, r10_bio); 1712 if (mirror == -1) { 1713 + printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" 1714 " read error for block %llu\n", 1715 + mdname(mddev), 1716 bdevname(bio->bi_bdev,b), 1717 (unsigned long long)r10_bio->sector); 1718 raid_end_bio_io(r10_bio); ··· 1721 bio_put(bio); 1722 rdev = conf->mirrors[mirror].rdev; 1723 if (printk_ratelimit()) 1724 + printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" 1725 " another mirror\n", 1726 + mdname(mddev), 1727 bdevname(rdev->bdev,b), 1728 (unsigned long long)r10_bio->sector); 1729 bio = bio_clone(r10_bio->master_bio, GFP_NOIO); ··· 1980 r10_bio = rb2; 1981 if (!test_and_set_bit(MD_RECOVERY_INTR, 1982 &mddev->recovery)) 1983 + printk(KERN_INFO "md/raid10:%s: insufficient " 1984 + "working devices for recovery.\n", 1985 mdname(mddev)); 1986 break; 1987 } ··· 2140 conf_t *conf = mddev->private; 2141 2142 if (!raid_disks) 2143 + raid_disks = conf->raid_disks; 2144 if (!sectors) 2145 + sectors = conf->dev_sectors; 2146 2147 size = sectors >> conf->chunk_shift; 2148 sector_div(size, conf->far_copies); ··· 2152 return size << conf->chunk_shift; 2153 } 2154 2155 + 2156 + static conf_t *setup_conf(mddev_t *mddev) 2157 { 2158 + conf_t *conf = NULL; 2159 int nc, fc, fo; 2160 sector_t stride, size; 2161 + int err = -EINVAL; 2162 2163 if (mddev->chunk_sectors < (PAGE_SIZE >> 9) || 2164 !is_power_of_2(mddev->chunk_sectors)) { 2165 + printk(KERN_ERR "md/raid10:%s: chunk size must be " 2166 + "at least PAGE_SIZE(%ld) and be a power of 2.\n", 2167 + mdname(mddev), PAGE_SIZE); 2168 + goto out; 2169 } 2170 2171 nc = mddev->layout & 255; 2172 fc = (mddev->layout >> 8) & 255; 2173 fo = mddev->layout & (1<<16); 2174 + 2175 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks || 2176 (mddev->layout >> 17)) { 2177 + printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 2178 mdname(mddev), mddev->layout); 2179 goto out; 2180 } 2181 + 2182 + err = -ENOMEM; 2183 conf = kzalloc(sizeof(conf_t), GFP_KERNEL); 2184 + if (!conf) 2185 goto out; 2186 + 2187 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 2188 + GFP_KERNEL); 2189 + if (!conf->mirrors) 2190 + goto out; 2191 2192 conf->tmppage = alloc_page(GFP_KERNEL); 2193 if (!conf->tmppage) 2194 + goto out; 2195 + 2196 2197 conf->raid_disks = mddev->raid_disks; 2198 conf->near_copies = nc; 2199 conf->far_copies = fc; 2200 conf->copies = nc*fc; 2201 conf->far_offset = fo; 2202 + conf->chunk_mask = mddev->new_chunk_sectors - 1; 2203 + conf->chunk_shift = ffz(~mddev->new_chunk_sectors); 2204 + 2205 + conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 2206 + r10bio_pool_free, conf); 2207 + if (!conf->r10bio_pool) 2208 + goto out; 2209 + 2210 size = mddev->dev_sectors >> conf->chunk_shift; 2211 sector_div(size, fc); 2212 size = size * conf->raid_disks; ··· 2221 */ 2222 stride += conf->raid_disks - 1; 2223 sector_div(stride, conf->raid_disks); 2224 + 2225 + conf->dev_sectors = stride << conf->chunk_shift; 2226 2227 if (fo) 2228 stride = 1; ··· 2229 sector_div(stride, fc); 2230 conf->stride = stride << conf->chunk_shift; 2231 2232 2233 spin_lock_init(&conf->device_lock); 2234 + INIT_LIST_HEAD(&conf->retry_list); 2235 + 2236 + spin_lock_init(&conf->resync_lock); 2237 + init_waitqueue_head(&conf->wait_barrier); 2238 + 2239 + conf->thread = md_register_thread(raid10d, mddev, NULL); 2240 + if (!conf->thread) 2241 + goto out; 2242 + 2243 + conf->scale_disks = 0; 2244 + conf->mddev = mddev; 2245 + return conf; 2246 + 2247 + 
out: 2248 + printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", 2249 + mdname(mddev)); 2250 + if (conf) { 2251 + if (conf->r10bio_pool) 2252 + mempool_destroy(conf->r10bio_pool); 2253 + kfree(conf->mirrors); 2254 + safe_put_page(conf->tmppage); 2255 + kfree(conf); 2256 + } 2257 + return ERR_PTR(err); 2258 + } 2259 + 2260 + static int run(mddev_t *mddev) 2261 + { 2262 + conf_t *conf; 2263 + int i, disk_idx, chunk_size; 2264 + mirror_info_t *disk; 2265 + mdk_rdev_t *rdev; 2266 + sector_t size; 2267 + 2268 + /* 2269 + * copy the already verified devices into our private RAID10 2270 + * bookkeeping area. [whatever we allocate in run(), 2271 + * should be freed in stop()] 2272 + */ 2273 + 2274 + if (mddev->private == NULL) { 2275 + conf = setup_conf(mddev); 2276 + if (IS_ERR(conf)) 2277 + return PTR_ERR(conf); 2278 + mddev->private = conf; 2279 + } 2280 + conf = mddev->private; 2281 + if (!conf) 2282 + goto out; 2283 + 2284 mddev->queue->queue_lock = &conf->device_lock; 2285 + 2286 + mddev->thread = conf->thread; 2287 + conf->thread = NULL; 2288 2289 chunk_size = mddev->chunk_sectors << 9; 2290 blk_queue_io_min(mddev->queue, chunk_size); ··· 2251 2252 list_for_each_entry(rdev, &mddev->disks, same_set) { 2253 disk_idx = rdev->raid_disk; 2254 + if (disk_idx >= conf->raid_disks 2255 || disk_idx < 0) 2256 continue; 2257 + if (conf->scale_disks) { 2258 + disk_idx *= conf->scale_disks; 2259 + rdev->raid_disk = disk_idx; 2260 + /* MOVE 'rd%d' link !! */ 2261 + } 2262 disk = conf->mirrors + disk_idx; 2263 2264 disk->rdev = rdev; ··· 2271 2272 disk->head_position = 0; 2273 } 2274 /* need to check that every block has at least one working mirror */ 2275 if (!enough(conf)) { 2276 + printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", 2277 mdname(mddev)); 2278 goto out_free_conf; 2279 } ··· 2297 } 2298 } 2299 2300 if (mddev->recovery_cp != MaxSector) 2301 + printk(KERN_NOTICE "md/raid10:%s: not clean" 2302 " -- starting background reconstruction\n", 2303 mdname(mddev)); 2304 printk(KERN_INFO 2305 + "md/raid10:%s: active with %d out of %d devices\n", 2306 + mdname(mddev), conf->raid_disks - mddev->degraded, 2307 + conf->raid_disks); 2308 /* 2309 * Ok, everything is just fine now 2310 */ 2311 + mddev->dev_sectors = conf->dev_sectors; 2312 + size = raid10_size(mddev, 0, 0); 2313 + md_set_array_sectors(mddev, size); 2314 + mddev->resync_max_sectors = size; 2315 2316 mddev->queue->unplug_fn = raid10_unplug; 2317 mddev->queue->backing_dev_info.congested_fn = raid10_congested; ··· 2336 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 2337 } 2338 2339 + if (conf->near_copies < conf->raid_disks) 2340 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 2341 md_integrity_register(mddev); 2342 return 0; ··· 2348 kfree(conf->mirrors); 2349 kfree(conf); 2350 mddev->private = NULL; 2351 + md_unregister_thread(mddev->thread); 2352 out: 2353 return -EIO; 2354 } ··· 2384 } 2385 } 2386 2387 + static void *raid10_takeover_raid0(mddev_t *mddev) 2388 + { 2389 + mdk_rdev_t *rdev; 2390 + conf_t *conf; 2391 + 2392 + if (mddev->degraded > 0) { 2393 + printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", 2394 + mdname(mddev)); 2395 + return ERR_PTR(-EINVAL); 2396 + } 2397 + 2398 + /* Update slot numbers to obtain 2399 + * degraded raid10 with missing mirrors 2400 + */ 2401 + list_for_each_entry(rdev, &mddev->disks, same_set) { 2402 + rdev->raid_disk *= 2; 2403 + } 2404 + 2405 + /* Set new parameters */ 2406 + mddev->new_level = 10; 2407 + /* new layout: far_copies = 1, near_copies = 2 */ 2408 + 
mddev->new_layout = (1<<8) + 2; 2409 + mddev->new_chunk_sectors = mddev->chunk_sectors; 2410 + mddev->delta_disks = mddev->raid_disks; 2411 + mddev->degraded = mddev->raid_disks; 2412 + mddev->raid_disks *= 2; 2413 + /* make sure it will be not marked as dirty */ 2414 + mddev->recovery_cp = MaxSector; 2415 + 2416 + conf = setup_conf(mddev); 2417 + conf->scale_disks = 2; 2418 + return conf; 2419 + } 2420 + 2421 + static void *raid10_takeover(mddev_t *mddev) 2422 + { 2423 + struct raid0_private_data *raid0_priv; 2424 + 2425 + /* raid10 can take over: 2426 + * raid0 - providing it has only two drives 2427 + */ 2428 + if (mddev->level == 0) { 2429 + /* for raid0 takeover only one zone is supported */ 2430 + raid0_priv = mddev->private; 2431 + if (raid0_priv->nr_strip_zones > 1) { 2432 + printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" 2433 + " with more than one zone.\n", 2434 + mdname(mddev)); 2435 + return ERR_PTR(-EINVAL); 2436 + } 2437 + return raid10_takeover_raid0(mddev); 2438 + } 2439 + return ERR_PTR(-EINVAL); 2440 + } 2441 + 2442 static struct mdk_personality raid10_personality = 2443 { 2444 .name = "raid10", ··· 2400 .sync_request = sync_request, 2401 .quiesce = raid10_quiesce, 2402 .size = raid10_size, 2403 + .takeover = raid10_takeover, 2404 }; 2405 2406 static int __init raid_init(void)
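raid10_takeover_raid0() above does no data movement: it doubles each member's slot number, doubles raid_disks, counts the new odd slots as the degraded half, and encodes the near-2 layout that setup_conf() later decodes (near copies in the low byte, far copies in the next byte). The standalone sketch below, with an assumed three-disk single-zone raid0, just prints the resulting geometry; it is an illustration, not kernel code:

#include <stdio.h>

int main(void)
{
    int raid_disks = 3;            /* single-zone raid0, assumed */
    int layout = (1 << 8) + 2;     /* far_copies = 1, near_copies = 2 */
    int near_copies = layout & 255;
    int far_copies = (layout >> 8) & 255;
    int slot;

    printf("new layout 0x%x: near_copies=%d far_copies=%d\n",
           layout, near_copies, far_copies);

    for (slot = 0; slot < raid_disks; slot++)
        printf("raid0 member %d -> raid10 slot %d (mirror slot %d missing)\n",
               slot, slot * 2, slot * 2 + 1);

    printf("raid_disks %d -> %d, degraded = %d\n",
           raid_disks, raid_disks * 2, raid_disks);
    return 0;
}

The takeover is refused for a raid0 with more than one strip zone, since a multi-zone layout has no uniform mirror geometry to map onto.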
+12
drivers/md/raid10.h
··· 33 * 1 stripe. 34 */ 35 36 int chunk_shift; /* shift from chunks to sectors */ 37 sector_t chunk_mask; 38 39 struct list_head retry_list; 40 /* queue pending writes and submit them on unplug */ ··· 64 mempool_t *r10bio_pool; 65 mempool_t *r10buf_pool; 66 struct page *tmppage; 67 }; 68 69 typedef struct r10_private_data_s conf_t;
··· 33 * 1 stripe. 34 */ 35 36 + sector_t dev_sectors; /* temp copy of mddev->dev_sectors */ 37 + 38 int chunk_shift; /* shift from chunks to sectors */ 39 sector_t chunk_mask; 40 + 41 + int scale_disks; /* When starting array, multiply 42 + * each ->raid_disk by this. 43 + * Needed for raid0->raid10 migration 44 + */ 45 46 struct list_head retry_list; 47 /* queue pending writes and submit them on unplug */ ··· 57 mempool_t *r10bio_pool; 58 mempool_t *r10buf_pool; 59 struct page *tmppage; 60 + 61 + /* When taking over an array from a different personality, we store 62 + * the new thread here until we fully activate the array. 63 + */ 64 + struct mdk_thread_s *thread; 65 }; 66 67 typedef struct r10_private_data_s conf_t;
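Two of the new raid10.h fields support takeover: dev_sectors lets setup_conf() keep its own size calculation until run() copies it back to the mddev, and thread parks the raid10d service thread that setup_conf() registers before the array has actually switched personality. The hand-off itself is a move-and-clear, sketched below with plain C stand-ins (nothing here is a kernel API):

#include <stdio.h>
#include <stddef.h>

struct thread { const char *name; };
struct conf   { struct thread *thread; };
struct array  { struct thread *thread; struct conf *conf; };

static struct thread raid10d = { "raid10d" };

static int setup_conf(struct conf *conf)
{
    conf->thread = &raid10d;    /* parked until the array is activated */
    return 0;
}

static int run(struct array *mddev)
{
    /* adopt the thread that setup_conf() prepared, exactly once */
    mddev->thread = mddev->conf->thread;
    mddev->conf->thread = NULL;
    return 0;
}

int main(void)
{
    struct conf conf = { NULL };
    struct array mddev = { NULL, &conf };

    setup_conf(&conf);
    run(&mddev);
    printf("array thread: %s, conf thread still parked: %s\n",
           mddev.thread->name, conf.thread ? "yes" : "no");
    return 0;
}

Splitting setup_conf() out of run() this way is what lets raid10_takeover() build a complete configuration while the old personality is still active, returning it via mddev->private for run() to adopt later.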
+127 -104
drivers/md/raid5.c
··· 53 #include <linux/slab.h> 54 #include "md.h" 55 #include "raid5.h" 56 #include "bitmap.h" 57 58 /* ··· 1510 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1511 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1512 rdev = conf->disks[i].rdev; 1513 - printk_rl(KERN_INFO "raid5:%s: read error corrected" 1514 " (%lu sectors at %llu on %s)\n", 1515 mdname(conf->mddev), STRIPE_SECTORS, 1516 (unsigned long long)(sh->sector ··· 1530 atomic_inc(&rdev->read_errors); 1531 if (conf->mddev->degraded >= conf->max_degraded) 1532 printk_rl(KERN_WARNING 1533 - "raid5:%s: read error not correctable " 1534 "(sector %llu on %s).\n", 1535 mdname(conf->mddev), 1536 (unsigned long long)(sh->sector ··· 1539 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1540 /* Oh, no!!! */ 1541 printk_rl(KERN_WARNING 1542 - "raid5:%s: read error NOT corrected!! " 1543 "(sector %llu on %s).\n", 1544 mdname(conf->mddev), 1545 (unsigned long long)(sh->sector ··· 1548 else if (atomic_read(&rdev->read_errors) 1549 > conf->max_nr_stripes) 1550 printk(KERN_WARNING 1551 - "raid5:%s: Too many read errors, failing device %s.\n", 1552 mdname(conf->mddev), bdn); 1553 else 1554 retry = 1; ··· 1620 static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1621 { 1622 char b[BDEVNAME_SIZE]; 1623 - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 1624 - pr_debug("raid5: error called\n"); 1625 1626 if (!test_bit(Faulty, &rdev->flags)) { 1627 set_bit(MD_CHANGE_DEVS, &mddev->flags); ··· 1637 } 1638 set_bit(Faulty, &rdev->flags); 1639 printk(KERN_ALERT 1640 - "raid5: Disk failure on %s, disabling device.\n" 1641 - "raid5: Operation continuing on %d devices.\n", 1642 - bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1643 } 1644 } 1645 ··· 1719 pd_idx = data_disks; 1720 break; 1721 default: 1722 - printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1723 - algorithm); 1724 BUG(); 1725 } 1726 break; ··· 1835 qd_idx = raid_disks - 1; 1836 break; 1837 1838 - 1839 default: 1840 - printk(KERN_CRIT "raid6: unsupported algorithm %d\n", 1841 - algorithm); 1842 BUG(); 1843 } 1844 break; ··· 1898 case ALGORITHM_PARITY_N: 1899 break; 1900 default: 1901 - printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1902 - algorithm); 1903 BUG(); 1904 } 1905 break; ··· 1956 i -= 1; 1957 break; 1958 default: 1959 - printk(KERN_CRIT "raid6: unsupported algorithm %d\n", 1960 - algorithm); 1961 BUG(); 1962 } 1963 break; ··· 1968 previous, &dummy1, &sh2); 1969 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 1970 || sh2.qd_idx != sh->qd_idx) { 1971 - printk(KERN_ERR "compute_blocknr: map not correct\n"); 1972 return 0; 1973 } 1974 return r_sector; ··· 3706 3707 bio_put(bi); 3708 3709 - mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; 3710 - conf = mddev->private; 3711 rdev = (void*)raid_bi->bi_next; 3712 raid_bi->bi_next = NULL; 3713 3714 rdev_dec_pending(rdev, conf->mddev); 3715 ··· 3746 } 3747 3748 3749 - static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) 3750 { 3751 - mddev_t *mddev = q->queuedata; 3752 raid5_conf_t *conf = mddev->private; 3753 int dd_idx; 3754 struct bio* align_bi; ··· 3862 return sh; 3863 } 3864 3865 - static int make_request(struct request_queue *q, struct bio * bi) 3866 { 3867 - mddev_t *mddev = q->queuedata; 3868 raid5_conf_t *conf = mddev->private; 3869 int dd_idx; 3870 sector_t new_sector; 3871 sector_t logical_sector, last_sector; 3872 struct stripe_head *sh; 3873 const int rw = bio_data_dir(bi); 3874 - int cpu, remaining; 3875 3876 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) 
{ 3877 /* Drain all pending writes. We only really need ··· 3885 3886 md_write_start(mddev, bi); 3887 3888 - cpu = part_stat_lock(); 3889 - part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 3890 - part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 3891 - bio_sectors(bi)); 3892 - part_stat_unlock(); 3893 - 3894 if (rw == READ && 3895 mddev->reshape_position == MaxSector && 3896 - chunk_aligned_read(q,bi)) 3897 return 0; 3898 3899 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); ··· 3935 new_sector = raid5_compute_sector(conf, logical_sector, 3936 previous, 3937 &dd_idx, NULL); 3938 - pr_debug("raid5: make_request, sector %llu logical %llu\n", 3939 (unsigned long long)new_sector, 3940 (unsigned long long)logical_sector); 3941 ··· 4043 * As the reads complete, handle_stripe will copy the data 4044 * into the destination stripe and release that stripe. 4045 */ 4046 - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4047 struct stripe_head *sh; 4048 sector_t first_sector, last_sector; 4049 int raid_disks = conf->previous_raid_disks; ··· 4252 /* FIXME go_faster isn't used */ 4253 static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 4254 { 4255 - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4256 struct stripe_head *sh; 4257 sector_t max_sector = mddev->dev_sectors; 4258 int sync_blocks; ··· 4714 if (mddev->new_level != 5 4715 && mddev->new_level != 4 4716 && mddev->new_level != 6) { 4717 - printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", 4718 mdname(mddev), mddev->new_level); 4719 return ERR_PTR(-EIO); 4720 } ··· 4722 && !algorithm_valid_raid5(mddev->new_layout)) || 4723 (mddev->new_level == 6 4724 && !algorithm_valid_raid6(mddev->new_layout))) { 4725 - printk(KERN_ERR "raid5: %s: layout %d not supported\n", 4726 mdname(mddev), mddev->new_layout); 4727 return ERR_PTR(-EIO); 4728 } 4729 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 4730 - printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", 4731 mdname(mddev), mddev->raid_disks); 4732 return ERR_PTR(-EINVAL); 4733 } ··· 4735 if (!mddev->new_chunk_sectors || 4736 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4737 !is_power_of_2(mddev->new_chunk_sectors)) { 4738 - printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 4739 - mddev->new_chunk_sectors << 9, mdname(mddev)); 4740 return ERR_PTR(-EINVAL); 4741 } 4742 ··· 4778 if (raid5_alloc_percpu(conf) != 0) 4779 goto abort; 4780 4781 - pr_debug("raid5: run(%s) called.\n", mdname(mddev)); 4782 4783 list_for_each_entry(rdev, &mddev->disks, same_set) { 4784 raid_disk = rdev->raid_disk; ··· 4791 4792 if (test_bit(In_sync, &rdev->flags)) { 4793 char b[BDEVNAME_SIZE]; 4794 - printk(KERN_INFO "raid5: device %s operational as raid" 4795 - " disk %d\n", bdevname(rdev->bdev,b), 4796 - raid_disk); 4797 } else 4798 /* Cannot rely on bitmap to complete recovery */ 4799 conf->fullsync = 1; ··· 4817 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4818 if (grow_stripes(conf, conf->max_nr_stripes)) { 4819 printk(KERN_ERR 4820 - "raid5: couldn't allocate %dkB for buffers\n", memory); 4821 goto abort; 4822 } else 4823 - printk(KERN_INFO "raid5: allocated %dkB for %s\n", 4824 - memory, mdname(mddev)); 4825 4826 conf->thread = md_register_thread(raid5d, mddev, NULL); 4827 if (!conf->thread) { 4828 printk(KERN_ERR 4829 - "raid5: couldn't allocate thread for %s\n", 4830 mdname(mddev)); 4831 goto abort; 4832 } ··· 4878 sector_t reshape_offset = 0; 4879 4880 if (mddev->recovery_cp 
!= MaxSector) 4881 - printk(KERN_NOTICE "raid5: %s is not clean" 4882 " -- starting background reconstruction\n", 4883 mdname(mddev)); 4884 if (mddev->reshape_position != MaxSector) { ··· 4892 int max_degraded = (mddev->level == 6 ? 2 : 1); 4893 4894 if (mddev->new_level != mddev->level) { 4895 - printk(KERN_ERR "raid5: %s: unsupported reshape " 4896 "required - aborting.\n", 4897 mdname(mddev)); 4898 return -EINVAL; ··· 4905 here_new = mddev->reshape_position; 4906 if (sector_div(here_new, mddev->new_chunk_sectors * 4907 (mddev->raid_disks - max_degraded))) { 4908 - printk(KERN_ERR "raid5: reshape_position not " 4909 - "on a stripe boundary\n"); 4910 return -EINVAL; 4911 } 4912 reshape_offset = here_new * mddev->new_chunk_sectors; ··· 4927 if ((here_new * mddev->new_chunk_sectors != 4928 here_old * mddev->chunk_sectors) || 4929 mddev->ro == 0) { 4930 - printk(KERN_ERR "raid5: in-place reshape must be started" 4931 - " in read-only mode - aborting\n"); 4932 return -EINVAL; 4933 } 4934 } else if (mddev->delta_disks < 0 ··· 4938 : (here_new * mddev->new_chunk_sectors >= 4939 here_old * mddev->chunk_sectors)) { 4940 /* Reading from the same stripe as writing to - bad */ 4941 - printk(KERN_ERR "raid5: reshape_position too early for " 4942 - "auto-recovery - aborting.\n"); 4943 return -EINVAL; 4944 } 4945 - printk(KERN_INFO "raid5: reshape will continue\n"); 4946 /* OK, we should be able to continue; */ 4947 } else { 4948 BUG_ON(mddev->level != mddev->new_level); ··· 4986 mddev->minor_version > 90) 4987 rdev->recovery_offset = reshape_offset; 4988 4989 - printk("%d: w=%d pa=%d pr=%d m=%d a=%d r=%d op1=%d op2=%d\n", 4990 - rdev->raid_disk, working_disks, conf->prev_algo, 4991 - conf->previous_raid_disks, conf->max_degraded, 4992 - conf->algorithm, conf->raid_disks, 4993 - only_parity(rdev->raid_disk, 4994 - conf->prev_algo, 4995 - conf->previous_raid_disks, 4996 - conf->max_degraded), 4997 - only_parity(rdev->raid_disk, 4998 - conf->algorithm, 4999 - conf->raid_disks, 5000 - conf->max_degraded)); 5001 if (rdev->recovery_offset < reshape_offset) { 5002 /* We need to check old and new layout */ 5003 if (!only_parity(rdev->raid_disk, ··· 5006 - working_disks); 5007 5008 if (mddev->degraded > conf->max_degraded) { 5009 - printk(KERN_ERR "raid5: not enough operational devices for %s" 5010 " (%d/%d failed)\n", 5011 mdname(mddev), mddev->degraded, conf->raid_disks); 5012 goto abort; ··· 5020 mddev->recovery_cp != MaxSector) { 5021 if (mddev->ok_start_degraded) 5022 printk(KERN_WARNING 5023 - "raid5: starting dirty degraded array: %s" 5024 - "- data corruption possible.\n", 5025 mdname(mddev)); 5026 else { 5027 printk(KERN_ERR 5028 - "raid5: cannot start dirty degraded array for %s\n", 5029 mdname(mddev)); 5030 goto abort; 5031 } 5032 } 5033 5034 if (mddev->degraded == 0) 5035 - printk("raid5: raid level %d set %s active with %d out of %d" 5036 - " devices, algorithm %d\n", conf->level, mdname(mddev), 5037 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5038 mddev->new_layout); 5039 else 5040 - printk(KERN_ALERT "raid5: raid level %d set %s active with %d" 5041 - " out of %d devices, algorithm %d\n", conf->level, 5042 - mdname(mddev), mddev->raid_disks - mddev->degraded, 5043 - mddev->raid_disks, mddev->new_layout); 5044 5045 print_raid5_conf(conf); 5046 5047 if (conf->reshape_progress != MaxSector) { 5048 - printk("...ok start reshape thread\n"); 5049 conf->reshape_safe = conf->reshape_progress; 5050 atomic_set(&conf->reshape_stripes, 0); 5051 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); ··· 
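Editor's note: several of the run()-time checks above hinge on whether mddev->reshape_position falls on a full-stripe boundary, which the kernel tests with sector_div() against new_chunk_sectors times the number of data disks. The following user-space sketch only restates that arithmetic; the function name reshape_on_stripe_boundary and the example numbers are invented for illustration and are not kernel code.

/*
 * Sketch only: mirrors the reshape_position validation in run() with plain
 * modulo arithmetic.  sector_div() in the kernel divides in place and yields
 * the remainder; all that matters here is whether that remainder is zero.
 */
#include <stdint.h>
#include <stdio.h>

static int reshape_on_stripe_boundary(uint64_t reshape_position,
                                      unsigned int new_chunk_sectors,
                                      int raid_disks, int max_degraded)
{
        /* A full stripe spans new_chunk_sectors on every data disk;
         * the max_degraded parity devices do not count. */
        uint64_t stripe_sectors =
                (uint64_t)new_chunk_sectors * (raid_disks - max_degraded);

        return (reshape_position % stripe_sectors) == 0;
}

int main(void)
{
        /* 512-sector chunks, 6 devices, RAID-6 (max_degraded == 2):
         * a stripe is 512 * 4 = 2048 sectors, so 8192 is on a boundary. */
        printf("%d\n", reshape_on_stripe_boundary(8192, 512, 6, 2));
        return 0;
}

A non-zero remainder is what triggers the "reshape_position not on a stripe boundary" error seen above, and run() then refuses to continue the reshape.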
5068 } 5069 5070 /* Ok, everything is just fine now */ 5071 - if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5072 printk(KERN_WARNING 5073 - "raid5: failed to create sysfs attributes for %s\n", 5074 mdname(mddev)); 5075 5076 mddev->queue->queue_lock = &conf->device_lock; ··· 5102 free_conf(conf); 5103 } 5104 mddev->private = NULL; 5105 - printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); 5106 return -EIO; 5107 } 5108 5109 - 5110 - 5111 static int stop(mddev_t *mddev) 5112 { 5113 - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 5114 5115 md_unregister_thread(mddev->thread); 5116 mddev->thread = NULL; 5117 mddev->queue->backing_dev_info.congested_fn = NULL; 5118 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5119 free_conf(conf); 5120 - mddev->private = &raid5_attrs_group; 5121 return 0; 5122 } 5123 ··· 5157 5158 static void status(struct seq_file *seq, mddev_t *mddev) 5159 { 5160 - raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 5161 int i; 5162 5163 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, ··· 5179 int i; 5180 struct disk_info *tmp; 5181 5182 - printk("RAID5 conf printout:\n"); 5183 if (!conf) { 5184 printk("(conf==NULL)\n"); 5185 return; 5186 } 5187 - printk(" --- rd:%d wd:%d\n", conf->raid_disks, 5188 - conf->raid_disks - conf->mddev->degraded); 5189 5190 for (i = 0; i < conf->raid_disks; i++) { 5191 char b[BDEVNAME_SIZE]; 5192 tmp = conf->disks + i; 5193 if (tmp->rdev) 5194 - printk(" disk %d, o:%d, dev:%s\n", 5195 - i, !test_bit(Faulty, &tmp->rdev->flags), 5196 - bdevname(tmp->rdev->bdev,b)); 5197 } 5198 } 5199 ··· 5317 raid5_size(mddev, sectors, mddev->raid_disks)) 5318 return -EINVAL; 5319 set_capacity(mddev->gendisk, mddev->array_sectors); 5320 - mddev->changed = 1; 5321 revalidate_disk(mddev->gendisk); 5322 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { 5323 mddev->recovery_cp = mddev->dev_sectors; ··· 5342 > conf->max_nr_stripes || 5343 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5344 > conf->max_nr_stripes) { 5345 - printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", 5346 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5347 / STRIPE_SIZE)*4); 5348 return 0; ··· 5414 */ 5415 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5416 < mddev->array_sectors) { 5417 - printk(KERN_ERR "md: %s: array size must be reduced " 5418 "before number of disks\n", mdname(mddev)); 5419 return -EINVAL; 5420 } ··· 5452 if (sysfs_create_link(&mddev->kobj, 5453 &rdev->kobj, nm)) 5454 printk(KERN_WARNING 5455 - "raid5: failed to create " 5456 - " link %s for %s\n", 5457 - nm, mdname(mddev)); 5458 } else 5459 break; 5460 } ··· 5531 if (mddev->delta_disks > 0) { 5532 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5533 set_capacity(mddev->gendisk, mddev->array_sectors); 5534 - mddev->changed = 1; 5535 revalidate_disk(mddev->gendisk); 5536 } else { 5537 int d; ··· 5592 spin_unlock_irq(&conf->device_lock); 5593 break; 5594 } 5595 } 5596 5597 ··· 5742 static void *raid5_takeover(mddev_t *mddev) 5743 { 5744 /* raid5 can take over: 5745 - * raid0 - if all devices are the same - make it a raid4 layout 5746 * raid1 - if there are two drives. We need to know the chunk size 5747 * raid4 - trivial - just use a raid4 layout. 
5748 *  raid6 - Providing it is a *_6 layout
5749 */
5750 -
5751 if (mddev->level == 1)
5752         return raid5_takeover_raid1(mddev);
5753 if (mddev->level == 4) {
···
5762 return ERR_PTR(-EINVAL);
5763 }
5764
5765
5766 static struct mdk_personality raid5_personality;
5767
···
5893 .start_reshape  = raid5_start_reshape,
5894 .finish_reshape = raid5_finish_reshape,
5895 .quiesce        = raid5_quiesce,
5896 };
5897
5898 static int __init raid5_init(void)
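Editor's note: the pre-patch listing ends here. The most pervasive change in the rewritten side below is the message renaming: every bare "raid5:"/"raid6:" prefix becomes a uniform "md/raid:%s:" built with mdname(mddev), so on systems with several arrays the log identifies which array each message belongs to. The patch open-codes that prefix at every call site; purely as an illustration of the convention (raid_printk is a hypothetical helper, not part of this diff), an equivalent wrapper could look like:

/* Illustrative only -- not part of this diff. */
#define raid_printk(level, mddev, fmt, args...) \
        printk(level "md/raid:%s: " fmt, mdname(mddev), ##args)

/* e.g. raid_printk(KERN_WARNING, mddev,
 *                  "Too many read errors, failing device %s.\n", bdn); */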
··· 53 #include <linux/slab.h> 54 #include "md.h" 55 #include "raid5.h" 56 + #include "raid0.h" 57 #include "bitmap.h" 58 59 /* ··· 1509 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1510 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1511 rdev = conf->disks[i].rdev; 1512 + printk_rl(KERN_INFO "md/raid:%s: read error corrected" 1513 " (%lu sectors at %llu on %s)\n", 1514 mdname(conf->mddev), STRIPE_SECTORS, 1515 (unsigned long long)(sh->sector ··· 1529 atomic_inc(&rdev->read_errors); 1530 if (conf->mddev->degraded >= conf->max_degraded) 1531 printk_rl(KERN_WARNING 1532 + "md/raid:%s: read error not correctable " 1533 "(sector %llu on %s).\n", 1534 mdname(conf->mddev), 1535 (unsigned long long)(sh->sector ··· 1538 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1539 /* Oh, no!!! */ 1540 printk_rl(KERN_WARNING 1541 + "md/raid:%s: read error NOT corrected!! " 1542 "(sector %llu on %s).\n", 1543 mdname(conf->mddev), 1544 (unsigned long long)(sh->sector ··· 1547 else if (atomic_read(&rdev->read_errors) 1548 > conf->max_nr_stripes) 1549 printk(KERN_WARNING 1550 + "md/raid:%s: Too many read errors, failing device %s.\n", 1551 mdname(conf->mddev), bdn); 1552 else 1553 retry = 1; ··· 1619 static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1620 { 1621 char b[BDEVNAME_SIZE]; 1622 + raid5_conf_t *conf = mddev->private; 1623 + pr_debug("raid456: error called\n"); 1624 1625 if (!test_bit(Faulty, &rdev->flags)) { 1626 set_bit(MD_CHANGE_DEVS, &mddev->flags); ··· 1636 } 1637 set_bit(Faulty, &rdev->flags); 1638 printk(KERN_ALERT 1639 + "md/raid:%s: Disk failure on %s, disabling device.\n" 1640 + KERN_ALERT 1641 + "md/raid:%s: Operation continuing on %d devices.\n", 1642 + mdname(mddev), 1643 + bdevname(rdev->bdev, b), 1644 + mdname(mddev), 1645 + conf->raid_disks - mddev->degraded); 1646 } 1647 } 1648 ··· 1714 pd_idx = data_disks; 1715 break; 1716 default: 1717 BUG(); 1718 } 1719 break; ··· 1832 qd_idx = raid_disks - 1; 1833 break; 1834 1835 default: 1836 BUG(); 1837 } 1838 break; ··· 1898 case ALGORITHM_PARITY_N: 1899 break; 1900 default: 1901 BUG(); 1902 } 1903 break; ··· 1958 i -= 1; 1959 break; 1960 default: 1961 BUG(); 1962 } 1963 break; ··· 1972 previous, &dummy1, &sh2); 1973 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 1974 || sh2.qd_idx != sh->qd_idx) { 1975 + printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n", 1976 + mdname(conf->mddev)); 1977 return 0; 1978 } 1979 return r_sector; ··· 3709 3710 bio_put(bi); 3711 3712 rdev = (void*)raid_bi->bi_next; 3713 raid_bi->bi_next = NULL; 3714 + mddev = rdev->mddev; 3715 + conf = mddev->private; 3716 3717 rdev_dec_pending(rdev, conf->mddev); 3718 ··· 3749 } 3750 3751 3752 + static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio) 3753 { 3754 raid5_conf_t *conf = mddev->private; 3755 int dd_idx; 3756 struct bio* align_bi; ··· 3866 return sh; 3867 } 3868 3869 + static int make_request(mddev_t *mddev, struct bio * bi) 3870 { 3871 raid5_conf_t *conf = mddev->private; 3872 int dd_idx; 3873 sector_t new_sector; 3874 sector_t logical_sector, last_sector; 3875 struct stripe_head *sh; 3876 const int rw = bio_data_dir(bi); 3877 + int remaining; 3878 3879 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { 3880 /* Drain all pending writes. 
We only really need ··· 3890 3891 md_write_start(mddev, bi); 3892 3893 if (rw == READ && 3894 mddev->reshape_position == MaxSector && 3895 + chunk_aligned_read(mddev,bi)) 3896 return 0; 3897 3898 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); ··· 3946 new_sector = raid5_compute_sector(conf, logical_sector, 3947 previous, 3948 &dd_idx, NULL); 3949 + pr_debug("raid456: make_request, sector %llu logical %llu\n", 3950 (unsigned long long)new_sector, 3951 (unsigned long long)logical_sector); 3952 ··· 4054 * As the reads complete, handle_stripe will copy the data 4055 * into the destination stripe and release that stripe. 4056 */ 4057 + raid5_conf_t *conf = mddev->private; 4058 struct stripe_head *sh; 4059 sector_t first_sector, last_sector; 4060 int raid_disks = conf->previous_raid_disks; ··· 4263 /* FIXME go_faster isn't used */ 4264 static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 4265 { 4266 + raid5_conf_t *conf = mddev->private; 4267 struct stripe_head *sh; 4268 sector_t max_sector = mddev->dev_sectors; 4269 int sync_blocks; ··· 4725 if (mddev->new_level != 5 4726 && mddev->new_level != 4 4727 && mddev->new_level != 6) { 4728 + printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n", 4729 mdname(mddev), mddev->new_level); 4730 return ERR_PTR(-EIO); 4731 } ··· 4733 && !algorithm_valid_raid5(mddev->new_layout)) || 4734 (mddev->new_level == 6 4735 && !algorithm_valid_raid6(mddev->new_layout))) { 4736 + printk(KERN_ERR "md/raid:%s: layout %d not supported\n", 4737 mdname(mddev), mddev->new_layout); 4738 return ERR_PTR(-EIO); 4739 } 4740 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 4741 + printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n", 4742 mdname(mddev), mddev->raid_disks); 4743 return ERR_PTR(-EINVAL); 4744 } ··· 4746 if (!mddev->new_chunk_sectors || 4747 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4748 !is_power_of_2(mddev->new_chunk_sectors)) { 4749 + printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n", 4750 + mdname(mddev), mddev->new_chunk_sectors << 9); 4751 return ERR_PTR(-EINVAL); 4752 } 4753 ··· 4789 if (raid5_alloc_percpu(conf) != 0) 4790 goto abort; 4791 4792 + pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4793 4794 list_for_each_entry(rdev, &mddev->disks, same_set) { 4795 raid_disk = rdev->raid_disk; ··· 4802 4803 if (test_bit(In_sync, &rdev->flags)) { 4804 char b[BDEVNAME_SIZE]; 4805 + printk(KERN_INFO "md/raid:%s: device %s operational as raid" 4806 + " disk %d\n", 4807 + mdname(mddev), bdevname(rdev->bdev, b), raid_disk); 4808 } else 4809 /* Cannot rely on bitmap to complete recovery */ 4810 conf->fullsync = 1; ··· 4828 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4829 if (grow_stripes(conf, conf->max_nr_stripes)) { 4830 printk(KERN_ERR 4831 + "md/raid:%s: couldn't allocate %dkB for buffers\n", 4832 + mdname(mddev), memory); 4833 goto abort; 4834 } else 4835 + printk(KERN_INFO "md/raid:%s: allocated %dkB\n", 4836 + mdname(mddev), memory); 4837 4838 conf->thread = md_register_thread(raid5d, mddev, NULL); 4839 if (!conf->thread) { 4840 printk(KERN_ERR 4841 + "md/raid:%s: couldn't allocate thread.\n", 4842 mdname(mddev)); 4843 goto abort; 4844 } ··· 4888 sector_t reshape_offset = 0; 4889 4890 if (mddev->recovery_cp != MaxSector) 4891 + printk(KERN_NOTICE "md/raid:%s: not clean" 4892 " -- starting background reconstruction\n", 4893 mdname(mddev)); 4894 if (mddev->reshape_position != MaxSector) { ··· 4902 int max_degraded = (mddev->level == 6 ? 
2 : 1); 4903 4904 if (mddev->new_level != mddev->level) { 4905 + printk(KERN_ERR "md/raid:%s: unsupported reshape " 4906 "required - aborting.\n", 4907 mdname(mddev)); 4908 return -EINVAL; ··· 4915 here_new = mddev->reshape_position; 4916 if (sector_div(here_new, mddev->new_chunk_sectors * 4917 (mddev->raid_disks - max_degraded))) { 4918 + printk(KERN_ERR "md/raid:%s: reshape_position not " 4919 + "on a stripe boundary\n", mdname(mddev)); 4920 return -EINVAL; 4921 } 4922 reshape_offset = here_new * mddev->new_chunk_sectors; ··· 4937 if ((here_new * mddev->new_chunk_sectors != 4938 here_old * mddev->chunk_sectors) || 4939 mddev->ro == 0) { 4940 + printk(KERN_ERR "md/raid:%s: in-place reshape must be started" 4941 + " in read-only mode - aborting\n", 4942 + mdname(mddev)); 4943 return -EINVAL; 4944 } 4945 } else if (mddev->delta_disks < 0 ··· 4947 : (here_new * mddev->new_chunk_sectors >= 4948 here_old * mddev->chunk_sectors)) { 4949 /* Reading from the same stripe as writing to - bad */ 4950 + printk(KERN_ERR "md/raid:%s: reshape_position too early for " 4951 + "auto-recovery - aborting.\n", 4952 + mdname(mddev)); 4953 return -EINVAL; 4954 } 4955 + printk(KERN_INFO "md/raid:%s: reshape will continue\n", 4956 + mdname(mddev)); 4957 /* OK, we should be able to continue; */ 4958 } else { 4959 BUG_ON(mddev->level != mddev->new_level); ··· 4993 mddev->minor_version > 90) 4994 rdev->recovery_offset = reshape_offset; 4995 4996 if (rdev->recovery_offset < reshape_offset) { 4997 /* We need to check old and new layout */ 4998 if (!only_parity(rdev->raid_disk, ··· 5025 - working_disks); 5026 5027 if (mddev->degraded > conf->max_degraded) { 5028 + printk(KERN_ERR "md/raid:%s: not enough operational devices" 5029 " (%d/%d failed)\n", 5030 mdname(mddev), mddev->degraded, conf->raid_disks); 5031 goto abort; ··· 5039 mddev->recovery_cp != MaxSector) { 5040 if (mddev->ok_start_degraded) 5041 printk(KERN_WARNING 5042 + "md/raid:%s: starting dirty degraded array" 5043 + " - data corruption possible.\n", 5044 mdname(mddev)); 5045 else { 5046 printk(KERN_ERR 5047 + "md/raid:%s: cannot start dirty degraded array.\n", 5048 mdname(mddev)); 5049 goto abort; 5050 } 5051 } 5052 5053 if (mddev->degraded == 0) 5054 + printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d" 5055 + " devices, algorithm %d\n", mdname(mddev), conf->level, 5056 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 5057 mddev->new_layout); 5058 else 5059 + printk(KERN_ALERT "md/raid:%s: raid level %d active with %d" 5060 + " out of %d devices, algorithm %d\n", 5061 + mdname(mddev), conf->level, 5062 + mddev->raid_disks - mddev->degraded, 5063 + mddev->raid_disks, mddev->new_layout); 5064 5065 print_raid5_conf(conf); 5066 5067 if (conf->reshape_progress != MaxSector) { 5068 conf->reshape_safe = conf->reshape_progress; 5069 atomic_set(&conf->reshape_stripes, 0); 5070 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); ··· 5087 } 5088 5089 /* Ok, everything is just fine now */ 5090 + if (mddev->to_remove == &raid5_attrs_group) 5091 + mddev->to_remove = NULL; 5092 + else if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 5093 printk(KERN_WARNING 5094 + "md/raid:%s: failed to create sysfs attributes.\n", 5095 mdname(mddev)); 5096 5097 mddev->queue->queue_lock = &conf->device_lock; ··· 5119 free_conf(conf); 5120 } 5121 mddev->private = NULL; 5122 + printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev)); 5123 return -EIO; 5124 } 5125 5126 static int stop(mddev_t *mddev) 5127 { 5128 + raid5_conf_t *conf = 
mddev->private; 5129 5130 md_unregister_thread(mddev->thread); 5131 mddev->thread = NULL; 5132 mddev->queue->backing_dev_info.congested_fn = NULL; 5133 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5134 free_conf(conf); 5135 + mddev->private = NULL; 5136 + mddev->to_remove = &raid5_attrs_group; 5137 return 0; 5138 } 5139 ··· 5175 5176 static void status(struct seq_file *seq, mddev_t *mddev) 5177 { 5178 + raid5_conf_t *conf = mddev->private; 5179 int i; 5180 5181 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, ··· 5197 int i; 5198 struct disk_info *tmp; 5199 5200 + printk(KERN_DEBUG "RAID conf printout:\n"); 5201 if (!conf) { 5202 printk("(conf==NULL)\n"); 5203 return; 5204 } 5205 + printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level, 5206 + conf->raid_disks, 5207 + conf->raid_disks - conf->mddev->degraded); 5208 5209 for (i = 0; i < conf->raid_disks; i++) { 5210 char b[BDEVNAME_SIZE]; 5211 tmp = conf->disks + i; 5212 if (tmp->rdev) 5213 + printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n", 5214 + i, !test_bit(Faulty, &tmp->rdev->flags), 5215 + bdevname(tmp->rdev->bdev, b)); 5216 } 5217 } 5218 ··· 5334 raid5_size(mddev, sectors, mddev->raid_disks)) 5335 return -EINVAL; 5336 set_capacity(mddev->gendisk, mddev->array_sectors); 5337 revalidate_disk(mddev->gendisk); 5338 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { 5339 mddev->recovery_cp = mddev->dev_sectors; ··· 5360 > conf->max_nr_stripes || 5361 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5362 > conf->max_nr_stripes) { 5363 + printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n", 5364 + mdname(mddev), 5365 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5366 / STRIPE_SIZE)*4); 5367 return 0; ··· 5431 */ 5432 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5433 < mddev->array_sectors) { 5434 + printk(KERN_ERR "md/raid:%s: array size must be reduced " 5435 "before number of disks\n", mdname(mddev)); 5436 return -EINVAL; 5437 } ··· 5469 if (sysfs_create_link(&mddev->kobj, 5470 &rdev->kobj, nm)) 5471 printk(KERN_WARNING 5472 + "md/raid:%s: failed to create " 5473 + " link %s\n", 5474 + mdname(mddev), nm); 5475 } else 5476 break; 5477 } ··· 5548 if (mddev->delta_disks > 0) { 5549 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5550 set_capacity(mddev->gendisk, mddev->array_sectors); 5551 revalidate_disk(mddev->gendisk); 5552 } else { 5553 int d; ··· 5610 spin_unlock_irq(&conf->device_lock); 5611 break; 5612 } 5613 + } 5614 + 5615 + 5616 + static void *raid45_takeover_raid0(mddev_t *mddev, int level) 5617 + { 5618 + struct raid0_private_data *raid0_priv = mddev->private; 5619 + 5620 + /* for raid0 takeover only one zone is supported */ 5621 + if (raid0_priv->nr_strip_zones > 1) { 5622 + printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n", 5623 + mdname(mddev)); 5624 + return ERR_PTR(-EINVAL); 5625 + } 5626 + 5627 + mddev->new_level = level; 5628 + mddev->new_layout = ALGORITHM_PARITY_N; 5629 + mddev->new_chunk_sectors = mddev->chunk_sectors; 5630 + mddev->raid_disks += 1; 5631 + mddev->delta_disks = 1; 5632 + /* make sure it will be not marked as dirty */ 5633 + mddev->recovery_cp = MaxSector; 5634 + 5635 + return setup_conf(mddev); 5636 } 5637 5638 ··· 5737 static void *raid5_takeover(mddev_t *mddev) 5738 { 5739 /* raid5 can take over: 5740 + * raid0 - if there is only one strip zone - make it a raid4 layout 5741 * raid1 - if there are two drives. 
We need to know the chunk size
5742 *  raid4 - trivial - just use a raid4 layout.
5743 *  raid6 - Providing it is a *_6 layout
5744 */
5745 + if (mddev->level == 0)
5746 +         return raid45_takeover_raid0(mddev, 5);
5747 if (mddev->level == 1)
5748         return raid5_takeover_raid1(mddev);
5749 if (mddev->level == 4) {
···
5756 return ERR_PTR(-EINVAL);
5757 }
5758
5759 + static void *raid4_takeover(mddev_t *mddev)
5760 + {
5761 +         /* raid4 can take over:
5762 +          *  raid0 - if there is only one strip zone
5763 +          *  raid5 - if layout is right
5764 +          */
5765 +         if (mddev->level == 0)
5766 +                 return raid45_takeover_raid0(mddev, 4);
5767 +         if (mddev->level == 5 &&
5768 +             mddev->layout == ALGORITHM_PARITY_N) {
5769 +                 mddev->new_layout = 0;
5770 +                 mddev->new_level = 4;
5771 +                 return setup_conf(mddev);
5772 +         }
5773 +         return ERR_PTR(-EINVAL);
5774 + }
5775
5776 static struct mdk_personality raid5_personality;
5777
···
5871 .start_reshape  = raid5_start_reshape,
5872 .finish_reshape = raid5_finish_reshape,
5873 .quiesce        = raid5_quiesce,
5874 + .takeover       = raid4_takeover,
5875 };
5876
5877 static int __init raid5_init(void)
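Editor's note: the other substantive addition is the takeover path above. A RAID-0 array can now be converted in place to RAID-4 or RAID-5, but only when it has a single strip zone (all member devices contribute the same amount of space); the conversion simply declares one extra, not-yet-present parity disk and selects the ALGORITHM_PARITY_N layout so existing data blocks stay where they are. The self-contained sketch below restates that decision logic; the fake_* types, takeover_raid0(), and the main() driver are stand-ins invented for illustration, and only the branching is meant to match the diff.

/*
 * User-space sketch of the raid45_takeover_raid0() rule; fake_* types are
 * stand-ins for the kernel's mddev_t and raid0 private data.
 */
#include <errno.h>
#include <stdio.h>

#define ALGORITHM_PARITY_N 5    /* stand-in for the raid5.h constant:
                                 * parity held entirely on the last device */

struct fake_raid0 { int nr_strip_zones; };
struct fake_mddev {
        int level, new_level, new_layout;
        int raid_disks, delta_disks;
        struct fake_raid0 *raid0_priv;
};

static int takeover_raid0(struct fake_mddev *mddev, int new_level)
{
        /* Only a single-zone raid0 (all members the same size) maps cleanly
         * onto a degraded raid4/raid5 with one extra "missing" parity disk. */
        if (mddev->raid0_priv->nr_strip_zones > 1)
                return -EINVAL;

        mddev->new_level  = new_level;           /* 4 or 5 */
        mddev->new_layout = ALGORITHM_PARITY_N;  /* data layout unchanged */
        mddev->raid_disks += 1;                  /* the missing parity disk */
        mddev->delta_disks = 1;
        return 0;
}

int main(void)
{
        struct fake_raid0 one_zone = { .nr_strip_zones = 1 };
        struct fake_mddev md = { .level = 0, .raid_disks = 3,
                                 .raid0_priv = &one_zone };

        printf("takeover -> %d (0 = ok)\n", takeover_raid0(&md, 5));
        printf("raid_disks now %d, layout %d\n", md.raid_disks, md.new_layout);
        return 0;
}

The real raid45_takeover_raid0() additionally carries the chunk size over unchanged and sets recovery_cp to MaxSector so the converted array is not treated as dirty, as shown in the hunk above; raid4_takeover() reuses the same helper and also accepts a RAID-5 array whose layout is already ALGORITHM_PARITY_N.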