Merge branch 'for-linus' of git://neil.brown.name/md

* 'for-linus' of git://neil.brown.name/md:
  md: allow upper limit for resync/reshape to be set when array is read-only
  md/raid5: Properly remove excess drives after shrinking a raid5/6
  md/raid5: make sure a reshape restarts at the correct address.
  md/raid5: allow new reshape modes to be restarted in the middle.
  md: never advance 'events' counter by more than 1.
  Remove deadlock potential in md_open

3 files changed, 56 insertions(+), 20 deletions(-)
drivers/md/md.c | +16 -16
@@ -359,6 +359,7 @@
 	else
 		new->md_minor = MINOR(unit) >> MdpMinorShift;
 
+	mutex_init(&new->open_mutex);
 	mutex_init(&new->reconfig_mutex);
 	INIT_LIST_HEAD(&new->disks);
 	INIT_LIST_HEAD(&new->all_mddevs);
@@ -1975,16 +1974,13 @@
 		/* otherwise we have to go forward and ... */
 		mddev->events ++;
 		if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
-			/* .. if the array isn't clean, insist on an odd 'events' */
-			if ((mddev->events&1)==0) {
-				mddev->events++;
+			/* .. if the array isn't clean, an 'even' event must also go
+			 * to spares. */
+			if ((mddev->events&1)==0)
 				nospares = 0;
-			}
 		} else {
-			/* otherwise insist on an even 'events' (for clean states) */
-			if ((mddev->events&1)) {
-				mddev->events++;
+			/* otherwise an 'odd' event must go to spares */
+			if ((mddev->events&1))
 				nospares = 0;
-			}
 		}
 	}
@@ -3599,6 +3601,7 @@
 	if (max < mddev->resync_min)
 		return -EINVAL;
 	if (max < mddev->resync_max &&
+	    mddev->ro == 0 &&
 	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		return -EBUSY;
 
@@ -4303,12 +4304,11 @@
 	struct gendisk *disk = mddev->gendisk;
 	mdk_rdev_t *rdev;
 
+	mutex_lock(&mddev->open_mutex);
 	if (atomic_read(&mddev->openers) > is_open) {
 		printk("md: %s still in use.\n",mdname(mddev));
-		return -EBUSY;
-	}
-
-	if (mddev->pers) {
+		err = -EBUSY;
+	} else if (mddev->pers) {
 
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -4365,7 +4367,10 @@
 			set_disk_ro(disk, 1);
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	}
-
+out:
+	mutex_unlock(&mddev->open_mutex);
+	if (err)
+		return err;
 	/*
 	 * Free resources if final stop
 	 */
@@ -4434,7 +4433,6 @@
 		blk_integrity_unregister(disk);
 	md_new_event(mddev);
 	sysfs_notify_dirent(mddev->sysfs_state);
-out:
 	return err;
 }
 
@@ -5518,12 +5518,12 @@
 	}
 	BUG_ON(mddev != bdev->bd_disk->private_data);
 
-	if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
+	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
 		goto out;
 
 	err = 0;
 	atomic_inc(&mddev->openers);
-	mddev_unlock(mddev);
+	mutex_unlock(&mddev->open_mutex);
 
 	check_disk_change(bdev);
  out:
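The 'events' hunk is the subtle one. md accepts a device at assembly time if its event count is at most 1 behind the rest of the array, and the old code forced the even/odd convention (even for clean, odd for dirty) with a second increment, so a single superblock update could advance the counter by 2 and needlessly exclude a device that had missed only one update. A minimal user-space sketch (plain C, not kernel code; events_old and events_new are made-up names for illustration) contrasting the two behaviours:

#include <stdio.h>

/* Old behaviour: force parity by incrementing a second time. */
static unsigned long long events_old(unsigned long long events, int clean)
{
	events++;
	if (!clean && (events & 1) == 0)
		events++;	/* dirty arrays insisted on an odd count */
	else if (clean && (events & 1))
		events++;	/* clean arrays insisted on an even count */
	return events;		/* may have advanced by 2 */
}

/* New behaviour: never advance by more than 1; when the parity is
 * "wrong", clear nospares so the update is written to spares too. */
static unsigned long long events_new(unsigned long long events, int clean,
				     int *nospares)
{
	events++;
	if ((!clean && (events & 1) == 0) || (clean && (events & 1)))
		*nospares = 0;
	return events;
}

int main(void)
{
	int nospares = 1;

	printf("old: 41 -> %llu\n", events_old(41, 0));		/* 43 */
	printf("new: 41 -> %llu, nospares=%d\n",
	       events_new(41, 0, &nospares), nospares);		/* 42, 0 */
	return 0;
}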
drivers/md/md.h | +10
@@ -223,6 +223,16 @@
 					    * so we don't loop trying */
 
 	int				in_sync;	/* know to not need resync */
+	/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
+	 * that we are never stopping an array while it is open.
+	 * 'reconfig_mutex' protects all other reconfiguration.
+	 * These locks are separate due to conflicting interactions
+	 * with bdev->bd_mutex.
+	 * Lock ordering is:
+	 *  reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
+	 *  bd_mutex -> open_mutex:  e.g. __blkdev_get -> md_open
+	 */
+	struct mutex			open_mutex;
 	struct mutex			reconfig_mutex;
 	atomic_t			active;		/* general refcount */
 	atomic_t			openers;	/* number of active opens */
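The ordering comment is the heart of the deadlock fix: md_open runs with bdev->bd_mutex already held (from __blkdev_get), so it must not take reconfig_mutex, which is elsewhere held around revalidate_disk and hence around bd_mutex. With the narrower open_mutex the lock graph stays acyclic. A user-space pthreads sketch of the two paths (an illustration only, not kernel code; build with cc -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t reconfig_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t bd_mutex       = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t open_mutex     = PTHREAD_MUTEX_INITIALIZER;

/* do_md_run path: reconfig_mutex -> bd_mutex (via revalidate_disk). */
static void *run_path(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100000; i++) {
		pthread_mutex_lock(&reconfig_mutex);
		pthread_mutex_lock(&bd_mutex);
		pthread_mutex_unlock(&bd_mutex);
		pthread_mutex_unlock(&reconfig_mutex);
	}
	return NULL;
}

/* __blkdev_get -> md_open path: bd_mutex -> open_mutex.  Taking
 * reconfig_mutex here instead would invert the order above (ABBA). */
static void *open_path(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100000; i++) {
		pthread_mutex_lock(&bd_mutex);
		pthread_mutex_lock(&open_mutex);
		pthread_mutex_unlock(&open_mutex);
		pthread_mutex_unlock(&bd_mutex);
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, run_path, NULL);
	pthread_create(&b, NULL, open_path, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	puts("both paths completed: lock order is acyclic");
	return 0;
}

Swapping open_mutex for reconfig_mutex in open_path reintroduces the ABBA pattern the old md_open had, and the two threads can hang.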
drivers/md/raid5.c | +30 -4
@@ -3785,7 +3785,7 @@
 		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
 			sector_nr = raid5_size(mddev, 0, 0)
 				- conf->reshape_progress;
-		} else if (mddev->delta_disks > 0 &&
+		} else if (mddev->delta_disks >= 0 &&
 			   conf->reshape_progress > 0)
 			sector_nr = conf->reshape_progress;
 		sector_div(sector_nr, new_data_disks);
@@ -4509,7 +4509,26 @@
 			   (old_disks-max_degraded));
 		/* here_old is the first stripe that we might need to read
 		 * from */
-		if (here_new >= here_old) {
+		if (mddev->delta_disks == 0) {
+			/* We cannot be sure it is safe to start an in-place
+			 * reshape.  It is only safe if user-space is monitoring
+			 * and taking constant backups.
+			 * mdadm always starts a situation like this in
+			 * readonly mode so it can take control before
+			 * allowing any writes.  So just check for that.
+			 */
+			if ((here_new * mddev->new_chunk_sectors !=
+			     here_old * mddev->chunk_sectors) ||
+			    mddev->ro == 0) {
+				printk(KERN_ERR "raid5: in-place reshape must be started"
+				       " in read-only mode - aborting\n");
+				return -EINVAL;
+			}
+		} else if (mddev->delta_disks < 0
+			   ? (here_new * mddev->new_chunk_sectors <=
+			      here_old * mddev->chunk_sectors)
+			   : (here_new * mddev->new_chunk_sectors >=
+			      here_old * mddev->chunk_sectors)) {
 			/* Reading from the same stripe as writing to - bad */
 			printk(KERN_ERR "raid5: reshape_position too early for "
 			       "auto-recovery - aborting.\n");
@@ -5097,8 +5078,15 @@
 		mddev->degraded--;
 		for (d = conf->raid_disks ;
 		     d < conf->raid_disks - mddev->delta_disks;
-		     d++)
-			raid5_remove_disk(mddev, d);
+		     d++) {
+			mdk_rdev_t *rdev = conf->disks[d].rdev;
+			if (rdev && raid5_remove_disk(mddev, d) == 0) {
+				char nm[20];
+				sprintf(nm, "rd%d", rdev->raid_disk);
+				sysfs_remove_link(&mddev->kobj, nm);
+				rdev->raid_disk = -1;
+			}
+		}
 	}
 	mddev->layout = conf->algorithm;
 	mddev->chunk_sectors = conf->chunk_sectors;
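The restart check in the second hunk puts both recorded positions on the same scale by multiplying stripe numbers by their chunk size in sectors, then requires the new-geometry position to stay strictly on the safe side of the old one in the direction the reshape moves; an in-place reshape (delta_disks == 0) is only allowed when the two positions coincide and the array is read-only, so mdadm can take charge of backups before any writes. A standalone sketch of that condition (plain C; reshape_restart_ok is a hypothetical helper mirroring the hunk, not a kernel function):

#include <stdio.h>

/* Returns 0 if restarting the reshape at this position is safe,
 * -1 otherwise.  'ro' mirrors mddev->ro. */
static int reshape_restart_ok(long long here_new, int new_chunk_sectors,
			      long long here_old, int chunk_sectors,
			      int delta_disks, int ro)
{
	if (delta_disks == 0) {
		/* In-place reshape: only safe if the position is exactly
		 * aligned and the array is read-only. */
		if (here_new * new_chunk_sectors !=
		    here_old * chunk_sectors || ro == 0)
			return -1;
		return 0;
	}
	if (delta_disks < 0
	    ? (here_new * new_chunk_sectors <= here_old * chunk_sectors)
	    : (here_new * new_chunk_sectors >= here_old * chunk_sectors))
		return -1;	/* reading from the same stripe as writing */
	return 0;
}

int main(void)
{
	/* Growing: the new position must be strictly behind the old one. */
	printf("grow, behind : %d\n", reshape_restart_ok(10, 128, 20, 128, 1, 0)); /*  0 */
	printf("grow, level  : %d\n", reshape_restart_ok(20, 128, 20, 128, 1, 0)); /* -1 */
	/* In-place chunk-size change must start read-only. */
	printf("inplace, rw  : %d\n", reshape_restart_ok(16, 64, 8, 128, 0, 0));   /* -1 */
	printf("inplace, ro  : %d\n", reshape_restart_ok(16, 64, 8, 128, 0, 1));   /*  0 */
	return 0;
}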