/* Source snapshot taken at v2.6.13-rc2. */
1/* 2 * multipath.c : Multiple Devices driver for Linux 3 * 4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat 5 * 6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman 7 * 8 * MULTIPATH management functions. 9 * 10 * derived from raid1.c. 11 * 12 * This program is free software; you can redistribute it and/or modify 13 * it under the terms of the GNU General Public License as published by 14 * the Free Software Foundation; either version 2, or (at your option) 15 * any later version. 16 * 17 * You should have received a copy of the GNU General Public License 18 * (for example /usr/src/linux/COPYING); if not, write to the Free 19 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 20 */ 21 22#include <linux/module.h> 23#include <linux/slab.h> 24#include <linux/spinlock.h> 25#include <linux/raid/multipath.h> 26#include <linux/buffer_head.h> 27#include <asm/atomic.h> 28 29#define MAJOR_NR MD_MAJOR 30#define MD_DRIVER 31#define MD_PERSONALITY 32 33#define MAX_WORK_PER_DISK 128 34 35#define NR_RESERVED_BUFS 32 36 37 38static mdk_personality_t multipath_personality; 39 40 41static void *mp_pool_alloc(unsigned int __nocast gfp_flags, void *data) 42{ 43 struct multipath_bh *mpb; 44 mpb = kmalloc(sizeof(*mpb), gfp_flags); 45 if (mpb) 46 memset(mpb, 0, sizeof(*mpb)); 47 return mpb; 48} 49 50static void mp_pool_free(void *mpb, void *data) 51{ 52 kfree(mpb); 53} 54 55static int multipath_map (multipath_conf_t *conf) 56{ 57 int i, disks = conf->raid_disks; 58 59 /* 60 * Later we do read balancing on the read side 61 * now we use the first available disk. 
62 */ 63 64 rcu_read_lock(); 65 for (i = 0; i < disks; i++) { 66 mdk_rdev_t *rdev = conf->multipaths[i].rdev; 67 if (rdev && rdev->in_sync) { 68 atomic_inc(&rdev->nr_pending); 69 rcu_read_unlock(); 70 return i; 71 } 72 } 73 rcu_read_unlock(); 74 75 printk(KERN_ERR "multipath_map(): no more operational IO paths?\n"); 76 return (-1); 77} 78 79static void multipath_reschedule_retry (struct multipath_bh *mp_bh) 80{ 81 unsigned long flags; 82 mddev_t *mddev = mp_bh->mddev; 83 multipath_conf_t *conf = mddev_to_conf(mddev); 84 85 spin_lock_irqsave(&conf->device_lock, flags); 86 list_add(&mp_bh->retry_list, &conf->retry_list); 87 spin_unlock_irqrestore(&conf->device_lock, flags); 88 md_wakeup_thread(mddev->thread); 89} 90 91 92/* 93 * multipath_end_bh_io() is called when we have finished servicing a multipathed 94 * operation and are ready to return a success/failure code to the buffer 95 * cache layer. 96 */ 97static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) 98{ 99 struct bio *bio = mp_bh->master_bio; 100 multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); 101 102 bio_endio(bio, bio->bi_size, err); 103 mempool_free(mp_bh, conf->pool); 104} 105 106static int multipath_end_request(struct bio *bio, unsigned int bytes_done, 107 int error) 108{ 109 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 110 struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); 111 multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); 112 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; 113 114 if (bio->bi_size) 115 return 1; 116 117 if (uptodate) 118 multipath_end_bh_io(mp_bh, 0); 119 else if (!bio_rw_ahead(bio)) { 120 /* 121 * oops, IO error: 122 */ 123 char b[BDEVNAME_SIZE]; 124 md_error (mp_bh->mddev, rdev); 125 printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", 126 bdevname(rdev->bdev,b), 127 (unsigned long long)bio->bi_sector); 128 multipath_reschedule_retry(mp_bh); 129 } else 130 multipath_end_bh_io(mp_bh, error); 131 
rdev_dec_pending(rdev, conf->mddev); 132 return 0; 133} 134 135static void unplug_slaves(mddev_t *mddev) 136{ 137 multipath_conf_t *conf = mddev_to_conf(mddev); 138 int i; 139 140 rcu_read_lock(); 141 for (i=0; i<mddev->raid_disks; i++) { 142 mdk_rdev_t *rdev = conf->multipaths[i].rdev; 143 if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { 144 request_queue_t *r_queue = bdev_get_queue(rdev->bdev); 145 146 atomic_inc(&rdev->nr_pending); 147 rcu_read_unlock(); 148 149 if (r_queue->unplug_fn) 150 r_queue->unplug_fn(r_queue); 151 152 rdev_dec_pending(rdev, mddev); 153 rcu_read_lock(); 154 } 155 } 156 rcu_read_unlock(); 157} 158 159static void multipath_unplug(request_queue_t *q) 160{ 161 unplug_slaves(q->queuedata); 162} 163 164 165static int multipath_make_request (request_queue_t *q, struct bio * bio) 166{ 167 mddev_t *mddev = q->queuedata; 168 multipath_conf_t *conf = mddev_to_conf(mddev); 169 struct multipath_bh * mp_bh; 170 struct multipath_info *multipath; 171 172 mp_bh = mempool_alloc(conf->pool, GFP_NOIO); 173 174 mp_bh->master_bio = bio; 175 mp_bh->mddev = mddev; 176 177 if (bio_data_dir(bio)==WRITE) { 178 disk_stat_inc(mddev->gendisk, writes); 179 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); 180 } else { 181 disk_stat_inc(mddev->gendisk, reads); 182 disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); 183 } 184 185 mp_bh->path = multipath_map(conf); 186 if (mp_bh->path < 0) { 187 bio_endio(bio, bio->bi_size, -EIO); 188 mempool_free(mp_bh, conf->pool); 189 return 0; 190 } 191 multipath = conf->multipaths + mp_bh->path; 192 193 mp_bh->bio = *bio; 194 mp_bh->bio.bi_sector += multipath->rdev->data_offset; 195 mp_bh->bio.bi_bdev = multipath->rdev->bdev; 196 mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST); 197 mp_bh->bio.bi_end_io = multipath_end_request; 198 mp_bh->bio.bi_private = mp_bh; 199 generic_make_request(&mp_bh->bio); 200 return 0; 201} 202 203static void multipath_status (struct seq_file *seq, mddev_t *mddev) 204{ 205 
multipath_conf_t *conf = mddev_to_conf(mddev); 206 int i; 207 208 seq_printf (seq, " [%d/%d] [", conf->raid_disks, 209 conf->working_disks); 210 for (i = 0; i < conf->raid_disks; i++) 211 seq_printf (seq, "%s", 212 conf->multipaths[i].rdev && 213 conf->multipaths[i].rdev->in_sync ? "U" : "_"); 214 seq_printf (seq, "]"); 215} 216 217static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, 218 sector_t *error_sector) 219{ 220 mddev_t *mddev = q->queuedata; 221 multipath_conf_t *conf = mddev_to_conf(mddev); 222 int i, ret = 0; 223 224 rcu_read_lock(); 225 for (i=0; i<mddev->raid_disks && ret == 0; i++) { 226 mdk_rdev_t *rdev = conf->multipaths[i].rdev; 227 if (rdev && !rdev->faulty) { 228 struct block_device *bdev = rdev->bdev; 229 request_queue_t *r_queue = bdev_get_queue(bdev); 230 231 if (!r_queue->issue_flush_fn) 232 ret = -EOPNOTSUPP; 233 else { 234 atomic_inc(&rdev->nr_pending); 235 rcu_read_unlock(); 236 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, 237 error_sector); 238 rdev_dec_pending(rdev, mddev); 239 rcu_read_lock(); 240 } 241 } 242 } 243 rcu_read_unlock(); 244 return ret; 245} 246 247/* 248 * Careful, this can execute in IRQ contexts as well! 249 */ 250static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) 251{ 252 multipath_conf_t *conf = mddev_to_conf(mddev); 253 254 if (conf->working_disks <= 1) { 255 /* 256 * Uh oh, we can do nothing if this is our last path, but 257 * first check if this is a queued request for a device 258 * which has just failed. 259 */ 260 printk(KERN_ALERT 261 "multipath: only one IO path left and IO error.\n"); 262 /* leave it active... it's all we have */ 263 } else { 264 /* 265 * Mark disk as unusable 266 */ 267 if (!rdev->faulty) { 268 char b[BDEVNAME_SIZE]; 269 rdev->in_sync = 0; 270 rdev->faulty = 1; 271 mddev->sb_dirty = 1; 272 conf->working_disks--; 273 printk(KERN_ALERT "multipath: IO failure on %s," 274 " disabling IO path. 
\n Operation continuing" 275 " on %d IO paths.\n", 276 bdevname (rdev->bdev,b), 277 conf->working_disks); 278 } 279 } 280} 281 282static void print_multipath_conf (multipath_conf_t *conf) 283{ 284 int i; 285 struct multipath_info *tmp; 286 287 printk("MULTIPATH conf printout:\n"); 288 if (!conf) { 289 printk("(conf==NULL)\n"); 290 return; 291 } 292 printk(" --- wd:%d rd:%d\n", conf->working_disks, 293 conf->raid_disks); 294 295 for (i = 0; i < conf->raid_disks; i++) { 296 char b[BDEVNAME_SIZE]; 297 tmp = conf->multipaths + i; 298 if (tmp->rdev) 299 printk(" disk%d, o:%d, dev:%s\n", 300 i,!tmp->rdev->faulty, 301 bdevname(tmp->rdev->bdev,b)); 302 } 303} 304 305 306static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 307{ 308 multipath_conf_t *conf = mddev->private; 309 int found = 0; 310 int path; 311 struct multipath_info *p; 312 313 print_multipath_conf(conf); 314 315 for (path=0; path<mddev->raid_disks; path++) 316 if ((p=conf->multipaths+path)->rdev == NULL) { 317 blk_queue_stack_limits(mddev->queue, 318 rdev->bdev->bd_disk->queue); 319 320 /* as we don't honour merge_bvec_fn, we must never risk 321 * violating it, so limit ->max_sector to one PAGE, as 322 * a one page request is never in violation. 323 * (Note: it is very unlikely that a device with 324 * merge_bvec_fn will be involved in multipath.) 
325 */ 326 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 327 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 328 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 329 330 conf->working_disks++; 331 rdev->raid_disk = path; 332 rdev->in_sync = 1; 333 p->rdev = rdev; 334 found = 1; 335 } 336 337 print_multipath_conf(conf); 338 return found; 339} 340 341static int multipath_remove_disk(mddev_t *mddev, int number) 342{ 343 multipath_conf_t *conf = mddev->private; 344 int err = 0; 345 mdk_rdev_t *rdev; 346 struct multipath_info *p = conf->multipaths + number; 347 348 print_multipath_conf(conf); 349 350 rdev = p->rdev; 351 if (rdev) { 352 if (rdev->in_sync || 353 atomic_read(&rdev->nr_pending)) { 354 printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number); 355 err = -EBUSY; 356 goto abort; 357 } 358 p->rdev = NULL; 359 synchronize_rcu(); 360 if (atomic_read(&rdev->nr_pending)) { 361 /* lost the race, try later */ 362 err = -EBUSY; 363 p->rdev = rdev; 364 } 365 } 366abort: 367 368 print_multipath_conf(conf); 369 return err; 370} 371 372 373 374/* 375 * This is a kernel thread which: 376 * 377 * 1. Retries failed read operations on working multipaths. 378 * 2. Updates the raid superblock when problems encounter. 379 * 3. Performs writes following reads for array syncronising. 
380 */ 381 382static void multipathd (mddev_t *mddev) 383{ 384 struct multipath_bh *mp_bh; 385 struct bio *bio; 386 unsigned long flags; 387 multipath_conf_t *conf = mddev_to_conf(mddev); 388 struct list_head *head = &conf->retry_list; 389 390 md_check_recovery(mddev); 391 for (;;) { 392 char b[BDEVNAME_SIZE]; 393 spin_lock_irqsave(&conf->device_lock, flags); 394 if (list_empty(head)) 395 break; 396 mp_bh = list_entry(head->prev, struct multipath_bh, retry_list); 397 list_del(head->prev); 398 spin_unlock_irqrestore(&conf->device_lock, flags); 399 400 bio = &mp_bh->bio; 401 bio->bi_sector = mp_bh->master_bio->bi_sector; 402 403 if ((mp_bh->path = multipath_map (conf))<0) { 404 printk(KERN_ALERT "multipath: %s: unrecoverable IO read" 405 " error for block %llu\n", 406 bdevname(bio->bi_bdev,b), 407 (unsigned long long)bio->bi_sector); 408 multipath_end_bh_io(mp_bh, -EIO); 409 } else { 410 printk(KERN_ERR "multipath: %s: redirecting sector %llu" 411 " to another IO path\n", 412 bdevname(bio->bi_bdev,b), 413 (unsigned long long)bio->bi_sector); 414 *bio = *(mp_bh->master_bio); 415 bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; 416 bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; 417 bio->bi_rw |= (1 << BIO_RW_FAILFAST); 418 bio->bi_end_io = multipath_end_request; 419 bio->bi_private = mp_bh; 420 generic_make_request(bio); 421 } 422 } 423 spin_unlock_irqrestore(&conf->device_lock, flags); 424} 425 426static int multipath_run (mddev_t *mddev) 427{ 428 multipath_conf_t *conf; 429 int disk_idx; 430 struct multipath_info *disk; 431 mdk_rdev_t *rdev; 432 struct list_head *tmp; 433 434 if (mddev->level != LEVEL_MULTIPATH) { 435 printk("multipath: %s: raid level not set to multipath IO (%d)\n", 436 mdname(mddev), mddev->level); 437 goto out; 438 } 439 /* 440 * copy the already verified devices into our private MULTIPATH 441 * bookkeeping area. 
[whatever we allocate in multipath_run(), 442 * should be freed in multipath_stop()] 443 */ 444 445 conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL); 446 mddev->private = conf; 447 if (!conf) { 448 printk(KERN_ERR 449 "multipath: couldn't allocate memory for %s\n", 450 mdname(mddev)); 451 goto out; 452 } 453 memset(conf, 0, sizeof(*conf)); 454 455 conf->multipaths = kmalloc(sizeof(struct multipath_info)*mddev->raid_disks, 456 GFP_KERNEL); 457 if (!conf->multipaths) { 458 printk(KERN_ERR 459 "multipath: couldn't allocate memory for %s\n", 460 mdname(mddev)); 461 goto out_free_conf; 462 } 463 memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks); 464 465 conf->working_disks = 0; 466 ITERATE_RDEV(mddev,rdev,tmp) { 467 disk_idx = rdev->raid_disk; 468 if (disk_idx < 0 || 469 disk_idx >= mddev->raid_disks) 470 continue; 471 472 disk = conf->multipaths + disk_idx; 473 disk->rdev = rdev; 474 475 blk_queue_stack_limits(mddev->queue, 476 rdev->bdev->bd_disk->queue); 477 /* as we don't honour merge_bvec_fn, we must never risk 478 * violating it, not that we ever expect a device with 479 * a merge_bvec_fn to be involved in multipath */ 480 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 481 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 482 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 483 484 if (!rdev->faulty) 485 conf->working_disks++; 486 } 487 488 conf->raid_disks = mddev->raid_disks; 489 mddev->sb_dirty = 1; 490 conf->mddev = mddev; 491 spin_lock_init(&conf->device_lock); 492 INIT_LIST_HEAD(&conf->retry_list); 493 494 if (!conf->working_disks) { 495 printk(KERN_ERR "multipath: no operational IO paths for %s\n", 496 mdname(mddev)); 497 goto out_free_conf; 498 } 499 mddev->degraded = conf->raid_disks = conf->working_disks; 500 501 conf->pool = mempool_create(NR_RESERVED_BUFS, 502 mp_pool_alloc, mp_pool_free, 503 NULL); 504 if (conf->pool == NULL) { 505 printk(KERN_ERR 506 "multipath: couldn't allocate memory for %s\n", 507 mdname(mddev)); 508 
goto out_free_conf; 509 } 510 511 { 512 mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath"); 513 if (!mddev->thread) { 514 printk(KERN_ERR "multipath: couldn't allocate thread" 515 " for %s\n", mdname(mddev)); 516 goto out_free_conf; 517 } 518 } 519 520 printk(KERN_INFO 521 "multipath: array %s active with %d out of %d IO paths\n", 522 mdname(mddev), conf->working_disks, mddev->raid_disks); 523 /* 524 * Ok, everything is just fine now 525 */ 526 mddev->array_size = mddev->size; 527 528 mddev->queue->unplug_fn = multipath_unplug; 529 mddev->queue->issue_flush_fn = multipath_issue_flush; 530 531 return 0; 532 533out_free_conf: 534 if (conf->pool) 535 mempool_destroy(conf->pool); 536 kfree(conf->multipaths); 537 kfree(conf); 538 mddev->private = NULL; 539out: 540 return -EIO; 541} 542 543 544static int multipath_stop (mddev_t *mddev) 545{ 546 multipath_conf_t *conf = mddev_to_conf(mddev); 547 548 md_unregister_thread(mddev->thread); 549 mddev->thread = NULL; 550 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 551 mempool_destroy(conf->pool); 552 kfree(conf->multipaths); 553 kfree(conf); 554 mddev->private = NULL; 555 return 0; 556} 557 558static mdk_personality_t multipath_personality= 559{ 560 .name = "multipath", 561 .owner = THIS_MODULE, 562 .make_request = multipath_make_request, 563 .run = multipath_run, 564 .stop = multipath_stop, 565 .status = multipath_status, 566 .error_handler = multipath_error, 567 .hot_add_disk = multipath_add_disk, 568 .hot_remove_disk= multipath_remove_disk, 569}; 570 571static int __init multipath_init (void) 572{ 573 return register_md_personality (MULTIPATH, &multipath_personality); 574} 575 576static void __exit multipath_exit (void) 577{ 578 unregister_md_personality (MULTIPATH); 579} 580 581module_init(multipath_init); 582module_exit(multipath_exit); 583MODULE_LICENSE("GPL"); 584MODULE_ALIAS("md-personality-7"); /* MULTIPATH */