Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.14 589 lines 15 kB view raw
1/* 2 * multipath.c : Multiple Devices driver for Linux 3 * 4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat 5 * 6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman 7 * 8 * MULTIPATH management functions. 9 * 10 * derived from raid1.c. 11 * 12 * This program is free software; you can redistribute it and/or modify 13 * it under the terms of the GNU General Public License as published by 14 * the Free Software Foundation; either version 2, or (at your option) 15 * any later version. 16 * 17 * You should have received a copy of the GNU General Public License 18 * (for example /usr/src/linux/COPYING); if not, write to the Free 19 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 20 */ 21 22#include <linux/module.h> 23#include <linux/slab.h> 24#include <linux/spinlock.h> 25#include <linux/raid/multipath.h> 26#include <linux/buffer_head.h> 27#include <asm/atomic.h> 28 29#define MAJOR_NR MD_MAJOR 30#define MD_DRIVER 31#define MD_PERSONALITY 32 33#define MAX_WORK_PER_DISK 128 34 35#define NR_RESERVED_BUFS 32 36 37 38static mdk_personality_t multipath_personality; 39 40 41static void *mp_pool_alloc(gfp_t gfp_flags, void *data) 42{ 43 struct multipath_bh *mpb; 44 mpb = kmalloc(sizeof(*mpb), gfp_flags); 45 if (mpb) 46 memset(mpb, 0, sizeof(*mpb)); 47 return mpb; 48} 49 50static void mp_pool_free(void *mpb, void *data) 51{ 52 kfree(mpb); 53} 54 55static int multipath_map (multipath_conf_t *conf) 56{ 57 int i, disks = conf->raid_disks; 58 59 /* 60 * Later we do read balancing on the read side 61 * now we use the first available disk. 62 */ 63 64 rcu_read_lock(); 65 for (i = 0; i < disks; i++) { 66 mdk_rdev_t *rdev = conf->multipaths[i].rdev; 67 if (rdev && rdev->in_sync) { 68 atomic_inc(&rdev->nr_pending); 69 rcu_read_unlock(); 70 return i; 71 } 72 } 73 rcu_read_unlock(); 74 75 printk(KERN_ERR "multipath_map(): no more operational IO paths?\n"); 76 return (-1); 77} 78 79static void multipath_reschedule_retry (struct multipath_bh *mp_bh) 80{ 81 unsigned long flags; 82 mddev_t *mddev = mp_bh->mddev; 83 multipath_conf_t *conf = mddev_to_conf(mddev); 84 85 spin_lock_irqsave(&conf->device_lock, flags); 86 list_add(&mp_bh->retry_list, &conf->retry_list); 87 spin_unlock_irqrestore(&conf->device_lock, flags); 88 md_wakeup_thread(mddev->thread); 89} 90 91 92/* 93 * multipath_end_bh_io() is called when we have finished servicing a multipathed 94 * operation and are ready to return a success/failure code to the buffer 95 * cache layer. 96 */ 97static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err) 98{ 99 struct bio *bio = mp_bh->master_bio; 100 multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); 101 102 bio_endio(bio, bio->bi_size, err); 103 mempool_free(mp_bh, conf->pool); 104} 105 106static int multipath_end_request(struct bio *bio, unsigned int bytes_done, 107 int error) 108{ 109 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 110 struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private); 111 multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev); 112 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev; 113 114 if (bio->bi_size) 115 return 1; 116 117 if (uptodate) 118 multipath_end_bh_io(mp_bh, 0); 119 else if (!bio_rw_ahead(bio)) { 120 /* 121 * oops, IO error: 122 */ 123 char b[BDEVNAME_SIZE]; 124 md_error (mp_bh->mddev, rdev); 125 printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n", 126 bdevname(rdev->bdev,b), 127 (unsigned long long)bio->bi_sector); 128 multipath_reschedule_retry(mp_bh); 129 } else 130 multipath_end_bh_io(mp_bh, error); 131 rdev_dec_pending(rdev, conf->mddev); 132 return 0; 133} 134 135static void unplug_slaves(mddev_t *mddev) 136{ 137 multipath_conf_t *conf = mddev_to_conf(mddev); 138 int i; 139 140 rcu_read_lock(); 141 for (i=0; i<mddev->raid_disks; i++) { 142 mdk_rdev_t *rdev = conf->multipaths[i].rdev; 143 if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) { 144 request_queue_t *r_queue = bdev_get_queue(rdev->bdev); 145 146 atomic_inc(&rdev->nr_pending); 147 rcu_read_unlock(); 148 149 if (r_queue->unplug_fn) 150 r_queue->unplug_fn(r_queue); 151 152 rdev_dec_pending(rdev, mddev); 153 rcu_read_lock(); 154 } 155 } 156 rcu_read_unlock(); 157} 158 159static void multipath_unplug(request_queue_t *q) 160{ 161 unplug_slaves(q->queuedata); 162} 163 164 165static int multipath_make_request (request_queue_t *q, struct bio * bio) 166{ 167 mddev_t *mddev = q->queuedata; 168 multipath_conf_t *conf = mddev_to_conf(mddev); 169 struct multipath_bh * mp_bh; 170 struct multipath_info *multipath; 171 172 if (unlikely(bio_barrier(bio))) { 173 bio_endio(bio, bio->bi_size, -EOPNOTSUPP); 174 return 0; 175 } 176 177 mp_bh = mempool_alloc(conf->pool, GFP_NOIO); 178 179 mp_bh->master_bio = bio; 180 mp_bh->mddev = mddev; 181 182 if (bio_data_dir(bio)==WRITE) { 183 disk_stat_inc(mddev->gendisk, writes); 184 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio)); 185 } else { 186 disk_stat_inc(mddev->gendisk, reads); 187 disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio)); 188 } 189 190 mp_bh->path = multipath_map(conf); 191 if (mp_bh->path < 0) { 192 bio_endio(bio, bio->bi_size, -EIO); 193 mempool_free(mp_bh, conf->pool); 194 return 0; 195 } 196 multipath = conf->multipaths + mp_bh->path; 197 198 mp_bh->bio = *bio; 199 mp_bh->bio.bi_sector += multipath->rdev->data_offset; 200 mp_bh->bio.bi_bdev = multipath->rdev->bdev; 201 mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST); 202 mp_bh->bio.bi_end_io = multipath_end_request; 203 mp_bh->bio.bi_private = mp_bh; 204 generic_make_request(&mp_bh->bio); 205 return 0; 206} 207 208static void multipath_status (struct seq_file *seq, mddev_t *mddev) 209{ 210 multipath_conf_t *conf = mddev_to_conf(mddev); 211 int i; 212 213 seq_printf (seq, " [%d/%d] [", conf->raid_disks, 214 conf->working_disks); 215 for (i = 0; i < conf->raid_disks; i++) 216 seq_printf (seq, "%s", 217 conf->multipaths[i].rdev && 218 conf->multipaths[i].rdev->in_sync ? "U" : "_"); 219 seq_printf (seq, "]"); 220} 221 222static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk, 223 sector_t *error_sector) 224{ 225 mddev_t *mddev = q->queuedata; 226 multipath_conf_t *conf = mddev_to_conf(mddev); 227 int i, ret = 0; 228 229 rcu_read_lock(); 230 for (i=0; i<mddev->raid_disks && ret == 0; i++) { 231 mdk_rdev_t *rdev = conf->multipaths[i].rdev; 232 if (rdev && !rdev->faulty) { 233 struct block_device *bdev = rdev->bdev; 234 request_queue_t *r_queue = bdev_get_queue(bdev); 235 236 if (!r_queue->issue_flush_fn) 237 ret = -EOPNOTSUPP; 238 else { 239 atomic_inc(&rdev->nr_pending); 240 rcu_read_unlock(); 241 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, 242 error_sector); 243 rdev_dec_pending(rdev, mddev); 244 rcu_read_lock(); 245 } 246 } 247 } 248 rcu_read_unlock(); 249 return ret; 250} 251 252/* 253 * Careful, this can execute in IRQ contexts as well! 254 */ 255static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev) 256{ 257 multipath_conf_t *conf = mddev_to_conf(mddev); 258 259 if (conf->working_disks <= 1) { 260 /* 261 * Uh oh, we can do nothing if this is our last path, but 262 * first check if this is a queued request for a device 263 * which has just failed. 264 */ 265 printk(KERN_ALERT 266 "multipath: only one IO path left and IO error.\n"); 267 /* leave it active... it's all we have */ 268 } else { 269 /* 270 * Mark disk as unusable 271 */ 272 if (!rdev->faulty) { 273 char b[BDEVNAME_SIZE]; 274 rdev->in_sync = 0; 275 rdev->faulty = 1; 276 mddev->sb_dirty = 1; 277 conf->working_disks--; 278 printk(KERN_ALERT "multipath: IO failure on %s," 279 " disabling IO path. \n Operation continuing" 280 " on %d IO paths.\n", 281 bdevname (rdev->bdev,b), 282 conf->working_disks); 283 } 284 } 285} 286 287static void print_multipath_conf (multipath_conf_t *conf) 288{ 289 int i; 290 struct multipath_info *tmp; 291 292 printk("MULTIPATH conf printout:\n"); 293 if (!conf) { 294 printk("(conf==NULL)\n"); 295 return; 296 } 297 printk(" --- wd:%d rd:%d\n", conf->working_disks, 298 conf->raid_disks); 299 300 for (i = 0; i < conf->raid_disks; i++) { 301 char b[BDEVNAME_SIZE]; 302 tmp = conf->multipaths + i; 303 if (tmp->rdev) 304 printk(" disk%d, o:%d, dev:%s\n", 305 i,!tmp->rdev->faulty, 306 bdevname(tmp->rdev->bdev,b)); 307 } 308} 309 310 311static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 312{ 313 multipath_conf_t *conf = mddev->private; 314 int found = 0; 315 int path; 316 struct multipath_info *p; 317 318 print_multipath_conf(conf); 319 320 for (path=0; path<mddev->raid_disks; path++) 321 if ((p=conf->multipaths+path)->rdev == NULL) { 322 blk_queue_stack_limits(mddev->queue, 323 rdev->bdev->bd_disk->queue); 324 325 /* as we don't honour merge_bvec_fn, we must never risk 326 * violating it, so limit ->max_sector to one PAGE, as 327 * a one page request is never in violation. 328 * (Note: it is very unlikely that a device with 329 * merge_bvec_fn will be involved in multipath.) 330 */ 331 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 332 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 333 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 334 335 conf->working_disks++; 336 rdev->raid_disk = path; 337 rdev->in_sync = 1; 338 p->rdev = rdev; 339 found = 1; 340 } 341 342 print_multipath_conf(conf); 343 return found; 344} 345 346static int multipath_remove_disk(mddev_t *mddev, int number) 347{ 348 multipath_conf_t *conf = mddev->private; 349 int err = 0; 350 mdk_rdev_t *rdev; 351 struct multipath_info *p = conf->multipaths + number; 352 353 print_multipath_conf(conf); 354 355 rdev = p->rdev; 356 if (rdev) { 357 if (rdev->in_sync || 358 atomic_read(&rdev->nr_pending)) { 359 printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number); 360 err = -EBUSY; 361 goto abort; 362 } 363 p->rdev = NULL; 364 synchronize_rcu(); 365 if (atomic_read(&rdev->nr_pending)) { 366 /* lost the race, try later */ 367 err = -EBUSY; 368 p->rdev = rdev; 369 } 370 } 371abort: 372 373 print_multipath_conf(conf); 374 return err; 375} 376 377 378 379/* 380 * This is a kernel thread which: 381 * 382 * 1. Retries failed read operations on working multipaths. 383 * 2. Updates the raid superblock when problems encounter. 384 * 3. Performs writes following reads for array syncronising. 385 */ 386 387static void multipathd (mddev_t *mddev) 388{ 389 struct multipath_bh *mp_bh; 390 struct bio *bio; 391 unsigned long flags; 392 multipath_conf_t *conf = mddev_to_conf(mddev); 393 struct list_head *head = &conf->retry_list; 394 395 md_check_recovery(mddev); 396 for (;;) { 397 char b[BDEVNAME_SIZE]; 398 spin_lock_irqsave(&conf->device_lock, flags); 399 if (list_empty(head)) 400 break; 401 mp_bh = list_entry(head->prev, struct multipath_bh, retry_list); 402 list_del(head->prev); 403 spin_unlock_irqrestore(&conf->device_lock, flags); 404 405 bio = &mp_bh->bio; 406 bio->bi_sector = mp_bh->master_bio->bi_sector; 407 408 if ((mp_bh->path = multipath_map (conf))<0) { 409 printk(KERN_ALERT "multipath: %s: unrecoverable IO read" 410 " error for block %llu\n", 411 bdevname(bio->bi_bdev,b), 412 (unsigned long long)bio->bi_sector); 413 multipath_end_bh_io(mp_bh, -EIO); 414 } else { 415 printk(KERN_ERR "multipath: %s: redirecting sector %llu" 416 " to another IO path\n", 417 bdevname(bio->bi_bdev,b), 418 (unsigned long long)bio->bi_sector); 419 *bio = *(mp_bh->master_bio); 420 bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset; 421 bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev; 422 bio->bi_rw |= (1 << BIO_RW_FAILFAST); 423 bio->bi_end_io = multipath_end_request; 424 bio->bi_private = mp_bh; 425 generic_make_request(bio); 426 } 427 } 428 spin_unlock_irqrestore(&conf->device_lock, flags); 429} 430 431static int multipath_run (mddev_t *mddev) 432{ 433 multipath_conf_t *conf; 434 int disk_idx; 435 struct multipath_info *disk; 436 mdk_rdev_t *rdev; 437 struct list_head *tmp; 438 439 if (mddev->level != LEVEL_MULTIPATH) { 440 printk("multipath: %s: raid level not set to multipath IO (%d)\n", 441 mdname(mddev), mddev->level); 442 goto out; 443 } 444 /* 445 * copy the already verified devices into our private MULTIPATH 446 * bookkeeping area. [whatever we allocate in multipath_run(), 447 * should be freed in multipath_stop()] 448 */ 449 450 conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL); 451 mddev->private = conf; 452 if (!conf) { 453 printk(KERN_ERR 454 "multipath: couldn't allocate memory for %s\n", 455 mdname(mddev)); 456 goto out; 457 } 458 memset(conf, 0, sizeof(*conf)); 459 460 conf->multipaths = kmalloc(sizeof(struct multipath_info)*mddev->raid_disks, 461 GFP_KERNEL); 462 if (!conf->multipaths) { 463 printk(KERN_ERR 464 "multipath: couldn't allocate memory for %s\n", 465 mdname(mddev)); 466 goto out_free_conf; 467 } 468 memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks); 469 470 conf->working_disks = 0; 471 ITERATE_RDEV(mddev,rdev,tmp) { 472 disk_idx = rdev->raid_disk; 473 if (disk_idx < 0 || 474 disk_idx >= mddev->raid_disks) 475 continue; 476 477 disk = conf->multipaths + disk_idx; 478 disk->rdev = rdev; 479 480 blk_queue_stack_limits(mddev->queue, 481 rdev->bdev->bd_disk->queue); 482 /* as we don't honour merge_bvec_fn, we must never risk 483 * violating it, not that we ever expect a device with 484 * a merge_bvec_fn to be involved in multipath */ 485 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 486 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 487 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 488 489 if (!rdev->faulty) 490 conf->working_disks++; 491 } 492 493 conf->raid_disks = mddev->raid_disks; 494 mddev->sb_dirty = 1; 495 conf->mddev = mddev; 496 spin_lock_init(&conf->device_lock); 497 INIT_LIST_HEAD(&conf->retry_list); 498 499 if (!conf->working_disks) { 500 printk(KERN_ERR "multipath: no operational IO paths for %s\n", 501 mdname(mddev)); 502 goto out_free_conf; 503 } 504 mddev->degraded = conf->raid_disks = conf->working_disks; 505 506 conf->pool = mempool_create(NR_RESERVED_BUFS, 507 mp_pool_alloc, mp_pool_free, 508 NULL); 509 if (conf->pool == NULL) { 510 printk(KERN_ERR 511 "multipath: couldn't allocate memory for %s\n", 512 mdname(mddev)); 513 goto out_free_conf; 514 } 515 516 { 517 mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath"); 518 if (!mddev->thread) { 519 printk(KERN_ERR "multipath: couldn't allocate thread" 520 " for %s\n", mdname(mddev)); 521 goto out_free_conf; 522 } 523 } 524 525 printk(KERN_INFO 526 "multipath: array %s active with %d out of %d IO paths\n", 527 mdname(mddev), conf->working_disks, mddev->raid_disks); 528 /* 529 * Ok, everything is just fine now 530 */ 531 mddev->array_size = mddev->size; 532 533 mddev->queue->unplug_fn = multipath_unplug; 534 mddev->queue->issue_flush_fn = multipath_issue_flush; 535 536 return 0; 537 538out_free_conf: 539 if (conf->pool) 540 mempool_destroy(conf->pool); 541 kfree(conf->multipaths); 542 kfree(conf); 543 mddev->private = NULL; 544out: 545 return -EIO; 546} 547 548 549static int multipath_stop (mddev_t *mddev) 550{ 551 multipath_conf_t *conf = mddev_to_conf(mddev); 552 553 md_unregister_thread(mddev->thread); 554 mddev->thread = NULL; 555 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 556 mempool_destroy(conf->pool); 557 kfree(conf->multipaths); 558 kfree(conf); 559 mddev->private = NULL; 560 return 0; 561} 562 563static mdk_personality_t multipath_personality= 564{ 565 .name = "multipath", 566 .owner = THIS_MODULE, 567 .make_request = multipath_make_request, 568 .run = multipath_run, 569 .stop = multipath_stop, 570 .status = multipath_status, 571 .error_handler = multipath_error, 572 .hot_add_disk = multipath_add_disk, 573 .hot_remove_disk= multipath_remove_disk, 574}; 575 576static int __init multipath_init (void) 577{ 578 return register_md_personality (MULTIPATH, &multipath_personality); 579} 580 581static void __exit multipath_exit (void) 582{ 583 unregister_md_personality (MULTIPATH); 584} 585 586module_init(multipath_init); 587module_exit(multipath_exit); 588MODULE_LICENSE("GPL"); 589MODULE_ALIAS("md-personality-7"); /* MULTIPATH */