/*
 *  linux/drivers/block/elevator.c
 *
 *  Block device elevator/IO-scheduler.
 *
 *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * 30042000 Jens Axboe <axboe@suse.de> :
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 *  when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - lose bi_dev comparisons, partition handling is right now
 * - completely modularize elevator setup and teardown
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/delay.h>

#include <asm/uaccess.h>

static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);

/*
 * can we safely merge with this request?
 */
inline int elv_rq_merge_ok(struct request *rq, struct bio *bio)
{
	if (!rq_mergeable(rq))
		return 0;

	/*
	 * different data direction or already started, don't merge
	 */
	if (bio_data_dir(bio) != rq_data_dir(rq))
		return 0;

	/*
	 * same device and no special stuff set, merge is ok
	 */
	if (rq->rq_disk == bio->bi_bdev->bd_disk &&
	    !rq->waiting && !rq->special)
		return 1;

	return 0;
}
EXPORT_SYMBOL(elv_rq_merge_ok);

inline int elv_try_merge(struct request *__rq, struct bio *bio)
{
	int ret = ELEVATOR_NO_MERGE;

	/*
	 * we can merge and sequence is ok, check if it's possible
	 */
	if (elv_rq_merge_ok(__rq, bio)) {
		if (__rq->sector + __rq->nr_sectors == bio->bi_sector)
			ret = ELEVATOR_BACK_MERGE;
		else if (__rq->sector - bio_sectors(bio) == bio->bi_sector)
			ret = ELEVATOR_FRONT_MERGE;
	}

	return ret;
}
EXPORT_SYMBOL(elv_try_merge);

static struct elevator_type *elevator_find(const char *name)
{
	struct elevator_type *e = NULL;
	struct list_head *entry;

	list_for_each(entry, &elv_list) {
		struct elevator_type *__e;

		__e = list_entry(entry, struct elevator_type, list);

		if (!strcmp(__e->elevator_name, name)) {
			e = __e;
			break;
		}
	}

	return e;
}

static void elevator_put(struct elevator_type *e)
{
	module_put(e->elevator_owner);
}

static struct elevator_type *elevator_get(const char *name)
{
	struct elevator_type *e;

	spin_lock_irq(&elv_list_lock);

	e = elevator_find(name);
	if (e && !try_module_get(e->elevator_owner))
		e = NULL;

	spin_unlock_irq(&elv_list_lock);

	return e;
}

static int elevator_attach(request_queue_t *q, struct elevator_type *e,
			   struct elevator_queue *eq)
{
	int ret = 0;

	memset(eq, 0, sizeof(*eq));
	eq->ops = &e->ops;
	eq->elevator_type = e;

	q->elevator = eq;

	if (eq->ops->elevator_init_fn)
		ret = eq->ops->elevator_init_fn(q, eq);

	return ret;
}

static char chosen_elevator[16];

static void elevator_setup_default(void)
{
	struct elevator_type *e;

	/*
	 * If default has not been set, use the compiled-in selection.
	 */
	if (!chosen_elevator[0])
		strcpy(chosen_elevator, CONFIG_DEFAULT_IOSCHED);

	/*
	 * If the given scheduler is not available, fall back to no-op.
	 * Only drop the reference if we actually found a scheduler.
	 */
	if (!(e = elevator_find(chosen_elevator)))
		strcpy(chosen_elevator, "noop");
	else
		elevator_put(e);
}

static int __init elevator_setup(char *str)
{
	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
	return 0;
}

__setup("elevator=", elevator_setup);

int elevator_init(request_queue_t *q, char *name)
{
	struct elevator_type *e = NULL;
	struct elevator_queue *eq;
	int ret = 0;

	INIT_LIST_HEAD(&q->queue_head);
	q->last_merge = NULL;
	q->end_sector = 0;
	q->boundary_rq = NULL;

	elevator_setup_default();

	if (!name)
		name = chosen_elevator;

	e = elevator_get(name);
	if (!e)
		return -EINVAL;

	eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL);
	if (!eq) {
		elevator_put(e);
		return -ENOMEM;
	}

	ret = elevator_attach(q, e, eq);
	if (ret) {
		kfree(eq);
		elevator_put(e);
	}

	return ret;
}

void elevator_exit(elevator_t *e)
{
	if (e->ops->elevator_exit_fn)
		e->ops->elevator_exit_fn(e);

	elevator_put(e->elevator_type);
	e->elevator_type = NULL;
	kfree(e);
}

/*
 * Insert rq into the dispatch queue of q, sorted by sector but never in
 * front of requests that have already been started or that mark a barrier,
 * and bounded by the current scheduling boundary (q->end_sector).  Queue
 * lock must be held on entry.  To be used by specific elevators.
 */
void elv_dispatch_sort(request_queue_t *q, struct request *rq)
{
	sector_t boundary;
	struct list_head *entry;

	if (q->last_merge == rq)
		q->last_merge = NULL;

	boundary = q->end_sector;

	list_for_each_prev(entry, &q->queue_head) {
		struct request *pos = list_entry_rq(entry);

		if (pos->flags & (REQ_SOFTBARRIER|REQ_HARDBARRIER|REQ_STARTED))
			break;
		if (rq->sector >= boundary) {
			if (pos->sector < boundary)
				continue;
		} else {
			if (pos->sector >= boundary)
				break;
		}
		if (rq->sector >= pos->sector)
			break;
	}

	list_add(&rq->queuelist, entry);
}

int elv_merge(request_queue_t *q, struct request **req, struct bio *bio)
{
	elevator_t *e = q->elevator;
	int ret;

	if (q->last_merge) {
		ret = elv_try_merge(q->last_merge, bio);
		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}

	if (e->ops->elevator_merge_fn)
		return e->ops->elevator_merge_fn(q, req, bio);

	return ELEVATOR_NO_MERGE;
}

void elv_merged_request(request_queue_t *q, struct request *rq)
{
	elevator_t *e = q->elevator;

	if (e->ops->elevator_merged_fn)
		e->ops->elevator_merged_fn(q, rq);

	q->last_merge = rq;
}

void elv_merge_requests(request_queue_t *q, struct request *rq,
			struct request *next)
{
	elevator_t *e = q->elevator;

	if (e->ops->elevator_merge_req_fn)
		e->ops->elevator_merge_req_fn(q, rq, next);

	q->last_merge = rq;
}

void elv_requeue_request(request_queue_t *q, struct request *rq)
{
	elevator_t *e = q->elevator;

	/*
	 * it already went through dequeue, we need to decrement the
	 * in_flight count again
	 */
	if (blk_account_rq(rq)) {
		q->in_flight--;
		if (blk_sorted_rq(rq) && e->ops->elevator_deactivate_req_fn)
			e->ops->elevator_deactivate_req_fn(q, rq);
	}

	rq->flags &= ~REQ_STARTED;

	/*
	 * if this is the flush, requeue the original instead and drop the flush
	 */
	if (rq->flags & REQ_BAR_FLUSH) {
		clear_bit(QUEUE_FLAG_FLUSH, &q->queue_flags);
		rq = rq->end_io_data;
	}

	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 0);
}

void __elv_add_request(request_queue_t *q, struct request *rq, int where,
		       int plug)
{
	if (rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
		/*
		 * barriers implicitly indicate back insertion
		 */
		if (where == ELEVATOR_INSERT_SORT)
			where = ELEVATOR_INSERT_BACK;

		/*
		 * this request is scheduling boundary, update end_sector
		 */
		if (blk_fs_request(rq)) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = rq;
		}
	} else if (!(rq->flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)
		where = ELEVATOR_INSERT_BACK;

	if (plug)
		blk_plug_device(q);

	rq->q = q;

	switch (where) {
	case ELEVATOR_INSERT_FRONT:
		rq->flags |= REQ_SOFTBARRIER;

		list_add(&rq->queuelist, &q->queue_head);
		break;

	case ELEVATOR_INSERT_BACK:
		rq->flags |= REQ_SOFTBARRIER;

		while (q->elevator->ops->elevator_dispatch_fn(q, 1))
			;
		list_add_tail(&rq->queuelist, &q->queue_head);
		/*
		 * We kick the queue here for the following reasons.
		 * - The elevator might have returned NULL previously
		 *   to delay requests and returned them now.  As the
		 *   queue wasn't empty before this request, ll_rw_blk
		 *   won't run the queue on return, resulting in hang.
		 * - Usually, back inserted requests won't be merged
		 *   with anything.  There's no point in delaying queue
		 *   processing.
		 */
		blk_remove_plug(q);
		q->request_fn(q);
		break;

	case ELEVATOR_INSERT_SORT:
		BUG_ON(!blk_fs_request(rq));
		rq->flags |= REQ_SORTED;
		if (q->last_merge == NULL && rq_mergeable(rq))
			q->last_merge = rq;
		/*
		 * Some ioscheds (cfq) run q->request_fn directly, so
		 * rq cannot be accessed after calling
		 * elevator_add_req_fn.
		 */
		q->elevator->ops->elevator_add_req_fn(q, rq);
		break;

	default:
		printk(KERN_ERR "%s: bad insertion point %d\n",
		       __FUNCTION__, where);
		BUG();
	}

	if (blk_queue_plugged(q)) {
		int nrq = q->rq.count[READ] + q->rq.count[WRITE]
			- q->in_flight;

		if (nrq >= q->unplug_thresh)
			__generic_unplug_device(q);
	}
}

void elv_add_request(request_queue_t *q, struct request *rq, int where,
		     int plug)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__elv_add_request(q, rq, where, plug);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static inline struct request *__elv_next_request(request_queue_t *q)
{
	struct request *rq;

	if (unlikely(list_empty(&q->queue_head) &&
		     !q->elevator->ops->elevator_dispatch_fn(q, 0)))
		return NULL;

	rq = list_entry_rq(q->queue_head.next);

	/*
	 * if this is a barrier write and the device has to issue a
	 * flush sequence to support it, check how far we are
	 */
	if (blk_fs_request(rq) && blk_barrier_rq(rq)) {
		BUG_ON(q->ordered == QUEUE_ORDERED_NONE);

		if (q->ordered == QUEUE_ORDERED_FLUSH &&
		    !blk_barrier_preflush(rq))
			rq = blk_start_pre_flush(q, rq);
	}

	return rq;
}

struct request *elv_next_request(request_queue_t *q)
{
	struct request *rq;
	int ret;

	while ((rq = __elv_next_request(q)) != NULL) {
		if (!(rq->flags & REQ_STARTED)) {
			elevator_t *e = q->elevator;

			/*
			 * This is the first time the device driver
			 * sees this request (possibly after
			 * requeueing).  Notify IO scheduler.
			 */
			if (blk_sorted_rq(rq) &&
			    e->ops->elevator_activate_req_fn)
				e->ops->elevator_activate_req_fn(q, rq);

			/*
			 * just mark as started even if we don't start
			 * it, a request that has been delayed should
			 * not be passed by new incoming requests
			 */
			rq->flags |= REQ_STARTED;
		}

		if (!q->boundary_rq || q->boundary_rq == rq) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = NULL;
		}

		if ((rq->flags & REQ_DONTPREP) || !q->prep_rq_fn)
			break;

		ret = q->prep_rq_fn(q, rq);
		if (ret == BLKPREP_OK) {
			break;
		} else if (ret == BLKPREP_DEFER) {
			/*
			 * the request may have been (partially) prepped.
			 * we need to keep this request in the front to
			 * avoid resource deadlock.  REQ_STARTED will
			 * prevent other fs requests from passing this one.
			 */
			rq = NULL;
			break;
		} else if (ret == BLKPREP_KILL) {
			int nr_bytes = rq->hard_nr_sectors << 9;

			if (!nr_bytes)
				nr_bytes = rq->data_len;

			blkdev_dequeue_request(rq);
			rq->flags |= REQ_QUIET;
			end_that_request_chunk(rq, 0, nr_bytes);
			end_that_request_last(rq);
		} else {
			printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,
			       ret);
			break;
		}
	}

	return rq;
}

void elv_dequeue_request(request_queue_t *q, struct request *rq)
{
	BUG_ON(list_empty(&rq->queuelist));

	list_del_init(&rq->queuelist);

	/*
	 * the time frame between a request being removed from the lists
	 * and the time it is freed is accounted as io that is in progress
	 * at the driver side.
	 */
	if (blk_account_rq(rq))
		q->in_flight++;
}

int elv_queue_empty(request_queue_t *q)
{
	elevator_t *e = q->elevator;

	if (!list_empty(&q->queue_head))
		return 0;

	if (e->ops->elevator_queue_empty_fn)
		return e->ops->elevator_queue_empty_fn(q);

	return 1;
}

struct request *elv_latter_request(request_queue_t *q, struct request *rq)
{
	struct list_head *next;

	elevator_t *e = q->elevator;

	if (e->ops->elevator_latter_req_fn)
		return e->ops->elevator_latter_req_fn(q, rq);

	next = rq->queuelist.next;
	if (next != &q->queue_head && next != &rq->queuelist)
		return list_entry_rq(next);

	return NULL;
}

struct request *elv_former_request(request_queue_t *q, struct request *rq)
{
	struct list_head *prev;

	elevator_t *e = q->elevator;

	if (e->ops->elevator_former_req_fn)
		return e->ops->elevator_former_req_fn(q, rq);

	prev = rq->queuelist.prev;
	if (prev != &q->queue_head && prev != &rq->queuelist)
		return list_entry_rq(prev);

	return NULL;
}

int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
		    gfp_t gfp_mask)
{
	elevator_t *e = q->elevator;

	if (e->ops->elevator_set_req_fn)
		return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask);

	rq->elevator_private = NULL;
	return 0;
}

void elv_put_request(request_queue_t *q, struct request *rq)
{
	elevator_t *e = q->elevator;

	if (e->ops->elevator_put_req_fn)
		e->ops->elevator_put_req_fn(q, rq);
}

int elv_may_queue(request_queue_t *q, int rw, struct bio *bio)
{
	elevator_t *e = q->elevator;

	if (e->ops->elevator_may_queue_fn)
		return e->ops->elevator_may_queue_fn(q, rw, bio);

	return ELV_MQUEUE_MAY;
}

void elv_completed_request(request_queue_t *q, struct request *rq)
{
	elevator_t *e = q->elevator;

	/*
	 * request is released from the driver, io must be done
	 */
	if (blk_account_rq(rq)) {
		q->in_flight--;
		if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
			e->ops->elevator_completed_req_fn(q, rq);
	}
}

int elv_register_queue(struct request_queue *q)
{
	elevator_t *e = q->elevator;

	e->kobj.parent = kobject_get(&q->kobj);
	if (!e->kobj.parent)
		return -EBUSY;

	snprintf(e->kobj.name, KOBJ_NAME_LEN, "%s", "iosched");
	e->kobj.ktype = e->elevator_type->elevator_ktype;

	return kobject_register(&e->kobj);
}

void elv_unregister_queue(struct request_queue *q)
{
	if (q) {
		elevator_t *e = q->elevator;
		kobject_unregister(&e->kobj);
		kobject_put(&q->kobj);
	}
}

int elv_register(struct elevator_type *e)
{
	spin_lock_irq(&elv_list_lock);
	if (elevator_find(e->elevator_name))
		BUG();
	list_add_tail(&e->list, &elv_list);
	spin_unlock_irq(&elv_list_lock);

	printk(KERN_INFO "io scheduler %s registered", e->elevator_name);
	if (!strcmp(e->elevator_name, chosen_elevator))
		printk(" (default)");
	printk("\n");
	return 0;
}
EXPORT_SYMBOL_GPL(elv_register);

void elv_unregister(struct elevator_type *e)
{
	struct task_struct *g, *p;

	/*
	 * Iterate every thread in the system and tear down its io contexts.
	 */
	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		struct io_context *ioc = p->io_context;
		if (ioc && ioc->cic) {
			ioc->cic->exit(ioc->cic);
			ioc->cic->dtor(ioc->cic);
			ioc->cic = NULL;
		}
		if (ioc && ioc->aic) {
			ioc->aic->exit(ioc->aic);
			ioc->aic->dtor(ioc->aic);
			ioc->aic = NULL;
		}
	} while_each_thread(g, p);
	read_unlock(&tasklist_lock);

	spin_lock_irq(&elv_list_lock);
	list_del_init(&e->list);
	spin_unlock_irq(&elv_list_lock);
}
EXPORT_SYMBOL_GPL(elv_unregister);

/*
 * switch to new_e io scheduler. be careful not to introduce deadlocks -
 * we don't free the old io scheduler, before we have allocated what we
 * need for the new one. this way we have a chance of going back to the old
 * one, if the new one fails init for some reason.
 */
static void elevator_switch(request_queue_t *q, struct elevator_type *new_e)
{
	elevator_t *old_elevator, *e;

	/*
	 * Allocate new elevator
	 */
	e = kmalloc(sizeof(elevator_t), GFP_KERNEL);
	if (!e)
		goto error;

	/*
	 * Turn on BYPASS and drain all requests w/ elevator private data
	 */
	spin_lock_irq(q->queue_lock);

	set_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);

	while (q->elevator->ops->elevator_dispatch_fn(q, 1))
		;

	while (q->rq.elvpriv) {
		spin_unlock_irq(q->queue_lock);
		msleep(10);
		spin_lock_irq(q->queue_lock);
	}

	spin_unlock_irq(q->queue_lock);

	/*
	 * unregister old elevator data
	 */
	elv_unregister_queue(q);
	old_elevator = q->elevator;

	/*
	 * attach and start new elevator
	 */
	if (elevator_attach(q, new_e, e))
		goto fail;

	if (elv_register_queue(q))
		goto fail_register;

	/*
	 * finally exit old elevator and turn off BYPASS.
	 */
	elevator_exit(old_elevator);
	clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
	return;

fail_register:
	/*
	 * switch failed, exit the new io scheduler and reattach the old
	 * one again (along with re-adding the sysfs dir)
	 */
	elevator_exit(e);
	e = NULL;
fail:
	q->elevator = old_elevator;
	elv_register_queue(q);
	clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
	kfree(e);
error:
	elevator_put(new_e);
	printk(KERN_ERR "elevator: switch to %s failed\n", new_e->elevator_name);
}

ssize_t elv_iosched_store(request_queue_t *q, const char *name, size_t count)
{
	char elevator_name[ELV_NAME_MAX];
	struct elevator_type *e;

	/*
	 * copy at most ELV_NAME_MAX - 1 bytes so the buffer stays
	 * nul-terminated even for overlong input
	 */
	memset(elevator_name, 0, sizeof(elevator_name));
	strncpy(elevator_name, name, sizeof(elevator_name) - 1);

	if (elevator_name[strlen(elevator_name) - 1] == '\n')
		elevator_name[strlen(elevator_name) - 1] = '\0';

	e = elevator_get(elevator_name);
	if (!e) {
		printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
		return -EINVAL;
	}

	if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
		elevator_put(e);
		return count;
	}

	elevator_switch(q, e);
	return count;
}

ssize_t elv_iosched_show(request_queue_t *q, char *name)
{
	elevator_t *e = q->elevator;
	struct elevator_type *elv = e->elevator_type;
	struct list_head *entry;
	int len = 0;

	spin_lock_irq(q->queue_lock);
	list_for_each(entry, &elv_list) {
		struct elevator_type *__e;

		__e = list_entry(entry, struct elevator_type, list);
		if (!strcmp(elv->elevator_name, __e->elevator_name))
			len += sprintf(name+len, "[%s] ", elv->elevator_name);
		else
			len += sprintf(name+len, "%s ", __e->elevator_name);
	}
	spin_unlock_irq(q->queue_lock);

	len += sprintf(len+name, "\n");
	return len;
}

EXPORT_SYMBOL(elv_dispatch_sort);
EXPORT_SYMBOL(elv_add_request);
EXPORT_SYMBOL(__elv_add_request);
EXPORT_SYMBOL(elv_requeue_request);
EXPORT_SYMBOL(elv_next_request);
EXPORT_SYMBOL(elv_dequeue_request);
EXPORT_SYMBOL(elv_queue_empty);
EXPORT_SYMBOL(elv_completed_request);
EXPORT_SYMBOL(elevator_exit);
EXPORT_SYMBOL(elevator_init);
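
/*
 * Illustrative sketch, not part of elevator.c: roughly how a minimal I/O
 * scheduler module would plug into the framework above via elv_register()
 * and elv_unregister(), in the spirit of the in-tree "noop" scheduler.
 * The hook names and call signatures are inferred from how this file
 * invokes them (elevator_dispatch_fn(q, force), elevator_add_req_fn(q, rq));
 * the exact layout of struct elevator_type/elevator_ops lives in
 * <linux/elevator.h>.  The "example_*" names are hypothetical.
 */
#if 0	/* example only; a real scheduler lives in its own file, e.g. noop-iosched.c */
static void example_add_request(request_queue_t *q, struct request *rq)
{
	/* no internal queueing: place the request straight on the dispatch queue */
	elv_dispatch_sort(q, rq);
}

static int example_dispatch(request_queue_t *q, int force)
{
	/* nothing is ever held back, so there is never anything more to dispatch */
	return 0;
}

static struct elevator_type elevator_example = {
	.ops = {
		.elevator_dispatch_fn	= example_dispatch,
		.elevator_add_req_fn	= example_add_request,
	},
	.elevator_name	= "example",
	.elevator_owner	= THIS_MODULE,
};

static int __init example_init(void)
{
	/* makes the scheduler selectable via elevator= and /sys/block/<dev>/queue/scheduler */
	return elv_register(&elevator_example);
}

static void __exit example_exit(void)
{
	elv_unregister(&elevator_example);
}

module_init(example_init);
module_exit(example_exit);
#endif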