Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v2.6.15-rc3 (1999 lines, 50 kB)
/*
 * Anticipatory & deadline i/o scheduler.
 *
 * Copyright (C) 2002 Jens Axboe <axboe@suse.de>
 *                    Nick Piggin <nickpiggin@yahoo.com.au>
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/hash.h>
#include <linux/rbtree.h>
#include <linux/interrupt.h>

#define REQ_SYNC	1
#define REQ_ASYNC	0

/*
 * See Documentation/block/as-iosched.txt
 */

/*
 * max time before a read is submitted.
 */
#define default_read_expire (HZ / 8)

/*
 * ditto for writes, these limits are not hard, even
 * if the disk is capable of satisfying them.
 */
#define default_write_expire (HZ / 4)

/*
 * read_batch_expire describes how long we will allow a stream of reads to
 * persist before looking to see whether it is time to switch over to writes.
 */
#define default_read_batch_expire (HZ / 2)

/*
 * write_batch_expire describes how long we want a stream of writes to run for.
 * This is not a hard limit, but a target we set for the auto-tuning thingy.
 * See, the problem is: we can send a lot of writes to disk cache / TCQ in
 * a short amount of time...
 */
#define default_write_batch_expire (HZ / 8)

/*
 * max time we may wait to anticipate a read (default around 6ms)
 */
#define default_antic_expire ((HZ / 150) ? HZ / 150 : 1)

/*
 * Keep track of up to 20ms thinktimes. We can go as big as we like here,
 * however huge values tend to interfere and not decay fast enough. A program
 * might be in a non-io phase of operation. Waiting on user input for example,
 * or doing a lengthy computation. A small penalty can be justified there, and
 * will still catch out those processes that constantly have large thinktimes.
 */
#define MAX_THINKTIME (HZ/50UL)

/* Bits in as_io_context.state */
enum as_io_states {
	AS_TASK_RUNNING=0,	/* Process has not exited */
	AS_TASK_IOSTARTED,	/* Process has started some IO */
	AS_TASK_IORUNNING,	/* Process has completed some IO */
};

enum anticipation_status {
	ANTIC_OFF=0,		/* Not anticipating (normal operation) */
	ANTIC_WAIT_REQ,		/* The last read has not yet completed */
	ANTIC_WAIT_NEXT,	/* Currently anticipating a request vs
				   last read (which has completed) */
	ANTIC_FINISHED,		/* Anticipating but have found a candidate
				 * or timed out */
};

struct as_data {
	/*
	 * run time data
	 */

	struct request_queue *q;	/* the "owner" queue */

	/*
	 * requests (as_rq s) are present on both sort_list and fifo_list
	 */
	struct rb_root sort_list[2];
	struct list_head fifo_list[2];

	struct as_rq *next_arq[2];	/* next in sort order */
	sector_t last_sector[2];	/* last REQ_SYNC & REQ_ASYNC sectors */
	struct list_head *hash;		/* request hash */

	unsigned long exit_prob;	/* probability a task will exit while
					   being waited on */
	unsigned long exit_no_coop;	/* probability an exited task will
					   not be part of a later cooperating
					   request */
	unsigned long new_ttime_total;	/* mean thinktime on new proc */
	unsigned long new_ttime_mean;
	u64 new_seek_total;		/* mean seek on new proc */
	sector_t new_seek_mean;

	unsigned long current_batch_expires;
	unsigned long last_check_fifo[2];
	int changed_batch;		/* 1: waiting for old batch to end */
	int new_batch;			/* 1: waiting on first read complete */
	int batch_data_dir;		/* current batch REQ_SYNC / REQ_ASYNC */
	int write_batch_count;		/* max # of reqs in a write batch */
	int current_write_count;	/* how many requests left this batch */
	int write_batch_idled;		/* has the write batch gone idle? */
	mempool_t *arq_pool;

	enum anticipation_status antic_status;
	unsigned long antic_start;	/* jiffies: when it started */
	struct timer_list antic_timer;	/* anticipatory scheduling timer */
	struct work_struct antic_work;	/* Deferred unplugging */
	struct io_context *io_context;	/* Identify the expected process */
	int ioc_finished;	/* IO associated with io_context is finished */
	int nr_dispatched;

	/*
	 * settings that change how the i/o scheduler behaves
	 */
	unsigned long fifo_expire[2];
	unsigned long batch_expire[2];
	unsigned long antic_expire;
};

#define list_entry_fifo(ptr)	list_entry((ptr), struct as_rq, fifo)

/*
 * per-request data.
 */
enum arq_state {
	AS_RQ_NEW=0,		/* New - not referenced and not on any lists */
	AS_RQ_QUEUED,		/* In the request queue. It belongs to the
				   scheduler */
	AS_RQ_DISPATCHED,	/* On the dispatch list.
				   It belongs to the driver now */
	AS_RQ_PRESCHED,		/* Debug poisoning for requests being used */
	AS_RQ_REMOVED,
	AS_RQ_MERGED,
	AS_RQ_POSTSCHED,	/* when they shouldn't be */
};

struct as_rq {
	/*
	 * rbtree index, key is the starting offset
	 */
	struct rb_node rb_node;
	sector_t rb_key;

	struct request *request;

	struct io_context *io_context;	/* The submitting task */

	/*
	 * request hash, key is the ending offset (for back merge lookup)
	 */
	struct list_head hash;
	unsigned int on_hash;

	/*
	 * expire fifo
	 */
	struct list_head fifo;
	unsigned long expires;

	unsigned int is_sync;
	enum arq_state state;
};

#define RQ_DATA(rq)	((struct as_rq *) (rq)->elevator_private)

static kmem_cache_t *arq_pool;

/*
 * IO Context helper functions
 */

/* Called to deallocate the as_io_context */
static void free_as_io_context(struct as_io_context *aic)
{
	kfree(aic);
}

/* Called when the task exits */
static void exit_as_io_context(struct as_io_context *aic)
{
	WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state));
	clear_bit(AS_TASK_RUNNING, &aic->state);
}

static struct as_io_context *alloc_as_io_context(void)
{
	struct as_io_context *ret;

	ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
	if (ret) {
		ret->dtor = free_as_io_context;
		ret->exit = exit_as_io_context;
		ret->state = 1 << AS_TASK_RUNNING;
		atomic_set(&ret->nr_queued, 0);
		atomic_set(&ret->nr_dispatched, 0);
		spin_lock_init(&ret->lock);
		ret->ttime_total = 0;
		ret->ttime_samples = 0;
		ret->ttime_mean = 0;
		ret->seek_total = 0;
		ret->seek_samples = 0;
		ret->seek_mean = 0;
	}

	return ret;
}

/*
 * If the current task has no AS IO context then create one and initialise it.
 * Then take a ref on the task's io context and return it.
 */
static struct io_context *as_get_io_context(void)
{
	struct io_context *ioc = get_io_context(GFP_ATOMIC);
	if (ioc && !ioc->aic) {
		ioc->aic = alloc_as_io_context();
		if (!ioc->aic) {
			put_io_context(ioc);
			ioc = NULL;
		}
	}
	return ioc;
}

static void as_put_io_context(struct as_rq *arq)
{
	struct as_io_context *aic;

	if (unlikely(!arq->io_context))
		return;

	aic = arq->io_context->aic;

	if (arq->is_sync == REQ_SYNC && aic) {
		spin_lock(&aic->lock);
		set_bit(AS_TASK_IORUNNING, &aic->state);
		aic->last_end_request = jiffies;
		spin_unlock(&aic->lock);
	}

	put_io_context(arq->io_context);
}

/*
 * the back merge hash support functions
 */
static const int as_hash_shift = 6;
#define AS_HASH_BLOCK(sec)	((sec) >> 3)
#define AS_HASH_FN(sec)		(hash_long(AS_HASH_BLOCK((sec)), as_hash_shift))
#define AS_HASH_ENTRIES		(1 << as_hash_shift)
#define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
#define list_entry_hash(ptr)	list_entry((ptr), struct as_rq, hash)

static inline void __as_del_arq_hash(struct as_rq *arq)
{
	arq->on_hash = 0;
	list_del_init(&arq->hash);
}

static inline void as_del_arq_hash(struct as_rq *arq)
{
	if (arq->on_hash)
		__as_del_arq_hash(arq);
}

static void as_add_arq_hash(struct as_data *ad, struct as_rq *arq)
{
	struct request *rq = arq->request;

	BUG_ON(arq->on_hash);

	arq->on_hash = 1;
	list_add(&arq->hash, &ad->hash[AS_HASH_FN(rq_hash_key(rq))]);
}

/*
 * move hot entry to front of chain
 */
static inline void as_hot_arq_hash(struct as_data *ad, struct as_rq *arq)
{
	struct request *rq = arq->request;
	struct list_head *head = &ad->hash[AS_HASH_FN(rq_hash_key(rq))];

	if (!arq->on_hash) {
		WARN_ON(1);
		return;
	}

	if (arq->hash.prev != head) {
		list_del(&arq->hash);
		list_add(&arq->hash, head);
	}
}

static struct request *as_find_arq_hash(struct as_data *ad, sector_t offset)
{
	struct list_head *hash_list = &ad->hash[AS_HASH_FN(offset)];
	struct list_head *entry, *next = hash_list->next;

	while ((entry = next) != hash_list) {
		struct as_rq *arq = list_entry_hash(entry);
		struct request *__rq = arq->request;

		next = entry->next;

		BUG_ON(!arq->on_hash);

		if (!rq_mergeable(__rq)) {
			as_del_arq_hash(arq);
			continue;
		}

		if (rq_hash_key(__rq) == offset)
			return __rq;
	}

	return NULL;
}

/*
 * rb tree support functions
 */
#define RB_NONE (2)
#define RB_EMPTY(root) ((root)->rb_node == NULL)
#define ON_RB(node) ((node)->rb_color != RB_NONE)
#define RB_CLEAR(node) ((node)->rb_color = RB_NONE)
#define rb_entry_arq(node) rb_entry((node), struct as_rq, rb_node)
#define ARQ_RB_ROOT(ad, arq) (&(ad)->sort_list[(arq)->is_sync])
#define rq_rb_key(rq) (rq)->sector

/*
 * as_find_first_arq finds the first (lowest sector numbered) request
 * for the specified data_dir. Used to sweep back to the start of the disk
 * (1-way elevator) after we process the last (highest sector) request.
 */
static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir)
{
	struct rb_node *n = ad->sort_list[data_dir].rb_node;

	if (n == NULL)
		return NULL;

	for (;;) {
		if (n->rb_left == NULL)
			return rb_entry_arq(n);

		n = n->rb_left;
	}
}

/*
 * Add the request to the rb tree if it is unique. If there is an alias (an
 * existing request against the same sector), which can happen when using
 * direct IO, then return the alias.
 */
static struct as_rq *as_add_arq_rb(struct as_data *ad, struct as_rq *arq)
{
	struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node;
	struct rb_node *parent = NULL;
	struct as_rq *__arq;
	struct request *rq = arq->request;

	arq->rb_key = rq_rb_key(rq);

	while (*p) {
		parent = *p;
		__arq = rb_entry_arq(parent);

		if (arq->rb_key < __arq->rb_key)
			p = &(*p)->rb_left;
		else if (arq->rb_key > __arq->rb_key)
			p = &(*p)->rb_right;
		else
			return __arq;
	}

	rb_link_node(&arq->rb_node, parent, p);
	rb_insert_color(&arq->rb_node, ARQ_RB_ROOT(ad, arq));

	return NULL;
}

static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq)
{
	if (!ON_RB(&arq->rb_node)) {
		WARN_ON(1);
		return;
	}

	rb_erase(&arq->rb_node, ARQ_RB_ROOT(ad, arq));
	RB_CLEAR(&arq->rb_node);
}

static struct request *
as_find_arq_rb(struct as_data *ad, sector_t sector, int data_dir)
{
	struct rb_node *n = ad->sort_list[data_dir].rb_node;
	struct as_rq *arq;

	while (n) {
		arq = rb_entry_arq(n);

		if (sector < arq->rb_key)
			n = n->rb_left;
		else if (sector > arq->rb_key)
			n = n->rb_right;
		else
			return arq->request;
	}

	return NULL;
}

/*
 * IO Scheduler proper
 */

#define MAXBACK (1024 * 1024)	/*
				 * Maximum distance the disk will go backward
				 * for a request.
				 */

#define BACK_PENALTY	2

/*
 * as_choose_req selects the preferred one of two requests of the same data_dir
 * ignoring time - eg. timeouts, which is the job of as_dispatch_request
 */
static struct as_rq *
as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2)
{
	int data_dir;
	sector_t last, s1, s2, d1, d2;
	int r1_wrap=0, r2_wrap=0;	/* requests are behind the disk head */
	const sector_t maxback = MAXBACK;

	if (arq1 == NULL || arq1 == arq2)
		return arq2;
	if (arq2 == NULL)
		return arq1;

	data_dir = arq1->is_sync;

	last = ad->last_sector[data_dir];
	s1 = arq1->request->sector;
	s2 = arq2->request->sector;

	BUG_ON(data_dir != arq2->is_sync);

	/*
	 * Strict one way elevator _except_ in the case where we allow
	 * short backward seeks which are biased as twice the cost of a
	 * similar forward seek.
	 */
	if (s1 >= last)
		d1 = s1 - last;
	else if (s1+maxback >= last)
		d1 = (last - s1)*BACK_PENALTY;
	else {
		r1_wrap = 1;
		d1 = 0; /* shut up, gcc */
	}

	if (s2 >= last)
		d2 = s2 - last;
	else if (s2+maxback >= last)
		d2 = (last - s2)*BACK_PENALTY;
	else {
		r2_wrap = 1;
		d2 = 0;
	}

	/* Found required data */
	if (!r1_wrap && r2_wrap)
		return arq1;
	else if (!r2_wrap && r1_wrap)
		return arq2;
	else if (r1_wrap && r2_wrap) {
		/* both behind the head */
		if (s1 <= s2)
			return arq1;
		else
			return arq2;
	}

	/* Both requests in front of the head */
	if (d1 < d2)
		return arq1;
	else if (d2 < d1)
		return arq2;
	else {
		if (s1 >= s2)
			return arq1;
		else
			return arq2;
	}
}

/*
 * as_find_next_arq finds the next request after @prev in elevator order.
 * this with as_choose_req form the basis for how the scheduler chooses
 * what request to process next. Anticipation works on top of this.
 */
static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last)
{
	const int data_dir = last->is_sync;
	struct as_rq *ret;
	struct rb_node *rbnext = rb_next(&last->rb_node);
	struct rb_node *rbprev = rb_prev(&last->rb_node);
	struct as_rq *arq_next, *arq_prev;

	BUG_ON(!ON_RB(&last->rb_node));

	if (rbprev)
		arq_prev = rb_entry_arq(rbprev);
	else
		arq_prev = NULL;

	if (rbnext)
		arq_next = rb_entry_arq(rbnext);
	else {
		arq_next = as_find_first_arq(ad, data_dir);
		if (arq_next == last)
			arq_next = NULL;
	}

	ret = as_choose_req(ad, arq_next, arq_prev);

	return ret;
}

/*
 * anticipatory scheduling functions follow
 */

/*
 * as_antic_expired tells us when we have anticipated too long.
 * The funny "absolute difference" math on the elapsed time is to handle
 * jiffy wraps, and disks which have been idle for 0x80000000 jiffies.
 */
static int as_antic_expired(struct as_data *ad)
{
	long delta_jif;

	delta_jif = jiffies - ad->antic_start;
	if (unlikely(delta_jif < 0))
		delta_jif = -delta_jif;
	if (delta_jif < ad->antic_expire)
		return 0;

	return 1;
}

/*
 * as_antic_waitnext starts anticipating that a nice request will soon be
 * submitted. See also as_antic_waitreq
 */
static void as_antic_waitnext(struct as_data *ad)
{
	unsigned long timeout;

	BUG_ON(ad->antic_status != ANTIC_OFF
			&& ad->antic_status != ANTIC_WAIT_REQ);

	timeout = ad->antic_start + ad->antic_expire;

	mod_timer(&ad->antic_timer, timeout);

	ad->antic_status = ANTIC_WAIT_NEXT;
}

/*
 * as_antic_waitreq starts anticipating. We don't start timing the anticipation
 * until the request that we're anticipating on has finished. This means we
 * are timing from when the candidate process wakes up hopefully.
 */
static void as_antic_waitreq(struct as_data *ad)
{
	BUG_ON(ad->antic_status == ANTIC_FINISHED);
	if (ad->antic_status == ANTIC_OFF) {
		if (!ad->io_context || ad->ioc_finished)
			as_antic_waitnext(ad);
		else
			ad->antic_status = ANTIC_WAIT_REQ;
	}
}

/*
 * This is called directly by the functions in this file to stop anticipation.
 * We kill the timer and schedule a call to the request_fn asap.
 */
static void as_antic_stop(struct as_data *ad)
{
	int status = ad->antic_status;

	if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) {
		if (status == ANTIC_WAIT_NEXT)
			del_timer(&ad->antic_timer);
		ad->antic_status = ANTIC_FINISHED;
		/* see as_work_handler */
		kblockd_schedule_work(&ad->antic_work);
	}
}

/*
 * as_antic_timeout is the timer function set by as_antic_waitnext.
 */
static void as_antic_timeout(unsigned long data)
{
	struct request_queue *q = (struct request_queue *)data;
	struct as_data *ad = q->elevator->elevator_data;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	if (ad->antic_status == ANTIC_WAIT_REQ
			|| ad->antic_status == ANTIC_WAIT_NEXT) {
		struct as_io_context *aic = ad->io_context->aic;

		ad->antic_status = ANTIC_FINISHED;
		kblockd_schedule_work(&ad->antic_work);

		if (aic->ttime_samples == 0) {
			/* process anticipated on has exited or timed out */
			ad->exit_prob = (7*ad->exit_prob + 256)/8;
		}
		if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
			/* process not "saved" by a cooperating request */
			ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8;
		}
	}
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic,
				unsigned long ttime)
{
	/* fixed point: 1.0 == 1<<8 */
	if (aic->ttime_samples == 0) {
		ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8;
		ad->new_ttime_mean = ad->new_ttime_total / 256;

		ad->exit_prob = (7*ad->exit_prob)/8;
	}
	aic->ttime_samples = (7*aic->ttime_samples + 256) / 8;
	aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8;
	aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples;
}

static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic,
				sector_t sdist)
{
	u64 total;

	if (aic->seek_samples == 0) {
		ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8;
		ad->new_seek_mean = ad->new_seek_total / 256;
	}

	/*
	 * Don't allow the seek distance to get too large from the
	 * odd fragment, pagein, etc
	 */
	if (aic->seek_samples <= 60) /* second&third seek */
		sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024);
	else
		sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64);

	aic->seek_samples = (7*aic->seek_samples + 256) / 8;
	aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8;
	total = aic->seek_total + (aic->seek_samples/2);
	do_div(total, aic->seek_samples);
	aic->seek_mean = (sector_t)total;
}

/*
 * as_update_iohist keeps a decaying histogram of IO thinktimes, and
 * updates @aic->ttime_mean based on that. It is called when a new
 * request is queued.
 */
static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
				struct request *rq)
{
	struct as_rq *arq = RQ_DATA(rq);
	int data_dir = arq->is_sync;
	unsigned long thinktime = 0;
	sector_t seek_dist;

	if (aic == NULL)
		return;

	if (data_dir == REQ_SYNC) {
		unsigned long in_flight = atomic_read(&aic->nr_queued)
					+ atomic_read(&aic->nr_dispatched);
		spin_lock(&aic->lock);
		if (test_bit(AS_TASK_IORUNNING, &aic->state) ||
			test_bit(AS_TASK_IOSTARTED, &aic->state)) {
			/* Calculate read -> read thinktime */
			if (test_bit(AS_TASK_IORUNNING, &aic->state)
							&& in_flight == 0) {
				thinktime = jiffies - aic->last_end_request;
				thinktime = min(thinktime, MAX_THINKTIME-1);
			}
			as_update_thinktime(ad, aic, thinktime);

			/* Calculate read -> read seek distance */
			if (aic->last_request_pos < rq->sector)
				seek_dist = rq->sector - aic->last_request_pos;
			else
				seek_dist = aic->last_request_pos - rq->sector;
			as_update_seekdist(ad, aic, seek_dist);
		}
		aic->last_request_pos = rq->sector + rq->nr_sectors;
		set_bit(AS_TASK_IOSTARTED, &aic->state);
		spin_unlock(&aic->lock);
	}
}

/*
 * as_close_req decides if one request is considered "close" to the
 * previous one issued.
 */
static int as_close_req(struct as_data *ad, struct as_io_context *aic,
			struct as_rq *arq)
{
	unsigned long delay;	/* milliseconds */
	sector_t last = ad->last_sector[ad->batch_data_dir];
	sector_t next = arq->request->sector;
	sector_t delta;	/* acceptable close offset (in sectors) */
	sector_t s;

	if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished)
		delay = 0;
	else
		delay = ((jiffies - ad->antic_start) * 1000) / HZ;

	if (delay == 0)
		delta = 8192;
	else if (delay <= 20 && delay <= ad->antic_expire)
		delta = 8192 << delay;
	else
		return 1;

	if ((last <= next + (delta>>1)) && (next <= last + delta))
		return 1;

	if (last < next)
		s = next - last;
	else
		s = last - next;

	if (aic->seek_samples == 0) {
		/*
		 * Process has just started IO. Use past statistics to
		 * gauge success possibility
		 */
		if (ad->new_seek_mean > s) {
			/* this request is better than what we're expecting */
			return 1;
		}

	} else {
		if (aic->seek_mean > s) {
			/* this request is better than what we're expecting */
			return 1;
		}
	}

	return 0;
}

/*
 * as_can_break_anticipation returns true if we have been anticipating this
 * request.
 *
 * It also returns true if the process against which we are anticipating
 * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to
 * dispatch it ASAP, because we know that application will not be submitting
 * any new reads.
 *
 * If the task which has submitted the request has exited, break anticipation.
 *
 * If this task has queued some other IO, do not enter anticipation.
 */
static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq)
{
	struct io_context *ioc;
	struct as_io_context *aic;

	ioc = ad->io_context;
	BUG_ON(!ioc);

	if (arq && ioc == arq->io_context) {
		/* request from same process */
		return 1;
	}

	if (ad->ioc_finished && as_antic_expired(ad)) {
		/*
		 * In this situation status should really be FINISHED,
		 * however the timer hasn't had the chance to run yet.
		 */
		return 1;
	}

	aic = ioc->aic;
	if (!aic)
		return 0;

	if (atomic_read(&aic->nr_queued) > 0) {
		/* process has more requests queued */
		return 1;
	}

	if (atomic_read(&aic->nr_dispatched) > 0) {
		/* process has more requests dispatched */
		return 1;
	}

	if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, aic, arq)) {
		/*
		 * Found a close request that is not one of ours.
		 *
		 * This makes close requests from another process update
		 * our IO history. Is generally useful when there are
		 * two or more cooperating processes working in the same
		 * area.
		 */
		if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
			if (aic->ttime_samples == 0)
				ad->exit_prob = (7*ad->exit_prob + 256)/8;

			ad->exit_no_coop = (7*ad->exit_no_coop)/8;
		}

		as_update_iohist(ad, aic, arq->request);
		return 1;
	}

	if (!test_bit(AS_TASK_RUNNING, &aic->state)) {
		/* process anticipated on has exited */
		if (aic->ttime_samples == 0)
			ad->exit_prob = (7*ad->exit_prob + 256)/8;

		if (ad->exit_no_coop > 128)
			return 1;
	}

	if (aic->ttime_samples == 0) {
		if (ad->new_ttime_mean > ad->antic_expire)
			return 1;
		if (ad->exit_prob * ad->exit_no_coop > 128*256)
			return 1;
	} else if (aic->ttime_mean > ad->antic_expire) {
		/* the process thinks too much between requests */
		return 1;
	}

	return 0;
}

/*
 * as_can_anticipate indicates whether we should either run arq
 * or keep anticipating a better request.
 */
static int as_can_anticipate(struct as_data *ad, struct as_rq *arq)
{
	if (!ad->io_context)
		/*
		 * Last request submitted was a write
		 */
		return 0;

	if (ad->antic_status == ANTIC_FINISHED)
		/*
		 * Don't restart if we have just finished. Run the next request
		 */
		return 0;

	if (as_can_break_anticipation(ad, arq))
		/*
		 * This request is a good candidate. Don't keep anticipating,
		 * run it.
		 */
		return 0;

	/*
	 * OK from here, we haven't finished, and don't have a decent request!
	 * Status is either ANTIC_OFF so start waiting,
	 * ANTIC_WAIT_REQ so continue waiting for request to finish
	 * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request.
	 */

	return 1;
}

/*
 * as_update_arq must be called whenever a request (arq) is added to
 * the sort_list. This function keeps caches up to date, and checks if the
 * request might be one we are "anticipating"
 */
static void as_update_arq(struct as_data *ad, struct as_rq *arq)
{
	const int data_dir = arq->is_sync;

	/* keep the next_arq cache up to date */
	ad->next_arq[data_dir] = as_choose_req(ad, arq, ad->next_arq[data_dir]);

	/*
	 * have we been anticipating this request?
	 * or does it come from the same process as the one we are anticipating
	 * for?
	 */
	if (ad->antic_status == ANTIC_WAIT_REQ
			|| ad->antic_status == ANTIC_WAIT_NEXT) {
		if (as_can_break_anticipation(ad, arq))
			as_antic_stop(ad);
	}
}

/*
 * Gathers timings and resizes the write batch automatically
 */
static void update_write_batch(struct as_data *ad)
{
	unsigned long batch = ad->batch_expire[REQ_ASYNC];
	long write_time;

	write_time = (jiffies - ad->current_batch_expires) + batch;
	if (write_time < 0)
		write_time = 0;

	if (write_time > batch && !ad->write_batch_idled) {
		if (write_time > batch * 3)
			ad->write_batch_count /= 2;
		else
			ad->write_batch_count--;
	} else if (write_time < batch && ad->current_write_count == 0) {
		if (batch > write_time * 3)
			ad->write_batch_count *= 2;
		else
			ad->write_batch_count++;
	}

	if (ad->write_batch_count < 1)
		ad->write_batch_count = 1;
}

/*
 * as_completed_request is to be called when a request has completed and
 * returned something to the requesting process, be it an error or data.
 */
static void as_completed_request(request_queue_t *q, struct request *rq)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq = RQ_DATA(rq);

	WARN_ON(!list_empty(&rq->queuelist));

	if (arq->state != AS_RQ_REMOVED) {
		printk("arq->state %d\n", arq->state);
		WARN_ON(1);
		goto out;
	}

	if (ad->changed_batch && ad->nr_dispatched == 1) {
		kblockd_schedule_work(&ad->antic_work);
		ad->changed_batch = 0;

		if (ad->batch_data_dir == REQ_SYNC)
			ad->new_batch = 1;
	}
	WARN_ON(ad->nr_dispatched == 0);
	ad->nr_dispatched--;

	/*
	 * Start counting the batch from when a request of that direction is
	 * actually serviced. This should help devices with big TCQ windows
	 * and writeback caches
	 */
	if (ad->new_batch && ad->batch_data_dir == arq->is_sync) {
		update_write_batch(ad);
		ad->current_batch_expires = jiffies +
				ad->batch_expire[REQ_SYNC];
		ad->new_batch = 0;
	}

	if (ad->io_context == arq->io_context && ad->io_context) {
		ad->antic_start = jiffies;
		ad->ioc_finished = 1;
		if (ad->antic_status == ANTIC_WAIT_REQ) {
			/*
			 * We were waiting on this request, now anticipate
			 * the next one
			 */
			as_antic_waitnext(ad);
		}
	}

	as_put_io_context(arq);
out:
	arq->state = AS_RQ_POSTSCHED;
}

/*
 * as_remove_queued_request removes a request from the pre dispatch queue
 * without updating refcounts. It is expected the caller will drop the
 * reference unless it replaces the request at some part of the elevator (ie.
 * the dispatch queue)
 */
static void as_remove_queued_request(request_queue_t *q, struct request *rq)
{
	struct as_rq *arq = RQ_DATA(rq);
	const int data_dir = arq->is_sync;
	struct as_data *ad = q->elevator->elevator_data;

	WARN_ON(arq->state != AS_RQ_QUEUED);

	if (arq->io_context && arq->io_context->aic) {
		BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued));
		atomic_dec(&arq->io_context->aic->nr_queued);
	}

	/*
	 * Update the "next_arq" cache if we are about to remove its
	 * entry
	 */
	if (ad->next_arq[data_dir] == arq)
		ad->next_arq[data_dir] = as_find_next_arq(ad, arq);

	list_del_init(&arq->fifo);
	as_del_arq_hash(arq);
	as_del_arq_rb(ad, arq);
}

/*
 * as_fifo_expired returns 0 if there are no expired reads on the fifo,
 * 1 otherwise. It is ratelimited so that we only perform the check once per
 * `fifo_expire' interval. Otherwise a large number of expired requests
 * would create a hopeless seekstorm.
 *
 * See as_antic_expired comment.
 */
static int as_fifo_expired(struct as_data *ad, int adir)
{
	struct as_rq *arq;
	long delta_jif;

	delta_jif = jiffies - ad->last_check_fifo[adir];
	if (unlikely(delta_jif < 0))
		delta_jif = -delta_jif;
	if (delta_jif < ad->fifo_expire[adir])
		return 0;

	ad->last_check_fifo[adir] = jiffies;

	if (list_empty(&ad->fifo_list[adir]))
		return 0;

	arq = list_entry_fifo(ad->fifo_list[adir].next);

	return time_after(jiffies, arq->expires);
}

/*
 * as_batch_expired returns true if the current batch has expired. A batch
 * is a set of reads or a set of writes.
 */
static inline int as_batch_expired(struct as_data *ad)
{
	if (ad->changed_batch || ad->new_batch)
		return 0;

	if (ad->batch_data_dir == REQ_SYNC)
		/* TODO! add a check so a complete fifo gets written? */
		return time_after(jiffies, ad->current_batch_expires);

	return time_after(jiffies, ad->current_batch_expires)
		|| ad->current_write_count == 0;
}

/*
 * move an entry to dispatch queue
 */
static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq)
{
	struct request *rq = arq->request;
	const int data_dir = arq->is_sync;

	BUG_ON(!ON_RB(&arq->rb_node));

	as_antic_stop(ad);
	ad->antic_status = ANTIC_OFF;

	/*
	 * This has to be set in order to be correctly updated by
	 * as_find_next_arq
	 */
	ad->last_sector[data_dir] = rq->sector + rq->nr_sectors;

	if (data_dir == REQ_SYNC) {
		/* In case we have to anticipate after this */
		copy_io_context(&ad->io_context, &arq->io_context);
	} else {
		if (ad->io_context) {
			put_io_context(ad->io_context);
			ad->io_context = NULL;
		}

		if (ad->current_write_count != 0)
			ad->current_write_count--;
	}
	ad->ioc_finished = 0;

	ad->next_arq[data_dir] = as_find_next_arq(ad, arq);

	/*
	 * take it off the sort and fifo list, add to dispatch queue
	 */
	while (!list_empty(&rq->queuelist)) {
		struct request *__rq = list_entry_rq(rq->queuelist.next);
		struct as_rq *__arq = RQ_DATA(__rq);

		list_del(&__rq->queuelist);

		elv_dispatch_add_tail(ad->q, __rq);

		if (__arq->io_context && __arq->io_context->aic)
			atomic_inc(&__arq->io_context->aic->nr_dispatched);

		WARN_ON(__arq->state != AS_RQ_QUEUED);
		__arq->state = AS_RQ_DISPATCHED;

		ad->nr_dispatched++;
	}

	as_remove_queued_request(ad->q, rq);
	WARN_ON(arq->state != AS_RQ_QUEUED);

	elv_dispatch_sort(ad->q, rq);

	arq->state = AS_RQ_DISPATCHED;
	if (arq->io_context && arq->io_context->aic)
		atomic_inc(&arq->io_context->aic->nr_dispatched);
	ad->nr_dispatched++;
}

/*
 * as_dispatch_request selects the best request according to
 * read/write expire, batch expire, etc, and moves it to the dispatch
 * queue. Returns 1 if a request was found, 0 otherwise.
 */
static int as_dispatch_request(request_queue_t *q, int force)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq;
	const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]);
	const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]);

	if (unlikely(force)) {
		/*
		 * Forced dispatch, accounting is useless. Reset
		 * accounting states and dump fifo_lists. Note that
		 * batch_data_dir is reset to REQ_SYNC to avoid
		 * screwing write batch accounting as write batch
		 * accounting occurs on W->R transition.
		 */
		int dispatched = 0;

		ad->batch_data_dir = REQ_SYNC;
		ad->changed_batch = 0;
		ad->new_batch = 0;

		while (ad->next_arq[REQ_SYNC]) {
			as_move_to_dispatch(ad, ad->next_arq[REQ_SYNC]);
			dispatched++;
		}
		ad->last_check_fifo[REQ_SYNC] = jiffies;

		while (ad->next_arq[REQ_ASYNC]) {
			as_move_to_dispatch(ad, ad->next_arq[REQ_ASYNC]);
			dispatched++;
		}
		ad->last_check_fifo[REQ_ASYNC] = jiffies;

		return dispatched;
	}

	/* Signal that the write batch was uncontended, so we can't time it */
	if (ad->batch_data_dir == REQ_ASYNC && !reads) {
		if (ad->current_write_count == 0 || !writes)
			ad->write_batch_idled = 1;
	}

	if (!(reads || writes)
		|| ad->antic_status == ANTIC_WAIT_REQ
		|| ad->antic_status == ANTIC_WAIT_NEXT
		|| ad->changed_batch)
		return 0;

	if (!(reads && writes && as_batch_expired(ad))) {
		/*
		 * batch is still running or no reads or no writes
		 */
		arq = ad->next_arq[ad->batch_data_dir];

		if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) {
			if (as_fifo_expired(ad, REQ_SYNC))
				goto fifo_expired;

			if (as_can_anticipate(ad, arq)) {
				as_antic_waitreq(ad);
				return 0;
			}
		}

		if (arq) {
			/* we have a "next request" */
			if (reads && !writes)
				ad->current_batch_expires =
					jiffies + ad->batch_expire[REQ_SYNC];
			goto dispatch_request;
		}
	}

	/*
	 * at this point we are not running a batch. select the appropriate
	 * data direction (read / write)
	 */

	if (reads) {
		BUG_ON(RB_EMPTY(&ad->sort_list[REQ_SYNC]));

		if (writes && ad->batch_data_dir == REQ_SYNC)
			/*
			 * Last batch was a read, switch to writes
			 */
			goto dispatch_writes;

		if (ad->batch_data_dir == REQ_ASYNC) {
			WARN_ON(ad->new_batch);
			ad->changed_batch = 1;
		}
		ad->batch_data_dir = REQ_SYNC;
		arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
		ad->last_check_fifo[ad->batch_data_dir] = jiffies;
		goto dispatch_request;
	}

	/*
	 * the last batch was a read
	 */

	if (writes) {
dispatch_writes:
		BUG_ON(RB_EMPTY(&ad->sort_list[REQ_ASYNC]));

		if (ad->batch_data_dir == REQ_SYNC) {
			ad->changed_batch = 1;

			/*
			 * new_batch might be 1 when the queue runs out of
			 * reads. A subsequent submission of a write might
			 * cause a change of batch before the read is finished.
			 */
			ad->new_batch = 0;
		}
		ad->batch_data_dir = REQ_ASYNC;
		ad->current_write_count = ad->write_batch_count;
		ad->write_batch_idled = 0;
		arq = ad->next_arq[ad->batch_data_dir];
		goto dispatch_request;
	}

	BUG();
	return 0;

dispatch_request:
	/*
	 * If a request has expired, service it.
	 */

	if (as_fifo_expired(ad, ad->batch_data_dir)) {
fifo_expired:
		arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next);
		BUG_ON(arq == NULL);
	}

	if (ad->changed_batch) {
		WARN_ON(ad->new_batch);

		if (ad->nr_dispatched)
			return 0;

		if (ad->batch_data_dir == REQ_ASYNC)
			ad->current_batch_expires = jiffies +
				ad->batch_expire[REQ_ASYNC];
		else
			ad->new_batch = 1;

		ad->changed_batch = 0;
	}

	/*
	 * arq is the selected appropriate request.
	 */
	as_move_to_dispatch(ad, arq);

	return 1;
}

/*
 * Add arq to a list behind alias
 */
static inline void
as_add_aliased_request(struct as_data *ad, struct as_rq *arq,
				struct as_rq *alias)
{
	struct request *req = arq->request;
	struct list_head *insert = alias->request->queuelist.prev;

	/*
	 * Transfer list of aliases
	 */
	while (!list_empty(&req->queuelist)) {
		struct request *__rq = list_entry_rq(req->queuelist.next);
		struct as_rq *__arq = RQ_DATA(__rq);

		list_move_tail(&__rq->queuelist, &alias->request->queuelist);

		WARN_ON(__arq->state != AS_RQ_QUEUED);
	}

	/*
	 * Another request with the same start sector on the rbtree.
	 * Link this request to that sector. They are untangled in
	 * as_move_to_dispatch
	 */
	list_add(&arq->request->queuelist, insert);

	/*
	 * Don't want to have to handle merges.
	 */
	as_del_arq_hash(arq);
	arq->request->flags |= REQ_NOMERGE;
}

/*
 * add arq to rbtree and fifo
 */
static void as_add_request(request_queue_t *q, struct request *rq)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq = RQ_DATA(rq);
	struct as_rq *alias;
	int data_dir;

	arq->state = AS_RQ_NEW;

	if (rq_data_dir(arq->request) == READ
			|| current->flags&PF_SYNCWRITE)
		arq->is_sync = 1;
	else
		arq->is_sync = 0;
	data_dir = arq->is_sync;

	arq->io_context = as_get_io_context();

	if (arq->io_context) {
		as_update_iohist(ad, arq->io_context->aic, arq->request);
		atomic_inc(&arq->io_context->aic->nr_queued);
	}

	alias = as_add_arq_rb(ad, arq);
	if (!alias) {
		/*
		 * set expire time (only used for reads) and add to fifo list
		 */
		arq->expires = jiffies + ad->fifo_expire[data_dir];
		list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]);

		if (rq_mergeable(arq->request))
			as_add_arq_hash(ad, arq);
		as_update_arq(ad, arq); /* keep state machine up to date */

	} else {
		as_add_aliased_request(ad, arq, alias);

		/*
		 * have we been anticipating this request?
		 * or does it come from the same process as the one we are
		 * anticipating for?
		 */
		if (ad->antic_status == ANTIC_WAIT_REQ
				|| ad->antic_status == ANTIC_WAIT_NEXT) {
			if (as_can_break_anticipation(ad, arq))
				as_antic_stop(ad);
		}
	}

	arq->state = AS_RQ_QUEUED;
}

static void as_activate_request(request_queue_t *q, struct request *rq)
{
	struct as_rq *arq = RQ_DATA(rq);

	WARN_ON(arq->state != AS_RQ_DISPATCHED);
	arq->state = AS_RQ_REMOVED;
	if (arq->io_context && arq->io_context->aic)
		atomic_dec(&arq->io_context->aic->nr_dispatched);
}

static void as_deactivate_request(request_queue_t *q, struct request *rq)
{
	struct as_rq *arq = RQ_DATA(rq);

	WARN_ON(arq->state != AS_RQ_REMOVED);
	arq->state = AS_RQ_DISPATCHED;
	if (arq->io_context && arq->io_context->aic)
		atomic_inc(&arq->io_context->aic->nr_dispatched);
}

/*
 * as_queue_empty tells us if there are requests left in the device.
 * It may not be the case that a driver can get the next request even if the
 * queue is not empty - it is used in the block layer to check for plugging
 * and merging opportunities
 */
static int as_queue_empty(request_queue_t *q)
{
	struct as_data *ad = q->elevator->elevator_data;

	return list_empty(&ad->fifo_list[REQ_ASYNC])
		&& list_empty(&ad->fifo_list[REQ_SYNC]);
}

static struct request *as_former_request(request_queue_t *q,
					struct request *rq)
{
	struct as_rq *arq = RQ_DATA(rq);
	struct rb_node *rbprev = rb_prev(&arq->rb_node);
	struct request *ret = NULL;

	if (rbprev)
		ret = rb_entry_arq(rbprev)->request;

	return ret;
}

static struct request *as_latter_request(request_queue_t *q,
					struct request *rq)
{
	struct as_rq *arq = RQ_DATA(rq);
	struct rb_node *rbnext = rb_next(&arq->rb_node);
	struct request *ret = NULL;

	if (rbnext)
		ret = rb_entry_arq(rbnext)->request;

	return ret;
}

static int
as_merge(request_queue_t *q, struct request **req, struct bio *bio)
{
	struct as_data *ad = q->elevator->elevator_data;
	sector_t rb_key = bio->bi_sector + bio_sectors(bio);
	struct request *__rq;
	int ret;

	/*
	 * see if the merge hash can satisfy a back merge
	 */
	__rq = as_find_arq_hash(ad, bio->bi_sector);
	if (__rq) {
		BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);

		if (elv_rq_merge_ok(__rq, bio)) {
			ret = ELEVATOR_BACK_MERGE;
			goto out;
		}
	}

	/*
	 * check for front merge
	 */
	__rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio));
	if (__rq) {
		BUG_ON(rb_key != rq_rb_key(__rq));

		if (elv_rq_merge_ok(__rq, bio)) {
			ret = ELEVATOR_FRONT_MERGE;
			goto out;
		}
	}

	return ELEVATOR_NO_MERGE;
out:
	if (ret) {
		if (rq_mergeable(__rq))
			as_hot_arq_hash(ad, RQ_DATA(__rq));
	}
	*req = __rq;
	return ret;
}

static void as_merged_request(request_queue_t *q, struct request *req)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq = RQ_DATA(req);

	/*
	 * hash always needs to be repositioned, key is end sector
	 */
	as_del_arq_hash(arq);
	as_add_arq_hash(ad, arq);

	/*
	 * if the merge was a front merge, we need to reposition request
	 */
	if (rq_rb_key(req) != arq->rb_key) {
		struct as_rq *alias, *next_arq = NULL;

		if (ad->next_arq[arq->is_sync] == arq)
			next_arq = as_find_next_arq(ad, arq);

		/*
		 * Note! We should really be moving any old aliased requests
		 * off this request and try to insert them into the rbtree. We
		 * currently don't bother. Ditto the next function.
		 */
		as_del_arq_rb(ad, arq);
		if ((alias = as_add_arq_rb(ad, arq))) {
			list_del_init(&arq->fifo);
			as_add_aliased_request(ad, arq, alias);
			if (next_arq)
				ad->next_arq[arq->is_sync] = next_arq;
		}
		/*
		 * Note! At this stage of this and the next function, our next
		 * request may not be optimal - eg the request may have "grown"
		 * behind the disk head. We currently don't bother adjusting.
		 */
	}
}

static void as_merged_requests(request_queue_t *q, struct request *req,
				struct request *next)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq = RQ_DATA(req);
	struct as_rq *anext = RQ_DATA(next);

	BUG_ON(!arq);
	BUG_ON(!anext);

	/*
	 * reposition arq (this is the merged request) in hash, and in rbtree
	 * in case of a front merge
	 */
	as_del_arq_hash(arq);
	as_add_arq_hash(ad, arq);

	if (rq_rb_key(req) != arq->rb_key) {
		struct as_rq *alias, *next_arq = NULL;

		if (ad->next_arq[arq->is_sync] == arq)
			next_arq = as_find_next_arq(ad, arq);

		as_del_arq_rb(ad, arq);
		if ((alias = as_add_arq_rb(ad, arq))) {
			list_del_init(&arq->fifo);
			as_add_aliased_request(ad, arq, alias);
			if (next_arq)
				ad->next_arq[arq->is_sync] = next_arq;
		}
	}

	/*
	 * if anext expires before arq, assign its expire time to arq
	 * and move into anext position (anext will be deleted) in fifo
	 */
	if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) {
		if (time_before(anext->expires, arq->expires)) {
			list_move(&arq->fifo, &anext->fifo);
			arq->expires = anext->expires;
			/*
			 * Don't copy here but swap, because when anext is
			 * removed below, it must contain the unused context
			 */
			swap_io_context(&arq->io_context, &anext->io_context);
		}
	}

	/*
	 * Transfer list of aliases
	 */
	while (!list_empty(&next->queuelist)) {
		struct request *__rq = list_entry_rq(next->queuelist.next);
		struct as_rq *__arq = RQ_DATA(__rq);

		list_move_tail(&__rq->queuelist, &req->queuelist);

		WARN_ON(__arq->state != AS_RQ_QUEUED);
	}

	/*
	 * kill knowledge of next, this one is a goner
	 */
	as_remove_queued_request(q, next);
	as_put_io_context(anext);

	anext->state = AS_RQ_MERGED;
}

/*
 * This is executed in a "deferred" process context, by kblockd. It calls the
 * driver's request_fn so the driver can submit that request.
 *
 * IMPORTANT! This guy will reenter the elevator, so set up all queue global
 * state before calling, and don't rely on any state over calls.
 *
 * FIXME! dispatch queue is not a queue at all!
 */
static void as_work_handler(void *data)
{
	struct request_queue *q = data;
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	if (!as_queue_empty(q))
		q->request_fn(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void as_put_request(request_queue_t *q, struct request *rq)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq = RQ_DATA(rq);

	if (!arq) {
		WARN_ON(1);
		return;
	}

	if (unlikely(arq->state != AS_RQ_POSTSCHED &&
		     arq->state != AS_RQ_PRESCHED &&
		     arq->state != AS_RQ_MERGED)) {
		printk("arq->state %d\n", arq->state);
		WARN_ON(1);
	}

	mempool_free(arq, ad->arq_pool);
	rq->elevator_private = NULL;
}

static int as_set_request(request_queue_t *q, struct request *rq,
			  struct bio *bio, gfp_t gfp_mask)
{
	struct as_data *ad = q->elevator->elevator_data;
	struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask);

	if (arq) {
		memset(arq, 0, sizeof(*arq));
		RB_CLEAR(&arq->rb_node);
		arq->request = rq;
		arq->state = AS_RQ_PRESCHED;
		arq->io_context = NULL;
		INIT_LIST_HEAD(&arq->hash);
		arq->on_hash = 0;
		INIT_LIST_HEAD(&arq->fifo);
		rq->elevator_private = arq;
		return 0;
	}

	return 1;
}

static int as_may_queue(request_queue_t *q, int rw, struct bio *bio)
{
	int ret = ELV_MQUEUE_MAY;
	struct as_data *ad = q->elevator->elevator_data;
	struct io_context *ioc;
	if (ad->antic_status == ANTIC_WAIT_REQ ||
	    ad->antic_status == ANTIC_WAIT_NEXT) {
		ioc = as_get_io_context();
		if (ad->io_context == ioc)
			ret = ELV_MQUEUE_MUST;
		put_io_context(ioc);
	}

	return ret;
}

static void as_exit_queue(elevator_t *e)
{
	struct as_data *ad = e->elevator_data;

	del_timer_sync(&ad->antic_timer);
	kblockd_flush();

	BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC]));
	BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC]));

	mempool_destroy(ad->arq_pool);
	put_io_context(ad->io_context);
	kfree(ad->hash);
	kfree(ad);
}

/*
 * initialize elevator private data (as_data), and alloc a arq for
 * each request on the free lists
 */
static int as_init_queue(request_queue_t *q, elevator_t *e)
{
	struct as_data *ad;
	int i;

	if (!arq_pool)
		return -ENOMEM;

	ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node);
	if (!ad)
		return -ENOMEM;
	memset(ad, 0, sizeof(*ad));

	ad->q = q; /* Identify what queue the data belongs to */

	ad->hash = kmalloc_node(sizeof(struct list_head)*AS_HASH_ENTRIES,
				GFP_KERNEL, q->node);
	if (!ad->hash) {
		kfree(ad);
		return -ENOMEM;
	}

	ad->arq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
				mempool_free_slab, arq_pool, q->node);
	if (!ad->arq_pool) {
		kfree(ad->hash);
		kfree(ad);
		return -ENOMEM;
	}

	/* anticipatory scheduling helpers */
	ad->antic_timer.function = as_antic_timeout;
	ad->antic_timer.data = (unsigned long)q;
	init_timer(&ad->antic_timer);
	INIT_WORK(&ad->antic_work, as_work_handler, q);

	for (i = 0; i < AS_HASH_ENTRIES; i++)
		INIT_LIST_HEAD(&ad->hash[i]);

	INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]);
	INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]);
	ad->sort_list[REQ_SYNC] = RB_ROOT;
	ad->sort_list[REQ_ASYNC] = RB_ROOT;
	ad->fifo_expire[REQ_SYNC] = default_read_expire;
	ad->fifo_expire[REQ_ASYNC] = default_write_expire;
	ad->antic_expire = default_antic_expire;
	ad->batch_expire[REQ_SYNC] = default_read_batch_expire;
	ad->batch_expire[REQ_ASYNC] = default_write_batch_expire;
	e->elevator_data = ad;

	ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC];
	ad->write_batch_count = ad->batch_expire[REQ_ASYNC] / 10;
	if (ad->write_batch_count < 2)
		ad->write_batch_count = 2;

	return 0;
}

/*
 * sysfs parts below
 */
struct as_fs_entry {
	struct attribute attr;
	ssize_t (*show)(struct as_data *, char *);
	ssize_t (*store)(struct as_data *, const char *, size_t);
};

static ssize_t
as_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t
as_var_store(unsigned long *var, const char *page, size_t count)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
	return count;
}

static ssize_t as_est_show(struct as_data *ad, char *page)
{
	int pos = 0;

	pos += sprintf(page+pos, "%lu %% exit probability\n",
			100*ad->exit_prob/256);
	pos += sprintf(page+pos, "%lu %% probability of exiting without a "
			"cooperating process submitting IO\n",
			100*ad->exit_no_coop/256);
	pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean);
	pos += sprintf(page+pos, "%llu sectors new seek distance\n",
			(unsigned long long)ad->new_seek_mean);

	return pos;
}

#define SHOW_FUNCTION(__FUNC, __VAR)				\
static ssize_t __FUNC(struct as_data *ad, char *page)		\
{								\
	return as_var_show(jiffies_to_msecs((__VAR)), (page));	\
}
SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]);
SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]);
SHOW_FUNCTION(as_anticexpire_show, ad->antic_expire);
SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[REQ_SYNC]);
SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[REQ_ASYNC]);
#undef SHOW_FUNCTION

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)				\
static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count) \
{									\
	int ret = as_var_store(__PTR, (page), count);			\
	if (*(__PTR) < (MIN))						\
		*(__PTR) = (MIN);					\
	else if (*(__PTR) > (MAX))					\
		*(__PTR) = (MAX);					\
	*(__PTR) = msecs_to_jiffies(*(__PTR));				\
	return ret;							\
}
STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX);
STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX);
STORE_FUNCTION(as_anticexpire_store, &ad->antic_expire, 0, INT_MAX);
STORE_FUNCTION(as_read_batchexpire_store,
			&ad->batch_expire[REQ_SYNC], 0, INT_MAX);
STORE_FUNCTION(as_write_batchexpire_store,
			&ad->batch_expire[REQ_ASYNC], 0, INT_MAX);
#undef STORE_FUNCTION

static struct as_fs_entry as_est_entry = {
	.attr = {.name = "est_time", .mode = S_IRUGO },
	.show = as_est_show,
};
static struct as_fs_entry as_readexpire_entry = {
	.attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR },
	.show = as_readexpire_show,
	.store = as_readexpire_store,
};
static struct as_fs_entry as_writeexpire_entry = {
	.attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR },
	.show = as_writeexpire_show,
	.store = as_writeexpire_store,
};
static struct as_fs_entry as_anticexpire_entry = {
	.attr = {.name = "antic_expire", .mode = S_IRUGO | S_IWUSR },
	.show = as_anticexpire_show,
	.store = as_anticexpire_store,
};
static struct as_fs_entry as_read_batchexpire_entry = {
	.attr = {.name = "read_batch_expire", .mode = S_IRUGO | S_IWUSR },
	.show = as_read_batchexpire_show,
	.store = as_read_batchexpire_store,
};
static struct as_fs_entry as_write_batchexpire_entry = {
	.attr = {.name = "write_batch_expire", .mode = S_IRUGO | S_IWUSR },
	.show = as_write_batchexpire_show,
	.store = as_write_batchexpire_store,
};

static struct attribute *default_attrs[] = {
	&as_est_entry.attr,
	&as_readexpire_entry.attr,
	&as_writeexpire_entry.attr,
	&as_anticexpire_entry.attr,
	&as_read_batchexpire_entry.attr,
	&as_write_batchexpire_entry.attr,
	NULL,
};

#define to_as(atr) container_of((atr), struct as_fs_entry, attr)

static ssize_t
as_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	elevator_t *e = container_of(kobj, elevator_t, kobj);
	struct as_fs_entry *entry = to_as(attr);

	if (!entry->show)
		return -EIO;

	return entry->show(e->elevator_data, page);
}

static ssize_t
as_attr_store(struct kobject *kobj, struct attribute *attr,
		const char *page, size_t length)
{
	elevator_t *e = container_of(kobj, elevator_t, kobj);
	struct as_fs_entry *entry = to_as(attr);

	if (!entry->store)
		return -EIO;

	return entry->store(e->elevator_data, page, length);
}

static struct sysfs_ops as_sysfs_ops = {
	.show = as_attr_show,
	.store = as_attr_store,
};

static struct kobj_type as_ktype = {
	.sysfs_ops = &as_sysfs_ops,
	.default_attrs = default_attrs,
};

static struct elevator_type iosched_as = {
	.ops = {
		.elevator_merge_fn = as_merge,
		.elevator_merged_fn = as_merged_request,
		.elevator_merge_req_fn = as_merged_requests,
		.elevator_dispatch_fn = as_dispatch_request,
		.elevator_add_req_fn = as_add_request,
		.elevator_activate_req_fn = as_activate_request,
		.elevator_deactivate_req_fn = as_deactivate_request,
		.elevator_queue_empty_fn = as_queue_empty,
		.elevator_completed_req_fn = as_completed_request,
		.elevator_former_req_fn = as_former_request,
		.elevator_latter_req_fn = as_latter_request,
		.elevator_set_req_fn = as_set_request,
		.elevator_put_req_fn = as_put_request,
		.elevator_may_queue_fn = as_may_queue,
		.elevator_init_fn = as_init_queue,
		.elevator_exit_fn = as_exit_queue,
	},

	.elevator_ktype = &as_ktype,
	.elevator_name = "anticipatory",
	.elevator_owner = THIS_MODULE,
};

static int __init as_init(void)
{
	int ret;

	arq_pool = kmem_cache_create("as_arq", sizeof(struct as_rq),
				     0, 0, NULL, NULL);
	if (!arq_pool)
		return -ENOMEM;

	ret = elv_register(&iosched_as);
	if (!ret) {
		/*
		 * don't allow AS to get unregistered, since we would have
		 * to browse all tasks in the system and release their
		 * as_io_context first
		 */
		__module_get(THIS_MODULE);
		return 0;
	}

	kmem_cache_destroy(arq_pool);
	return ret;
}

static void __exit as_exit(void)
{
	elv_unregister(&iosched_as);
	kmem_cache_destroy(arq_pool);
}

module_init(as_init);
module_exit(as_exit);

MODULE_AUTHOR("Nick Piggin");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("anticipatory IO scheduler");