Merge branch 'for-3.3/core' of git://git.kernel.dk/linux-block

+6 -5

block/blk-cgroup.c

··· 1655 1655 struct io_context *ioc; 1656 1656 1657 1657 cgroup_taskset_for_each(task, cgrp, tset) { 1658 - task_lock(task); 1659 - ioc = task->io_context; 1660 - if (ioc) 1661 - ioc->cgroup_changed = 1; 1662 - task_unlock(task); 1658 + /* we don't lose anything even if ioc allocation fails */ 1659 + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); 1660 + if (ioc) { 1661 + ioc_cgroup_changed(ioc); 1662 + put_io_context(ioc, NULL); 1663 + } 1663 1664 } 1664 1665 } 1665 1666

+122 -81

block/blk-core.c

··· 39 39 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); 40 40 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); 41 41 42 + DEFINE_IDA(blk_queue_ida); 43 + 42 44 /* 43 45 * For the allocated request tables 44 46 */ ··· 360 358 void blk_drain_queue(struct request_queue *q, bool drain_all) 361 359 { 362 360 while (true) { 363 - int nr_rqs; 361 + bool drain = false; 362 + int i; 364 363 365 364 spin_lock_irq(q->queue_lock); 366 365 ··· 378 375 if (!list_empty(&q->queue_head)) 379 376 __blk_run_queue(q); 380 377 381 - if (drain_all) 382 - nr_rqs = q->rq.count[0] + q->rq.count[1]; 383 - else 384 - nr_rqs = q->rq.elvpriv; 378 + drain |= q->rq.elvpriv; 379 + 380 + /* 381 + * Unfortunately, requests are queued at and tracked from 382 + * multiple places and there's no single counter which can 383 + * be drained. Check all the queues and counters. 384 + */ 385 + if (drain_all) { 386 + drain |= !list_empty(&q->queue_head); 387 + for (i = 0; i < 2; i++) { 388 + drain |= q->rq.count[i]; 389 + drain |= q->in_flight[i]; 390 + drain |= !list_empty(&q->flush_queue[i]); 391 + } 392 + } 385 393 386 394 spin_unlock_irq(q->queue_lock); 387 395 388 - if (!nr_rqs) 396 + if (!drain) 389 397 break; 390 398 msleep(10); 391 399 } ··· 483 469 if (!q) 484 470 return NULL; 485 471 472 + q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); 473 + if (q->id < 0) 474 + goto fail_q; 475 + 486 476 q->backing_dev_info.ra_pages = 487 477 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 488 478 q->backing_dev_info.state = 0; ··· 495 477 q->node = node_id; 496 478 497 479 err = bdi_init(&q->backing_dev_info); 498 - if (err) { 499 - kmem_cache_free(blk_requestq_cachep, q); 500 - return NULL; 501 - } 480 + if (err) 481 + goto fail_id; 502 482 503 - if (blk_throtl_init(q)) { 504 - kmem_cache_free(blk_requestq_cachep, q); 505 - return NULL; 506 - } 483 + if (blk_throtl_init(q)) 484 + goto fail_id; 507 485 508 486 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 509 487 laptop_mode_timer_fn, (unsigned 
long) q); 510 488 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 511 489 INIT_LIST_HEAD(&q->timeout_list); 490 + INIT_LIST_HEAD(&q->icq_list); 512 491 INIT_LIST_HEAD(&q->flush_queue[0]); 513 492 INIT_LIST_HEAD(&q->flush_queue[1]); 514 493 INIT_LIST_HEAD(&q->flush_data_in_flight); ··· 523 508 q->queue_lock = &q->__queue_lock; 524 509 525 510 return q; 511 + 512 + fail_id: 513 + ida_simple_remove(&blk_queue_ida, q->id); 514 + fail_q: 515 + kmem_cache_free(blk_requestq_cachep, q); 516 + return NULL; 526 517 } 527 518 EXPORT_SYMBOL(blk_alloc_queue_node); 528 519 ··· 626 605 } 627 606 EXPORT_SYMBOL(blk_init_allocated_queue); 628 607 629 - int blk_get_queue(struct request_queue *q) 608 + bool blk_get_queue(struct request_queue *q) 630 609 { 631 - if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 632 - kobject_get(&q->kobj); 633 - return 0; 610 + if (likely(!blk_queue_dead(q))) { 611 + __blk_get_queue(q); 612 + return true; 634 613 } 635 614 636 - return 1; 615 + return false; 637 616 } 638 617 EXPORT_SYMBOL(blk_get_queue); 639 618 640 619 static inline void blk_free_request(struct request_queue *q, struct request *rq) 641 620 { 642 - if (rq->cmd_flags & REQ_ELVPRIV) 621 + if (rq->cmd_flags & REQ_ELVPRIV) { 643 622 elv_put_request(q, rq); 623 + if (rq->elv.icq) 624 + put_io_context(rq->elv.icq->ioc, q); 625 + } 626 + 644 627 mempool_free(rq, q->rq.rq_pool); 645 628 } 646 629 647 630 static struct request * 648 - blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask) 631 + blk_alloc_request(struct request_queue *q, struct io_cq *icq, 632 + unsigned int flags, gfp_t gfp_mask) 649 633 { 650 634 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 651 635 ··· 661 635 662 636 rq->cmd_flags = flags | REQ_ALLOCED; 663 637 664 - if ((flags & REQ_ELVPRIV) && 665 - unlikely(elv_set_request(q, rq, gfp_mask))) { 666 - mempool_free(rq, q->rq.rq_pool); 667 - return NULL; 638 + if (flags & REQ_ELVPRIV) { 639 + rq->elv.icq = 
icq; 640 + if (unlikely(elv_set_request(q, rq, gfp_mask))) { 641 + mempool_free(rq, q->rq.rq_pool); 642 + return NULL; 643 + } 644 + /* @rq->elv.icq holds on to io_context until @rq is freed */ 645 + if (icq) 646 + get_io_context(icq->ioc); 668 647 } 669 648 670 649 return rq; ··· 781 750 { 782 751 struct request *rq = NULL; 783 752 struct request_list *rl = &q->rq; 784 - struct io_context *ioc = NULL; 753 + struct elevator_type *et; 754 + struct io_context *ioc; 755 + struct io_cq *icq = NULL; 785 756 const bool is_sync = rw_is_sync(rw_flags) != 0; 757 + bool retried = false; 786 758 int may_queue; 759 + retry: 760 + et = q->elevator->type; 761 + ioc = current->io_context; 787 762 788 - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 763 + if (unlikely(blk_queue_dead(q))) 789 764 return NULL; 790 765 791 766 may_queue = elv_may_queue(q, rw_flags); ··· 800 763 801 764 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { 802 765 if (rl->count[is_sync]+1 >= q->nr_requests) { 803 - ioc = current_io_context(GFP_ATOMIC, q->node); 766 + /* 767 + * We want ioc to record batching state. If it's 768 + * not already there, creating a new one requires 769 + * dropping queue_lock, which in turn requires 770 + * retesting conditions to avoid queue hang. 771 + */ 772 + if (!ioc && !retried) { 773 + spin_unlock_irq(q->queue_lock); 774 + create_io_context(current, gfp_mask, q->node); 775 + spin_lock_irq(q->queue_lock); 776 + retried = true; 777 + goto retry; 778 + } 779 + 804 780 /* 805 781 * The queue will fill after this allocation, so set 806 782 * it as full, and mark this process as "batching". ··· 849 799 rl->count[is_sync]++; 850 800 rl->starved[is_sync] = 0; 851 801 802 + /* 803 + * Decide whether the new request will be managed by elevator. If 804 + * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will 805 + * prevent the current elevator from being destroyed until the new 806 + * request is freed. 
This guarantees icq's won't be destroyed and 807 + * makes creating new ones safe. 808 + * 809 + * Also, lookup icq while holding queue_lock. If it doesn't exist, 810 + * it will be created after releasing queue_lock. 811 + */ 852 812 if (blk_rq_should_init_elevator(bio) && 853 813 !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) { 854 814 rw_flags |= REQ_ELVPRIV; 855 815 rl->elvpriv++; 816 + if (et->icq_cache && ioc) 817 + icq = ioc_lookup_icq(ioc, q); 856 818 } 857 819 858 820 if (blk_queue_io_stat(q)) 859 821 rw_flags |= REQ_IO_STAT; 860 822 spin_unlock_irq(q->queue_lock); 861 823 862 - rq = blk_alloc_request(q, rw_flags, gfp_mask); 824 + /* create icq if missing */ 825 + if (unlikely(et->icq_cache && !icq)) 826 + icq = ioc_create_icq(q, gfp_mask); 827 + 828 + /* rqs are guaranteed to have icq on elv_set_request() if requested */ 829 + if (likely(!et->icq_cache || icq)) 830 + rq = blk_alloc_request(q, icq, rw_flags, gfp_mask); 831 + 863 832 if (unlikely(!rq)) { 864 833 /* 865 834 * Allocation failed presumably due to memory. Undo anything ··· 940 871 rq = get_request(q, rw_flags, bio, GFP_NOIO); 941 872 while (!rq) { 942 873 DEFINE_WAIT(wait); 943 - struct io_context *ioc; 944 874 struct request_list *rl = &q->rq; 945 875 946 - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 876 + if (unlikely(blk_queue_dead(q))) 947 877 return NULL; 948 878 949 879 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, ··· 959 891 * up to a big batch of them for a small period time. 
960 892 * See ioc_batching, ioc_set_batching 961 893 */ 962 - ioc = current_io_context(GFP_NOIO, q->node); 963 - ioc_set_batching(q, ioc); 894 + create_io_context(current, GFP_NOIO, q->node); 895 + ioc_set_batching(q, current->io_context); 964 896 965 897 spin_lock_irq(q->queue_lock); 966 898 finish_wait(&rl->wait[is_sync], &wait); ··· 1076 1008 drive_stat_acct(rq, 1); 1077 1009 __elv_add_request(q, rq, where); 1078 1010 } 1079 - 1080 - /** 1081 - * blk_insert_request - insert a special request into a request queue 1082 - * @q: request queue where request should be inserted 1083 - * @rq: request to be inserted 1084 - * @at_head: insert request at head or tail of queue 1085 - * @data: private data 1086 - * 1087 - * Description: 1088 - * Many block devices need to execute commands asynchronously, so they don't 1089 - * block the whole kernel from preemption during request execution. This is 1090 - * accomplished normally by inserting aritficial requests tagged as 1091 - * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them 1092 - * be scheduled for actual execution by the request queue. 1093 - * 1094 - * We have the option of inserting the head or the tail of the queue. 1095 - * Typically we use the tail for new ioctls and so forth. We use the head 1096 - * of the queue for things like a QUEUE_FULL message from a device, or a 1097 - * host that is unable to accept a particular command. 1098 - */ 1099 - void blk_insert_request(struct request_queue *q, struct request *rq, 1100 - int at_head, void *data) 1101 - { 1102 - int where = at_head ? 
ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 1103 - unsigned long flags; 1104 - 1105 - /* 1106 - * tell I/O scheduler that this isn't a regular read/write (ie it 1107 - * must not attempt merges on this) and that it acts as a soft 1108 - * barrier 1109 - */ 1110 - rq->cmd_type = REQ_TYPE_SPECIAL; 1111 - 1112 - rq->special = data; 1113 - 1114 - spin_lock_irqsave(q->queue_lock, flags); 1115 - 1116 - /* 1117 - * If command is tagged, release the tag 1118 - */ 1119 - if (blk_rq_tagged(rq)) 1120 - blk_queue_end_tag(q, rq); 1121 - 1122 - add_acct_request(q, rq, where); 1123 - __blk_run_queue(q); 1124 - spin_unlock_irqrestore(q->queue_lock, flags); 1125 - } 1126 - EXPORT_SYMBOL(blk_insert_request); 1127 1011 1128 1012 static void part_round_stats_single(int cpu, struct hd_struct *part, 1129 1013 unsigned long now) ··· 1786 1766 return -EIO; 1787 1767 1788 1768 spin_lock_irqsave(q->queue_lock, flags); 1769 + if (unlikely(blk_queue_dead(q))) { 1770 + spin_unlock_irqrestore(q->queue_lock, flags); 1771 + return -ENODEV; 1772 + } 1789 1773 1790 1774 /* 1791 1775 * Submitting request must be dequeued before calling this function ··· 2764 2740 trace_block_unplug(q, depth, !from_schedule); 2765 2741 2766 2742 /* 2743 + * Don't mess with dead queue. 2744 + */ 2745 + if (unlikely(blk_queue_dead(q))) { 2746 + spin_unlock(q->queue_lock); 2747 + return; 2748 + } 2749 + 2750 + /* 2767 2751 * If we are punting this to kblockd, then we can safely drop 2768 2752 * the queue_lock before waking kblockd (which needs to take 2769 2753 * this lock). ··· 2847 2815 depth = 0; 2848 2816 spin_lock(q->queue_lock); 2849 2817 } 2818 + 2819 + /* 2820 + * Short-circuit if @q is dead 2821 + */ 2822 + if (unlikely(blk_queue_dead(q))) { 2823 + __blk_end_request_all(rq, -ENODEV); 2824 + continue; 2825 + } 2826 + 2850 2827 /* 2851 2828 * rq is already accounted, so use raw insert 2852 2829 */

+5 -3

block/blk-exec.c

··· 50 50 { 51 51 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 52 52 53 - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 53 + WARN_ON(irqs_disabled()); 54 + spin_lock_irq(q->queue_lock); 55 + 56 + if (unlikely(blk_queue_dead(q))) { 57 + spin_unlock_irq(q->queue_lock); 54 58 rq->errors = -ENXIO; 55 59 if (rq->end_io) 56 60 rq->end_io(rq, rq->errors); ··· 63 59 64 60 rq->rq_disk = bd_disk; 65 61 rq->end_io = done; 66 - WARN_ON(irqs_disabled()); 67 - spin_lock_irq(q->queue_lock); 68 62 __elv_add_request(q, rq, where); 69 63 __blk_run_queue(q); 70 64 /* the queue is stopped so it won't be run */

+419 -104

block/blk-ioc.c

··· 16 16 */ 17 17 static struct kmem_cache *iocontext_cachep; 18 18 19 - static void cfq_dtor(struct io_context *ioc) 19 + /** 20 + * get_io_context - increment reference count to io_context 21 + * @ioc: io_context to get 22 + * 23 + * Increment reference count to @ioc. 24 + */ 25 + void get_io_context(struct io_context *ioc) 20 26 { 21 - if (!hlist_empty(&ioc->cic_list)) { 22 - struct cfq_io_context *cic; 27 + BUG_ON(atomic_long_read(&ioc->refcount) <= 0); 28 + atomic_long_inc(&ioc->refcount); 29 + } 30 + EXPORT_SYMBOL(get_io_context); 23 31 24 - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, 25 - cic_list); 26 - cic->dtor(ioc); 27 - } 32 + /* 33 + * Releasing ioc may nest into another put_io_context() leading to nested 34 + * fast path release. As the ioc's can't be the same, this is okay but 35 + * makes lockdep whine. Keep track of nesting and use it as subclass. 36 + */ 37 + #ifdef CONFIG_LOCKDEP 38 + #define ioc_release_depth(q) ((q) ? (q)->ioc_release_depth : 0) 39 + #define ioc_release_depth_inc(q) (q)->ioc_release_depth++ 40 + #define ioc_release_depth_dec(q) (q)->ioc_release_depth-- 41 + #else 42 + #define ioc_release_depth(q) 0 43 + #define ioc_release_depth_inc(q) do { } while (0) 44 + #define ioc_release_depth_dec(q) do { } while (0) 45 + #endif 46 + 47 + static void icq_free_icq_rcu(struct rcu_head *head) 48 + { 49 + struct io_cq *icq = container_of(head, struct io_cq, __rcu_head); 50 + 51 + kmem_cache_free(icq->__rcu_icq_cache, icq); 28 52 } 29 53 30 54 /* 31 - * IO Context helper functions. put_io_context() returns 1 if there are no 32 - * more users of this io context, 0 otherwise. 55 + * Exit and free an icq. Called with both ioc and q locked. 
33 56 */ 34 - int put_io_context(struct io_context *ioc) 57 + static void ioc_exit_icq(struct io_cq *icq) 35 58 { 36 - if (ioc == NULL) 37 - return 1; 59 + struct io_context *ioc = icq->ioc; 60 + struct request_queue *q = icq->q; 61 + struct elevator_type *et = q->elevator->type; 38 62 39 - BUG_ON(atomic_long_read(&ioc->refcount) == 0); 63 + lockdep_assert_held(&ioc->lock); 64 + lockdep_assert_held(q->queue_lock); 40 65 41 - if (atomic_long_dec_and_test(&ioc->refcount)) { 42 - rcu_read_lock(); 43 - cfq_dtor(ioc); 44 - rcu_read_unlock(); 66 + radix_tree_delete(&ioc->icq_tree, icq->q->id); 67 + hlist_del_init(&icq->ioc_node); 68 + list_del_init(&icq->q_node); 45 69 46 - kmem_cache_free(iocontext_cachep, ioc); 47 - return 1; 70 + /* 71 + * Both setting lookup hint to and clearing it from @icq are done 72 + * under queue_lock. If it's not pointing to @icq now, it never 73 + * will. Hint assignment itself can race safely. 74 + */ 75 + if (rcu_dereference_raw(ioc->icq_hint) == icq) 76 + rcu_assign_pointer(ioc->icq_hint, NULL); 77 + 78 + if (et->ops.elevator_exit_icq_fn) { 79 + ioc_release_depth_inc(q); 80 + et->ops.elevator_exit_icq_fn(icq); 81 + ioc_release_depth_dec(q); 48 82 } 49 - return 0; 83 + 84 + /* 85 + * @icq->q might have gone away by the time RCU callback runs 86 + * making it impossible to determine icq_cache. Record it in @icq. 87 + */ 88 + icq->__rcu_icq_cache = et->icq_cache; 89 + call_rcu(&icq->__rcu_head, icq_free_icq_rcu); 90 + } 91 + 92 + /* 93 + * Slow path for ioc release in put_io_context(). Performs double-lock 94 + * dancing to unlink all icq's and then frees ioc. 
95 + */ 96 + static void ioc_release_fn(struct work_struct *work) 97 + { 98 + struct io_context *ioc = container_of(work, struct io_context, 99 + release_work); 100 + struct request_queue *last_q = NULL; 101 + 102 + spin_lock_irq(&ioc->lock); 103 + 104 + while (!hlist_empty(&ioc->icq_list)) { 105 + struct io_cq *icq = hlist_entry(ioc->icq_list.first, 106 + struct io_cq, ioc_node); 107 + struct request_queue *this_q = icq->q; 108 + 109 + if (this_q != last_q) { 110 + /* 111 + * Need to switch to @this_q. Once we release 112 + * @ioc->lock, it can go away along with @cic. 113 + * Hold on to it. 114 + */ 115 + __blk_get_queue(this_q); 116 + 117 + /* 118 + * blk_put_queue() might sleep thanks to kobject 119 + * idiocy. Always release both locks, put and 120 + * restart. 121 + */ 122 + if (last_q) { 123 + spin_unlock(last_q->queue_lock); 124 + spin_unlock_irq(&ioc->lock); 125 + blk_put_queue(last_q); 126 + } else { 127 + spin_unlock_irq(&ioc->lock); 128 + } 129 + 130 + last_q = this_q; 131 + spin_lock_irq(this_q->queue_lock); 132 + spin_lock(&ioc->lock); 133 + continue; 134 + } 135 + ioc_exit_icq(icq); 136 + } 137 + 138 + if (last_q) { 139 + spin_unlock(last_q->queue_lock); 140 + spin_unlock_irq(&ioc->lock); 141 + blk_put_queue(last_q); 142 + } else { 143 + spin_unlock_irq(&ioc->lock); 144 + } 145 + 146 + kmem_cache_free(iocontext_cachep, ioc); 147 + } 148 + 149 + /** 150 + * put_io_context - put a reference of io_context 151 + * @ioc: io_context to put 152 + * @locked_q: request_queue the caller is holding queue_lock of (hint) 153 + * 154 + * Decrement reference count of @ioc and release it if the count reaches 155 + * zero. If the caller is holding queue_lock of a queue, it can indicate 156 + * that with @locked_q. This is an optimization hint and the caller is 157 + * allowed to pass in %NULL even when it's holding a queue_lock. 
158 + */ 159 + void put_io_context(struct io_context *ioc, struct request_queue *locked_q) 160 + { 161 + struct request_queue *last_q = locked_q; 162 + unsigned long flags; 163 + 164 + if (ioc == NULL) 165 + return; 166 + 167 + BUG_ON(atomic_long_read(&ioc->refcount) <= 0); 168 + if (locked_q) 169 + lockdep_assert_held(locked_q->queue_lock); 170 + 171 + if (!atomic_long_dec_and_test(&ioc->refcount)) 172 + return; 173 + 174 + /* 175 + * Destroy @ioc. This is a bit messy because icq's are chained 176 + * from both ioc and queue, and ioc->lock nests inside queue_lock. 177 + * The inner ioc->lock should be held to walk our icq_list and then 178 + * for each icq the outer matching queue_lock should be grabbed. 179 + * ie. We need to do reverse-order double lock dancing. 180 + * 181 + * Another twist is that we are often called with one of the 182 + * matching queue_locks held as indicated by @locked_q, which 183 + * prevents performing double-lock dance for other queues. 184 + * 185 + * So, we do it in two stages. The fast path uses the queue_lock 186 + * the caller is holding and, if other queues need to be accessed, 187 + * uses trylock to avoid introducing locking dependency. This can 188 + * handle most cases, especially if @ioc was performing IO on only 189 + * single device. 190 + * 191 + * If trylock doesn't cut it, we defer to @ioc->release_work which 192 + * can do all the double-locking dancing. 
193 + */ 194 + spin_lock_irqsave_nested(&ioc->lock, flags, 195 + ioc_release_depth(locked_q)); 196 + 197 + while (!hlist_empty(&ioc->icq_list)) { 198 + struct io_cq *icq = hlist_entry(ioc->icq_list.first, 199 + struct io_cq, ioc_node); 200 + struct request_queue *this_q = icq->q; 201 + 202 + if (this_q != last_q) { 203 + if (last_q && last_q != locked_q) 204 + spin_unlock(last_q->queue_lock); 205 + last_q = NULL; 206 + 207 + if (!spin_trylock(this_q->queue_lock)) 208 + break; 209 + last_q = this_q; 210 + continue; 211 + } 212 + ioc_exit_icq(icq); 213 + } 214 + 215 + if (last_q && last_q != locked_q) 216 + spin_unlock(last_q->queue_lock); 217 + 218 + spin_unlock_irqrestore(&ioc->lock, flags); 219 + 220 + /* if no icq is left, we're done; otherwise, kick release_work */ 221 + if (hlist_empty(&ioc->icq_list)) 222 + kmem_cache_free(iocontext_cachep, ioc); 223 + else 224 + schedule_work(&ioc->release_work); 50 225 } 51 226 EXPORT_SYMBOL(put_io_context); 52 - 53 - static void cfq_exit(struct io_context *ioc) 54 - { 55 - rcu_read_lock(); 56 - 57 - if (!hlist_empty(&ioc->cic_list)) { 58 - struct cfq_io_context *cic; 59 - 60 - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context, 61 - cic_list); 62 - cic->exit(ioc); 63 - } 64 - rcu_read_unlock(); 65 - } 66 227 67 228 /* Called by the exiting task */ 68 229 void exit_io_context(struct task_struct *task) ··· 235 74 task->io_context = NULL; 236 75 task_unlock(task); 237 76 238 - if (atomic_dec_and_test(&ioc->nr_tasks)) 239 - cfq_exit(ioc); 240 - 241 - put_io_context(ioc); 77 + atomic_dec(&ioc->nr_tasks); 78 + put_io_context(ioc, NULL); 242 79 } 243 80 244 - struct io_context *alloc_io_context(gfp_t gfp_flags, int node) 81 + /** 82 + * ioc_clear_queue - break any ioc association with the specified queue 83 + * @q: request_queue being cleared 84 + * 85 + * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked. 
86 + */ 87 + void ioc_clear_queue(struct request_queue *q) 88 + { 89 + lockdep_assert_held(q->queue_lock); 90 + 91 + while (!list_empty(&q->icq_list)) { 92 + struct io_cq *icq = list_entry(q->icq_list.next, 93 + struct io_cq, q_node); 94 + struct io_context *ioc = icq->ioc; 95 + 96 + spin_lock(&ioc->lock); 97 + ioc_exit_icq(icq); 98 + spin_unlock(&ioc->lock); 99 + } 100 + } 101 + 102 + void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags, 103 + int node) 245 104 { 246 105 struct io_context *ioc; 247 106 248 - ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); 249 - if (ioc) { 250 - atomic_long_set(&ioc->refcount, 1); 251 - atomic_set(&ioc->nr_tasks, 1); 252 - spin_lock_init(&ioc->lock); 253 - ioc->ioprio_changed = 0; 254 - ioc->ioprio = 0; 255 - ioc->last_waited = 0; /* doesn't matter... */ 256 - ioc->nr_batch_requests = 0; /* because this is 0 */ 257 - INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH); 258 - INIT_HLIST_HEAD(&ioc->cic_list); 259 - ioc->ioc_data = NULL; 260 - #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 261 - ioc->cgroup_changed = 0; 262 - #endif 263 - } 107 + ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, 108 + node); 109 + if (unlikely(!ioc)) 110 + return; 264 111 265 - return ioc; 266 - } 267 - 268 - /* 269 - * If the current task has no IO context then create one and initialise it. 270 - * Otherwise, return its existing IO context. 271 - * 272 - * This returned IO context doesn't have a specifically elevated refcount, 273 - * but since the current task itself holds a reference, the context can be 274 - * used in general code, so long as it stays within `current` context. 
275 - */ 276 - struct io_context *current_io_context(gfp_t gfp_flags, int node) 277 - { 278 - struct task_struct *tsk = current; 279 - struct io_context *ret; 280 - 281 - ret = tsk->io_context; 282 - if (likely(ret)) 283 - return ret; 284 - 285 - ret = alloc_io_context(gfp_flags, node); 286 - if (ret) { 287 - /* make sure set_task_ioprio() sees the settings above */ 288 - smp_wmb(); 289 - tsk->io_context = ret; 290 - } 291 - 292 - return ret; 293 - } 294 - 295 - /* 296 - * If the current task has no IO context then create one and initialise it. 297 - * If it does have a context, take a ref on it. 298 - * 299 - * This is always called in the context of the task which submitted the I/O. 300 - */ 301 - struct io_context *get_io_context(gfp_t gfp_flags, int node) 302 - { 303 - struct io_context *ioc = NULL; 112 + /* initialize */ 113 + atomic_long_set(&ioc->refcount, 1); 114 + atomic_set(&ioc->nr_tasks, 1); 115 + spin_lock_init(&ioc->lock); 116 + INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH); 117 + INIT_HLIST_HEAD(&ioc->icq_list); 118 + INIT_WORK(&ioc->release_work, ioc_release_fn); 304 119 305 120 /* 306 - * Check for unlikely race with exiting task. ioc ref count is 307 - * zero when ioc is being detached. 121 + * Try to install. ioc shouldn't be installed if someone else 122 + * already did or @task, which isn't %current, is exiting. Note 123 + * that we need to allow ioc creation on exiting %current as exit 124 + * path may issue IOs from e.g. exit_files(). The exit path is 125 + * responsible for not issuing IO after exit_io_context(). 
308 126 */ 309 - do { 310 - ioc = current_io_context(gfp_flags, node); 311 - if (unlikely(!ioc)) 312 - break; 313 - } while (!atomic_long_inc_not_zero(&ioc->refcount)); 314 - 315 - return ioc; 127 + task_lock(task); 128 + if (!task->io_context && 129 + (task == current || !(task->flags & PF_EXITING))) 130 + task->io_context = ioc; 131 + else 132 + kmem_cache_free(iocontext_cachep, ioc); 133 + task_unlock(task); 316 134 } 317 - EXPORT_SYMBOL(get_io_context); 135 + 136 + /** 137 + * get_task_io_context - get io_context of a task 138 + * @task: task of interest 139 + * @gfp_flags: allocation flags, used if allocation is necessary 140 + * @node: allocation node, used if allocation is necessary 141 + * 142 + * Return io_context of @task. If it doesn't exist, it is created with 143 + * @gfp_flags and @node. The returned io_context has its reference count 144 + * incremented. 145 + * 146 + * This function always goes through task_lock() and it's better to use 147 + * %current->io_context + get_io_context() for %current. 148 + */ 149 + struct io_context *get_task_io_context(struct task_struct *task, 150 + gfp_t gfp_flags, int node) 151 + { 152 + struct io_context *ioc; 153 + 154 + might_sleep_if(gfp_flags & __GFP_WAIT); 155 + 156 + do { 157 + task_lock(task); 158 + ioc = task->io_context; 159 + if (likely(ioc)) { 160 + get_io_context(ioc); 161 + task_unlock(task); 162 + return ioc; 163 + } 164 + task_unlock(task); 165 + } while (create_io_context(task, gfp_flags, node)); 166 + 167 + return NULL; 168 + } 169 + EXPORT_SYMBOL(get_task_io_context); 170 + 171 + /** 172 + * ioc_lookup_icq - lookup io_cq from ioc 173 + * @ioc: the associated io_context 174 + * @q: the associated request_queue 175 + * 176 + * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called 177 + * with @q->queue_lock held. 
178 + */ 179 + struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) 180 + { 181 + struct io_cq *icq; 182 + 183 + lockdep_assert_held(q->queue_lock); 184 + 185 + /* 186 + * icq's are indexed from @ioc using radix tree and hint pointer, 187 + * both of which are protected with RCU. All removals are done 188 + * holding both q and ioc locks, and we're holding q lock - if we 189 + * find a icq which points to us, it's guaranteed to be valid. 190 + */ 191 + rcu_read_lock(); 192 + icq = rcu_dereference(ioc->icq_hint); 193 + if (icq && icq->q == q) 194 + goto out; 195 + 196 + icq = radix_tree_lookup(&ioc->icq_tree, q->id); 197 + if (icq && icq->q == q) 198 + rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */ 199 + else 200 + icq = NULL; 201 + out: 202 + rcu_read_unlock(); 203 + return icq; 204 + } 205 + EXPORT_SYMBOL(ioc_lookup_icq); 206 + 207 + /** 208 + * ioc_create_icq - create and link io_cq 209 + * @q: request_queue of interest 210 + * @gfp_mask: allocation mask 211 + * 212 + * Make sure io_cq linking %current->io_context and @q exists. If either 213 + * io_context and/or icq don't exist, they will be created using @gfp_mask. 214 + * 215 + * The caller is responsible for ensuring @ioc won't go away and @q is 216 + * alive and will stay alive until this function returns. 
217 + */ 218 + struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask) 219 + { 220 + struct elevator_type *et = q->elevator->type; 221 + struct io_context *ioc; 222 + struct io_cq *icq; 223 + 224 + /* allocate stuff */ 225 + ioc = create_io_context(current, gfp_mask, q->node); 226 + if (!ioc) 227 + return NULL; 228 + 229 + icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, 230 + q->node); 231 + if (!icq) 232 + return NULL; 233 + 234 + if (radix_tree_preload(gfp_mask) < 0) { 235 + kmem_cache_free(et->icq_cache, icq); 236 + return NULL; 237 + } 238 + 239 + icq->ioc = ioc; 240 + icq->q = q; 241 + INIT_LIST_HEAD(&icq->q_node); 242 + INIT_HLIST_NODE(&icq->ioc_node); 243 + 244 + /* lock both q and ioc and try to link @icq */ 245 + spin_lock_irq(q->queue_lock); 246 + spin_lock(&ioc->lock); 247 + 248 + if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { 249 + hlist_add_head(&icq->ioc_node, &ioc->icq_list); 250 + list_add(&icq->q_node, &q->icq_list); 251 + if (et->ops.elevator_init_icq_fn) 252 + et->ops.elevator_init_icq_fn(icq); 253 + } else { 254 + kmem_cache_free(et->icq_cache, icq); 255 + icq = ioc_lookup_icq(ioc, q); 256 + if (!icq) 257 + printk(KERN_ERR "cfq: icq link failed!\n"); 258 + } 259 + 260 + spin_unlock(&ioc->lock); 261 + spin_unlock_irq(q->queue_lock); 262 + radix_tree_preload_end(); 263 + return icq; 264 + } 265 + 266 + void ioc_set_changed(struct io_context *ioc, int which) 267 + { 268 + struct io_cq *icq; 269 + struct hlist_node *n; 270 + 271 + hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node) 272 + set_bit(which, &icq->changed); 273 + } 274 + 275 + /** 276 + * ioc_ioprio_changed - notify ioprio change 277 + * @ioc: io_context of interest 278 + * @ioprio: new ioprio 279 + * 280 + * @ioc's ioprio has changed to @ioprio. Set %ICQ_IOPRIO_CHANGED for all 281 + * icq's. iosched is responsible for checking the bit and applying it on 282 + * request issue path. 
283 + */ 284 + void ioc_ioprio_changed(struct io_context *ioc, int ioprio) 285 + { 286 + unsigned long flags; 287 + 288 + spin_lock_irqsave(&ioc->lock, flags); 289 + ioc->ioprio = ioprio; 290 + ioc_set_changed(ioc, ICQ_IOPRIO_CHANGED); 291 + spin_unlock_irqrestore(&ioc->lock, flags); 292 + } 293 + 294 + /** 295 + * ioc_cgroup_changed - notify cgroup change 296 + * @ioc: io_context of interest 297 + * 298 + * @ioc's cgroup has changed. Set %ICQ_CGROUP_CHANGED for all icq's. 299 + * iosched is responsible for checking the bit and applying it on request 300 + * issue path. 301 + */ 302 + void ioc_cgroup_changed(struct io_context *ioc) 303 + { 304 + unsigned long flags; 305 + 306 + spin_lock_irqsave(&ioc->lock, flags); 307 + ioc_set_changed(ioc, ICQ_CGROUP_CHANGED); 308 + spin_unlock_irqrestore(&ioc->lock, flags); 309 + } 310 + EXPORT_SYMBOL(ioc_cgroup_changed); 318 311 319 312 static int __init blk_ioc_init(void) 320 313 {

+24 -8

block/blk-settings.c

··· 104 104 * @lim: the queue_limits structure to reset 105 105 * 106 106 * Description: 107 - * Returns a queue_limit struct to its default state. Can be used by 108 - * stacking drivers like DM that stage table swaps and reuse an 109 - * existing device queue. 107 + * Returns a queue_limit struct to its default state. 110 108 */ 111 109 void blk_set_default_limits(struct queue_limits *lim) 112 110 { ··· 112 114 lim->max_integrity_segments = 0; 113 115 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 114 116 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 115 - lim->max_sectors = BLK_DEF_MAX_SECTORS; 116 - lim->max_hw_sectors = INT_MAX; 117 + lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; 117 118 lim->max_discard_sectors = 0; 118 119 lim->discard_granularity = 0; 119 120 lim->discard_alignment = 0; 120 121 lim->discard_misaligned = 0; 121 - lim->discard_zeroes_data = 1; 122 + lim->discard_zeroes_data = 0; 122 123 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 123 124 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); 124 125 lim->alignment_offset = 0; ··· 126 129 lim->cluster = 1; 127 130 } 128 131 EXPORT_SYMBOL(blk_set_default_limits); 132 + 133 + /** 134 + * blk_set_stacking_limits - set default limits for stacking devices 135 + * @lim: the queue_limits structure to reset 136 + * 137 + * Description: 138 + * Returns a queue_limit struct to its default state. Should be used 139 + * by stacking drivers like DM that have no internal limits. 
140 + */ 141 + void blk_set_stacking_limits(struct queue_limits *lim) 142 + { 143 + blk_set_default_limits(lim); 144 + 145 + /* Inherit limits from component devices */ 146 + lim->discard_zeroes_data = 1; 147 + lim->max_segments = USHRT_MAX; 148 + lim->max_hw_sectors = UINT_MAX; 149 + 150 + lim->max_sectors = BLK_DEF_MAX_SECTORS; 151 + } 152 + EXPORT_SYMBOL(blk_set_stacking_limits); 129 153 130 154 /** 131 155 * blk_queue_make_request - define an alternate make_request function for a device ··· 183 165 q->nr_batching = BLK_BATCH_REQ; 184 166 185 167 blk_set_default_limits(&q->limits); 186 - blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); 187 - q->limits.discard_zeroes_data = 0; 188 168 189 169 /* 190 170 * by default assume old behaviour and bounce for any highmem page

+9 -3

block/blk-sysfs.c

··· 425 425 if (!entry->show) 426 426 return -EIO; 427 427 mutex_lock(&q->sysfs_lock); 428 - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 428 + if (blk_queue_dead(q)) { 429 429 mutex_unlock(&q->sysfs_lock); 430 430 return -ENOENT; 431 431 } ··· 447 447 448 448 q = container_of(kobj, struct request_queue, kobj); 449 449 mutex_lock(&q->sysfs_lock); 450 - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 450 + if (blk_queue_dead(q)) { 451 451 mutex_unlock(&q->sysfs_lock); 452 452 return -ENOENT; 453 453 } ··· 479 479 480 480 blk_sync_queue(q); 481 481 482 - if (q->elevator) 482 + if (q->elevator) { 483 + spin_lock_irq(q->queue_lock); 484 + ioc_clear_queue(q); 485 + spin_unlock_irq(q->queue_lock); 483 486 elevator_exit(q->elevator); 487 + } 484 488 485 489 blk_throtl_exit(q); 486 490 ··· 498 494 blk_trace_shutdown(q); 499 495 500 496 bdi_destroy(&q->backing_dev_info); 497 + 498 + ida_simple_remove(&blk_queue_ida, q->id); 501 499 kmem_cache_free(blk_requestq_cachep, q); 502 500 } 503 501

+2 -2

block/blk-throttle.c

··· 310 310 struct request_queue *q = td->queue; 311 311 312 312 /* no throttling for dead queue */ 313 - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 313 + if (unlikely(blk_queue_dead(q))) 314 314 return NULL; 315 315 316 316 rcu_read_lock(); ··· 335 335 spin_lock_irq(q->queue_lock); 336 336 337 337 /* Make sure @q is still alive */ 338 - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 338 + if (unlikely(blk_queue_dead(q))) { 339 339 kfree(tg); 340 340 return NULL; 341 341 }

+50 -8

block/blk.h

··· 1 1 #ifndef BLK_INTERNAL_H 2 2 #define BLK_INTERNAL_H 3 3 4 + #include <linux/idr.h> 5 + 4 6 /* Amount of time in which a process may batch requests */ 5 7 #define BLK_BATCH_TIME (HZ/50UL) 6 8 ··· 11 9 12 10 extern struct kmem_cache *blk_requestq_cachep; 13 11 extern struct kobj_type blk_queue_ktype; 12 + extern struct ida blk_queue_ida; 13 + 14 + static inline void __blk_get_queue(struct request_queue *q) 15 + { 16 + kobject_get(&q->kobj); 17 + } 14 18 15 19 void init_request_from_bio(struct request *req, struct bio *bio); 16 20 void blk_rq_bio_prep(struct request_queue *q, struct request *rq, ··· 93 85 q->flush_queue_delayed = 1; 94 86 return NULL; 95 87 } 96 - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) || 97 - !q->elevator->ops->elevator_dispatch_fn(q, 0)) 88 + if (unlikely(blk_queue_dead(q)) || 89 + !q->elevator->type->ops.elevator_dispatch_fn(q, 0)) 98 90 return NULL; 99 91 } 100 92 } ··· 103 95 { 104 96 struct elevator_queue *e = q->elevator; 105 97 106 - if (e->ops->elevator_activate_req_fn) 107 - e->ops->elevator_activate_req_fn(q, rq); 98 + if (e->type->ops.elevator_activate_req_fn) 99 + e->type->ops.elevator_activate_req_fn(q, rq); 108 100 } 109 101 110 102 static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) 111 103 { 112 104 struct elevator_queue *e = q->elevator; 113 105 114 - if (e->ops->elevator_deactivate_req_fn) 115 - e->ops->elevator_deactivate_req_fn(q, rq); 106 + if (e->type->ops.elevator_deactivate_req_fn) 107 + e->type->ops.elevator_deactivate_req_fn(q, rq); 116 108 } 117 109 118 110 #ifdef CONFIG_FAIL_IO_TIMEOUT ··· 126 118 return 0; 127 119 } 128 120 #endif 129 - 130 - struct io_context *current_io_context(gfp_t gfp_flags, int node); 131 121 132 122 int ll_back_merge_fn(struct request_queue *q, struct request *req, 133 123 struct bio *bio); ··· 195 189 (rq->cmd_flags & REQ_DISCARD)); 196 190 } 197 191 192 + /* 193 + * Internal io_context interface 194 + */ 195 + void get_io_context(struct io_context 
*ioc); 196 + struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); 197 + struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask); 198 + void ioc_clear_queue(struct request_queue *q); 199 + 200 + void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask, 201 + int node); 202 + 203 + /** 204 + * create_io_context - try to create task->io_context 205 + * @task: target task 206 + * @gfp_mask: allocation mask 207 + * @node: allocation node 208 + * 209 + * If @task->io_context is %NULL, allocate a new io_context and install it. 210 + * Returns the current @task->io_context which may be %NULL if allocation 211 + * failed. 212 + * 213 + * Note that this function can't be called with IRQ disabled because 214 + * task_lock which protects @task->io_context is IRQ-unsafe. 215 + */ 216 + static inline struct io_context *create_io_context(struct task_struct *task, 217 + gfp_t gfp_mask, int node) 218 + { 219 + WARN_ON_ONCE(irqs_disabled()); 220 + if (unlikely(!task->io_context)) 221 + create_io_context_slowpath(task, gfp_mask, node); 222 + return task->io_context; 223 + } 224 + 225 + /* 226 + * Internal throttling interface 227 + */ 198 228 #ifdef CONFIG_BLK_DEV_THROTTLING 199 229 extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); 200 230 extern void blk_throtl_drain(struct request_queue *q);

+1 -3

block/bsg.c

··· 769 769 struct file *file) 770 770 { 771 771 struct bsg_device *bd; 772 - int ret; 773 772 #ifdef BSG_DEBUG 774 773 unsigned char buf[32]; 775 774 #endif 776 - ret = blk_get_queue(rq); 777 - if (ret) 775 + if (!blk_get_queue(rq)) 778 776 return ERR_PTR(-ENXIO); 779 777 780 778 bd = bsg_alloc_device();

+115 -508

block/cfq-iosched.c

··· 14 14 #include <linux/rbtree.h> 15 15 #include <linux/ioprio.h> 16 16 #include <linux/blktrace_api.h> 17 + #include "blk.h" 17 18 #include "cfq.h" 18 19 19 20 /* ··· 54 53 #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) 55 54 #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 56 55 57 - #define RQ_CIC(rq) \ 58 - ((struct cfq_io_context *) (rq)->elevator_private[0]) 59 - #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1]) 60 - #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2]) 56 + #define RQ_CIC(rq) icq_to_cic((rq)->elv.icq) 57 + #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0]) 58 + #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1]) 61 59 62 60 static struct kmem_cache *cfq_pool; 63 - static struct kmem_cache *cfq_ioc_pool; 64 - 65 - static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); 66 - static struct completion *ioc_gone; 67 - static DEFINE_SPINLOCK(ioc_gone_lock); 68 - 69 - static DEFINE_SPINLOCK(cic_index_lock); 70 - static DEFINE_IDA(cic_index_ida); 71 61 72 62 #define CFQ_PRIO_LISTS IOPRIO_BE_NR 73 63 #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) ··· 66 74 67 75 #define sample_valid(samples) ((samples) > 80) 68 76 #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 77 + 78 + struct cfq_ttime { 79 + unsigned long last_end_request; 80 + 81 + unsigned long ttime_total; 82 + unsigned long ttime_samples; 83 + unsigned long ttime_mean; 84 + }; 69 85 70 86 /* 71 87 * Most of our rbtree usage is for sorting with min extraction, so ··· 216 216 struct cfq_ttime ttime; 217 217 }; 218 218 219 + struct cfq_io_cq { 220 + struct io_cq icq; /* must be the first member */ 221 + struct cfq_queue *cfqq[2]; 222 + struct cfq_ttime ttime; 223 + }; 224 + 219 225 /* 220 226 * Per block device queue structure 221 227 */ ··· 273 267 struct work_struct unplug_work; 274 268 275 269 struct cfq_queue *active_queue; 276 - struct cfq_io_context *active_cic; 270 + struct 
cfq_io_cq *active_cic; 277 271 278 272 /* 279 273 * async queue for each priority case ··· 295 289 unsigned int cfq_slice_idle; 296 290 unsigned int cfq_group_idle; 297 291 unsigned int cfq_latency; 298 - 299 - unsigned int cic_index; 300 - struct list_head cic_list; 301 292 302 293 /* 303 294 * Fallback dummy cfqq for extreme OOM conditions ··· 467 464 static void cfq_dispatch_insert(struct request_queue *, struct request *); 468 465 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 469 466 struct io_context *, gfp_t); 470 - static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, 471 - struct io_context *); 472 467 473 - static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, 474 - bool is_sync) 468 + static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) 469 + { 470 + /* cic->icq is the first member, %NULL will convert to %NULL */ 471 + return container_of(icq, struct cfq_io_cq, icq); 472 + } 473 + 474 + static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd, 475 + struct io_context *ioc) 476 + { 477 + if (ioc) 478 + return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue)); 479 + return NULL; 480 + } 481 + 482 + static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync) 475 483 { 476 484 return cic->cfqq[is_sync]; 477 485 } 478 486 479 - static inline void cic_set_cfqq(struct cfq_io_context *cic, 480 - struct cfq_queue *cfqq, bool is_sync) 487 + static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq, 488 + bool is_sync) 481 489 { 482 490 cic->cfqq[is_sync] = cfqq; 483 491 } 484 492 485 - #define CIC_DEAD_KEY 1ul 486 - #define CIC_DEAD_INDEX_SHIFT 1 487 - 488 - static inline void *cfqd_dead_key(struct cfq_data *cfqd) 493 + static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) 489 494 { 490 - return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); 491 - } 492 - 493 - static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) 
494 - { 495 - struct cfq_data *cfqd = cic->key; 496 - 497 - if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY)) 498 - return NULL; 499 - 500 - return cfqd; 495 + return cic->icq.q->elevator->elevator_data; 501 496 } 502 497 503 498 /* ··· 1562 1561 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 1563 1562 { 1564 1563 struct task_struct *tsk = current; 1565 - struct cfq_io_context *cic; 1564 + struct cfq_io_cq *cic; 1566 1565 struct cfq_queue *cfqq; 1567 1566 1568 1567 cic = cfq_cic_lookup(cfqd, tsk->io_context); ··· 1688 1687 struct bio *bio) 1689 1688 { 1690 1689 struct cfq_data *cfqd = q->elevator->elevator_data; 1691 - struct cfq_io_context *cic; 1690 + struct cfq_io_cq *cic; 1692 1691 struct cfq_queue *cfqq; 1693 1692 1694 1693 /* ··· 1698 1697 return false; 1699 1698 1700 1699 /* 1701 - * Lookup the cfqq that this bio will be queued with. Allow 1702 - * merge only if rq is queued there. 1700 + * Lookup the cfqq that this bio will be queued with and allow 1701 + * merge only if rq is queued there. This function can be called 1702 + * from plug merge without queue_lock. In such cases, ioc of @rq 1703 + * and %current are guaranteed to be equal. Avoid lookup which 1704 + * requires queue_lock by using @rq's cic. 
1703 1705 */ 1704 - cic = cfq_cic_lookup(cfqd, current->io_context); 1705 - if (!cic) 1706 - return false; 1706 + if (current->io_context == RQ_CIC(rq)->icq.ioc) { 1707 + cic = RQ_CIC(rq); 1708 + } else { 1709 + cic = cfq_cic_lookup(cfqd, current->io_context); 1710 + if (!cic) 1711 + return false; 1712 + } 1707 1713 1708 1714 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 1709 1715 return cfqq == RQ_CFQQ(rq); ··· 1794 1786 cfqd->active_queue = NULL; 1795 1787 1796 1788 if (cfqd->active_cic) { 1797 - put_io_context(cfqd->active_cic->ioc); 1789 + put_io_context(cfqd->active_cic->icq.ioc, cfqd->queue); 1798 1790 cfqd->active_cic = NULL; 1799 1791 } 1800 1792 } ··· 2014 2006 static void cfq_arm_slice_timer(struct cfq_data *cfqd) 2015 2007 { 2016 2008 struct cfq_queue *cfqq = cfqd->active_queue; 2017 - struct cfq_io_context *cic; 2009 + struct cfq_io_cq *cic; 2018 2010 unsigned long sl, group_idle = 0; 2019 2011 2020 2012 /* ··· 2049 2041 * task has exited, don't wait 2050 2042 */ 2051 2043 cic = cfqd->active_cic; 2052 - if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 2044 + if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks)) 2053 2045 return; 2054 2046 2055 2047 /* ··· 2600 2592 cfq_dispatch_insert(cfqd->queue, rq); 2601 2593 2602 2594 if (!cfqd->active_cic) { 2603 - struct cfq_io_context *cic = RQ_CIC(rq); 2595 + struct cfq_io_cq *cic = RQ_CIC(rq); 2604 2596 2605 - atomic_long_inc(&cic->ioc->refcount); 2597 + atomic_long_inc(&cic->icq.ioc->refcount); 2606 2598 cfqd->active_cic = cic; 2607 2599 } 2608 2600 ··· 2685 2677 cfq_put_cfqg(cfqg); 2686 2678 } 2687 2679 2688 - /* 2689 - * Call func for each cic attached to this ioc. 
2690 - */ 2691 - static void 2692 - call_for_each_cic(struct io_context *ioc, 2693 - void (*func)(struct io_context *, struct cfq_io_context *)) 2694 - { 2695 - struct cfq_io_context *cic; 2696 - struct hlist_node *n; 2697 - 2698 - rcu_read_lock(); 2699 - 2700 - hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) 2701 - func(ioc, cic); 2702 - 2703 - rcu_read_unlock(); 2704 - } 2705 - 2706 - static void cfq_cic_free_rcu(struct rcu_head *head) 2707 - { 2708 - struct cfq_io_context *cic; 2709 - 2710 - cic = container_of(head, struct cfq_io_context, rcu_head); 2711 - 2712 - kmem_cache_free(cfq_ioc_pool, cic); 2713 - elv_ioc_count_dec(cfq_ioc_count); 2714 - 2715 - if (ioc_gone) { 2716 - /* 2717 - * CFQ scheduler is exiting, grab exit lock and check 2718 - * the pending io context count. If it hits zero, 2719 - * complete ioc_gone and set it back to NULL 2720 - */ 2721 - spin_lock(&ioc_gone_lock); 2722 - if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { 2723 - complete(ioc_gone); 2724 - ioc_gone = NULL; 2725 - } 2726 - spin_unlock(&ioc_gone_lock); 2727 - } 2728 - } 2729 - 2730 - static void cfq_cic_free(struct cfq_io_context *cic) 2731 - { 2732 - call_rcu(&cic->rcu_head, cfq_cic_free_rcu); 2733 - } 2734 - 2735 - static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) 2736 - { 2737 - unsigned long flags; 2738 - unsigned long dead_key = (unsigned long) cic->key; 2739 - 2740 - BUG_ON(!(dead_key & CIC_DEAD_KEY)); 2741 - 2742 - spin_lock_irqsave(&ioc->lock, flags); 2743 - radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT); 2744 - hlist_del_rcu(&cic->cic_list); 2745 - spin_unlock_irqrestore(&ioc->lock, flags); 2746 - 2747 - cfq_cic_free(cic); 2748 - } 2749 - 2750 - /* 2751 - * Must be called with rcu_read_lock() held or preemption otherwise disabled. 
2752 - * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), 2753 - * and ->trim() which is called with the task lock held 2754 - */ 2755 - static void cfq_free_io_context(struct io_context *ioc) 2756 - { 2757 - /* 2758 - * ioc->refcount is zero here, or we are called from elv_unregister(), 2759 - * so no more cic's are allowed to be linked into this ioc. So it 2760 - * should be ok to iterate over the known list, we will see all cic's 2761 - * since no new ones are added. 2762 - */ 2763 - call_for_each_cic(ioc, cic_free_func); 2764 - } 2765 - 2766 2680 static void cfq_put_cooperator(struct cfq_queue *cfqq) 2767 2681 { 2768 2682 struct cfq_queue *__cfqq, *next; ··· 2718 2788 cfq_put_queue(cfqq); 2719 2789 } 2720 2790 2721 - static void __cfq_exit_single_io_context(struct cfq_data *cfqd, 2722 - struct cfq_io_context *cic) 2791 + static void cfq_init_icq(struct io_cq *icq) 2723 2792 { 2724 - struct io_context *ioc = cic->ioc; 2793 + struct cfq_io_cq *cic = icq_to_cic(icq); 2725 2794 2726 - list_del_init(&cic->queue_list); 2795 + cic->ttime.last_end_request = jiffies; 2796 + } 2727 2797 2728 - /* 2729 - * Make sure dead mark is seen for dead queues 2730 - */ 2731 - smp_wmb(); 2732 - cic->key = cfqd_dead_key(cfqd); 2733 - 2734 - rcu_read_lock(); 2735 - if (rcu_dereference(ioc->ioc_data) == cic) { 2736 - rcu_read_unlock(); 2737 - spin_lock(&ioc->lock); 2738 - rcu_assign_pointer(ioc->ioc_data, NULL); 2739 - spin_unlock(&ioc->lock); 2740 - } else 2741 - rcu_read_unlock(); 2798 + static void cfq_exit_icq(struct io_cq *icq) 2799 + { 2800 + struct cfq_io_cq *cic = icq_to_cic(icq); 2801 + struct cfq_data *cfqd = cic_to_cfqd(cic); 2742 2802 2743 2803 if (cic->cfqq[BLK_RW_ASYNC]) { 2744 2804 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); ··· 2739 2819 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); 2740 2820 cic->cfqq[BLK_RW_SYNC] = NULL; 2741 2821 } 2742 - } 2743 - 2744 - static void cfq_exit_single_io_context(struct io_context *ioc, 2745 - struct 
cfq_io_context *cic) 2746 - { 2747 - struct cfq_data *cfqd = cic_to_cfqd(cic); 2748 - 2749 - if (cfqd) { 2750 - struct request_queue *q = cfqd->queue; 2751 - unsigned long flags; 2752 - 2753 - spin_lock_irqsave(q->queue_lock, flags); 2754 - 2755 - /* 2756 - * Ensure we get a fresh copy of the ->key to prevent 2757 - * race between exiting task and queue 2758 - */ 2759 - smp_read_barrier_depends(); 2760 - if (cic->key == cfqd) 2761 - __cfq_exit_single_io_context(cfqd, cic); 2762 - 2763 - spin_unlock_irqrestore(q->queue_lock, flags); 2764 - } 2765 - } 2766 - 2767 - /* 2768 - * The process that ioc belongs to has exited, we need to clean up 2769 - * and put the internal structures we have that belongs to that process. 2770 - */ 2771 - static void cfq_exit_io_context(struct io_context *ioc) 2772 - { 2773 - call_for_each_cic(ioc, cfq_exit_single_io_context); 2774 - } 2775 - 2776 - static struct cfq_io_context * 2777 - cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 2778 - { 2779 - struct cfq_io_context *cic; 2780 - 2781 - cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, 2782 - cfqd->queue->node); 2783 - if (cic) { 2784 - cic->ttime.last_end_request = jiffies; 2785 - INIT_LIST_HEAD(&cic->queue_list); 2786 - INIT_HLIST_NODE(&cic->cic_list); 2787 - cic->dtor = cfq_free_io_context; 2788 - cic->exit = cfq_exit_io_context; 2789 - elv_ioc_count_inc(cfq_ioc_count); 2790 - } 2791 - 2792 - return cic; 2793 2822 } 2794 2823 2795 2824 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) ··· 2783 2914 cfq_clear_cfqq_prio_changed(cfqq); 2784 2915 } 2785 2916 2786 - static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) 2917 + static void changed_ioprio(struct cfq_io_cq *cic) 2787 2918 { 2788 2919 struct cfq_data *cfqd = cic_to_cfqd(cic); 2789 2920 struct cfq_queue *cfqq; 2790 - unsigned long flags; 2791 2921 2792 2922 if (unlikely(!cfqd)) 2793 2923 return; 2794 2924 2795 - 
spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2796 - 2797 2925 cfqq = cic->cfqq[BLK_RW_ASYNC]; 2798 2926 if (cfqq) { 2799 2927 struct cfq_queue *new_cfqq; 2800 - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, 2928 + new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc, 2801 2929 GFP_ATOMIC); 2802 2930 if (new_cfqq) { 2803 2931 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; ··· 2805 2939 cfqq = cic->cfqq[BLK_RW_SYNC]; 2806 2940 if (cfqq) 2807 2941 cfq_mark_cfqq_prio_changed(cfqq); 2808 - 2809 - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2810 - } 2811 - 2812 - static void cfq_ioc_set_ioprio(struct io_context *ioc) 2813 - { 2814 - call_for_each_cic(ioc, changed_ioprio); 2815 - ioc->ioprio_changed = 0; 2816 2942 } 2817 2943 2818 2944 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, ··· 2828 2970 } 2829 2971 2830 2972 #ifdef CONFIG_CFQ_GROUP_IOSCHED 2831 - static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) 2973 + static void changed_cgroup(struct cfq_io_cq *cic) 2832 2974 { 2833 2975 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); 2834 2976 struct cfq_data *cfqd = cic_to_cfqd(cic); 2835 - unsigned long flags; 2836 2977 struct request_queue *q; 2837 2978 2838 2979 if (unlikely(!cfqd)) 2839 2980 return; 2840 2981 2841 2982 q = cfqd->queue; 2842 - 2843 - spin_lock_irqsave(q->queue_lock, flags); 2844 2983 2845 2984 if (sync_cfqq) { 2846 2985 /* ··· 2848 2993 cic_set_cfqq(cic, NULL, 1); 2849 2994 cfq_put_queue(sync_cfqq); 2850 2995 } 2851 - 2852 - spin_unlock_irqrestore(q->queue_lock, flags); 2853 - } 2854 - 2855 - static void cfq_ioc_set_cgroup(struct io_context *ioc) 2856 - { 2857 - call_for_each_cic(ioc, changed_cgroup); 2858 - ioc->cgroup_changed = 0; 2859 2996 } 2860 2997 #endif /* CONFIG_CFQ_GROUP_IOSCHED */ 2861 2998 ··· 2856 3009 struct io_context *ioc, gfp_t gfp_mask) 2857 3010 { 2858 3011 struct cfq_queue *cfqq, *new_cfqq = NULL; 2859 - struct cfq_io_context *cic; 3012 + struct cfq_io_cq 
*cic; 2860 3013 struct cfq_group *cfqg; 2861 3014 2862 3015 retry: ··· 2947 3100 return cfqq; 2948 3101 } 2949 3102 2950 - /* 2951 - * We drop cfq io contexts lazily, so we may find a dead one. 2952 - */ 2953 - static void 2954 - cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, 2955 - struct cfq_io_context *cic) 2956 - { 2957 - unsigned long flags; 2958 - 2959 - WARN_ON(!list_empty(&cic->queue_list)); 2960 - BUG_ON(cic->key != cfqd_dead_key(cfqd)); 2961 - 2962 - spin_lock_irqsave(&ioc->lock, flags); 2963 - 2964 - BUG_ON(rcu_dereference_check(ioc->ioc_data, 2965 - lockdep_is_held(&ioc->lock)) == cic); 2966 - 2967 - radix_tree_delete(&ioc->radix_root, cfqd->cic_index); 2968 - hlist_del_rcu(&cic->cic_list); 2969 - spin_unlock_irqrestore(&ioc->lock, flags); 2970 - 2971 - cfq_cic_free(cic); 2972 - } 2973 - 2974 - static struct cfq_io_context * 2975 - cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) 2976 - { 2977 - struct cfq_io_context *cic; 2978 - unsigned long flags; 2979 - 2980 - if (unlikely(!ioc)) 2981 - return NULL; 2982 - 2983 - rcu_read_lock(); 2984 - 2985 - /* 2986 - * we maintain a last-hit cache, to avoid browsing over the tree 2987 - */ 2988 - cic = rcu_dereference(ioc->ioc_data); 2989 - if (cic && cic->key == cfqd) { 2990 - rcu_read_unlock(); 2991 - return cic; 2992 - } 2993 - 2994 - do { 2995 - cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index); 2996 - rcu_read_unlock(); 2997 - if (!cic) 2998 - break; 2999 - if (unlikely(cic->key != cfqd)) { 3000 - cfq_drop_dead_cic(cfqd, ioc, cic); 3001 - rcu_read_lock(); 3002 - continue; 3003 - } 3004 - 3005 - spin_lock_irqsave(&ioc->lock, flags); 3006 - rcu_assign_pointer(ioc->ioc_data, cic); 3007 - spin_unlock_irqrestore(&ioc->lock, flags); 3008 - break; 3009 - } while (1); 3010 - 3011 - return cic; 3012 - } 3013 - 3014 - /* 3015 - * Add cic into ioc, using cfqd as the search key. 
This enables us to lookup 3016 - * the process specific cfq io context when entered from the block layer. 3017 - * Also adds the cic to a per-cfqd list, used when this queue is removed. 3018 - */ 3019 - static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, 3020 - struct cfq_io_context *cic, gfp_t gfp_mask) 3021 - { 3022 - unsigned long flags; 3023 - int ret; 3024 - 3025 - ret = radix_tree_preload(gfp_mask); 3026 - if (!ret) { 3027 - cic->ioc = ioc; 3028 - cic->key = cfqd; 3029 - 3030 - spin_lock_irqsave(&ioc->lock, flags); 3031 - ret = radix_tree_insert(&ioc->radix_root, 3032 - cfqd->cic_index, cic); 3033 - if (!ret) 3034 - hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); 3035 - spin_unlock_irqrestore(&ioc->lock, flags); 3036 - 3037 - radix_tree_preload_end(); 3038 - 3039 - if (!ret) { 3040 - spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3041 - list_add(&cic->queue_list, &cfqd->cic_list); 3042 - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3043 - } 3044 - } 3045 - 3046 - if (ret && ret != -EEXIST) 3047 - printk(KERN_ERR "cfq: cic link failed!\n"); 3048 - 3049 - return ret; 3050 - } 3051 - 3052 - /* 3053 - * Setup general io context and cfq io context. There can be several cfq 3054 - * io contexts per general io context, if this process is doing io to more 3055 - * than one device managed by cfq. 
3056 - */ 3057 - static struct cfq_io_context * 3058 - cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 3059 - { 3060 - struct io_context *ioc = NULL; 3061 - struct cfq_io_context *cic; 3062 - int ret; 3063 - 3064 - might_sleep_if(gfp_mask & __GFP_WAIT); 3065 - 3066 - ioc = get_io_context(gfp_mask, cfqd->queue->node); 3067 - if (!ioc) 3068 - return NULL; 3069 - 3070 - retry: 3071 - cic = cfq_cic_lookup(cfqd, ioc); 3072 - if (cic) 3073 - goto out; 3074 - 3075 - cic = cfq_alloc_io_context(cfqd, gfp_mask); 3076 - if (cic == NULL) 3077 - goto err; 3078 - 3079 - ret = cfq_cic_link(cfqd, ioc, cic, gfp_mask); 3080 - if (ret == -EEXIST) { 3081 - /* someone has linked cic to ioc already */ 3082 - cfq_cic_free(cic); 3083 - goto retry; 3084 - } else if (ret) 3085 - goto err_free; 3086 - 3087 - out: 3088 - smp_read_barrier_depends(); 3089 - if (unlikely(ioc->ioprio_changed)) 3090 - cfq_ioc_set_ioprio(ioc); 3091 - 3092 - #ifdef CONFIG_CFQ_GROUP_IOSCHED 3093 - if (unlikely(ioc->cgroup_changed)) 3094 - cfq_ioc_set_cgroup(ioc); 3095 - #endif 3096 - return cic; 3097 - err_free: 3098 - cfq_cic_free(cic); 3099 - err: 3100 - put_io_context(ioc); 3101 - return NULL; 3102 - } 3103 - 3104 3103 static void 3105 3104 __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle) 3106 3105 { ··· 2960 3267 2961 3268 static void 2962 3269 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2963 - struct cfq_io_context *cic) 3270 + struct cfq_io_cq *cic) 2964 3271 { 2965 3272 if (cfq_cfqq_sync(cfqq)) { 2966 3273 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); ··· 2998 3305 */ 2999 3306 static void 3000 3307 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3001 - struct cfq_io_context *cic) 3308 + struct cfq_io_cq *cic) 3002 3309 { 3003 3310 int old_idle, enable_idle; 3004 3311 ··· 3015 3322 3016 3323 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) 3017 3324 enable_idle = 0; 3018 - else if 
(!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3019 - (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3325 + else if (!atomic_read(&cic->icq.ioc->nr_tasks) || 3326 + !cfqd->cfq_slice_idle || 3327 + (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3020 3328 enable_idle = 0; 3021 3329 else if (sample_valid(cic->ttime.ttime_samples)) { 3022 3330 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) ··· 3149 3455 cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3150 3456 struct request *rq) 3151 3457 { 3152 - struct cfq_io_context *cic = RQ_CIC(rq); 3458 + struct cfq_io_cq *cic = RQ_CIC(rq); 3153 3459 3154 3460 cfqd->rq_queued++; 3155 3461 if (rq->cmd_flags & REQ_PRIO) ··· 3202 3508 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3203 3509 3204 3510 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3205 - cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); 3511 + cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc); 3206 3512 3207 3513 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3208 3514 list_add_tail(&rq->queuelist, &cfqq->fifo); ··· 3252 3558 3253 3559 static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3254 3560 { 3255 - struct cfq_io_context *cic = cfqd->active_cic; 3561 + struct cfq_io_cq *cic = cfqd->active_cic; 3256 3562 3257 3563 /* If the queue already has requests, don't wait */ 3258 3564 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) ··· 3389 3695 { 3390 3696 struct cfq_data *cfqd = q->elevator->elevator_data; 3391 3697 struct task_struct *tsk = current; 3392 - struct cfq_io_context *cic; 3698 + struct cfq_io_cq *cic; 3393 3699 struct cfq_queue *cfqq; 3394 3700 3395 3701 /* ··· 3404 3710 3405 3711 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3406 3712 if (cfqq) { 3407 - cfq_init_prio_data(cfqq, cic->ioc); 3713 + cfq_init_prio_data(cfqq, cic->icq.ioc); 3408 3714 3409 3715 return __cfq_may_queue(cfqq); 3410 3716 } ··· 3425 3731 BUG_ON(!cfqq->allocated[rw]); 3426 3732 cfqq->allocated[rw]--; 3427 3733 3428 - 
put_io_context(RQ_CIC(rq)->ioc); 3429 - 3430 - rq->elevator_private[0] = NULL; 3431 - rq->elevator_private[1] = NULL; 3432 - 3433 3734 /* Put down rq reference on cfqg */ 3434 3735 cfq_put_cfqg(RQ_CFQG(rq)); 3435 - rq->elevator_private[2] = NULL; 3736 + rq->elv.priv[0] = NULL; 3737 + rq->elv.priv[1] = NULL; 3436 3738 3437 3739 cfq_put_queue(cfqq); 3438 3740 } 3439 3741 } 3440 3742 3441 3743 static struct cfq_queue * 3442 - cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, 3744 + cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic, 3443 3745 struct cfq_queue *cfqq) 3444 3746 { 3445 3747 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); ··· 3450 3760 * was the last process referring to said cfqq. 3451 3761 */ 3452 3762 static struct cfq_queue * 3453 - split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) 3763 + split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) 3454 3764 { 3455 3765 if (cfqq_process_refs(cfqq) == 1) { 3456 3766 cfqq->pid = current->pid; ··· 3473 3783 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 3474 3784 { 3475 3785 struct cfq_data *cfqd = q->elevator->elevator_data; 3476 - struct cfq_io_context *cic; 3786 + struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); 3477 3787 const int rw = rq_data_dir(rq); 3478 3788 const bool is_sync = rq_is_sync(rq); 3479 3789 struct cfq_queue *cfqq; 3480 - unsigned long flags; 3481 3790 3482 3791 might_sleep_if(gfp_mask & __GFP_WAIT); 3483 3792 3484 - cic = cfq_get_io_context(cfqd, gfp_mask); 3793 + spin_lock_irq(q->queue_lock); 3485 3794 3486 - spin_lock_irqsave(q->queue_lock, flags); 3487 - 3488 - if (!cic) 3489 - goto queue_fail; 3795 + /* handle changed notifications */ 3796 + if (unlikely(cic->icq.changed)) { 3797 + if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed)) 3798 + changed_ioprio(cic); 3799 + #ifdef CONFIG_CFQ_GROUP_IOSCHED 3800 + if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed)) 3801 + 
changed_cgroup(cic); 3802 + #endif 3803 + } 3490 3804 3491 3805 new_queue: 3492 3806 cfqq = cic_to_cfqq(cic, is_sync); 3493 3807 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3494 - cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3808 + cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask); 3495 3809 cic_set_cfqq(cic, cfqq, is_sync); 3496 3810 } else { 3497 3811 /* ··· 3521 3827 cfqq->allocated[rw]++; 3522 3828 3523 3829 cfqq->ref++; 3524 - rq->elevator_private[0] = cic; 3525 - rq->elevator_private[1] = cfqq; 3526 - rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg); 3527 - spin_unlock_irqrestore(q->queue_lock, flags); 3830 + rq->elv.priv[0] = cfqq; 3831 + rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg); 3832 + spin_unlock_irq(q->queue_lock); 3528 3833 return 0; 3529 - 3530 - queue_fail: 3531 - cfq_schedule_dispatch(cfqd); 3532 - spin_unlock_irqrestore(q->queue_lock, flags); 3533 - cfq_log(cfqd, "set_request fail"); 3534 - return 1; 3535 3834 } 3536 3835 3537 3836 static void cfq_kick_queue(struct work_struct *work) ··· 3628 3941 if (cfqd->active_queue) 3629 3942 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3630 3943 3631 - while (!list_empty(&cfqd->cic_list)) { 3632 - struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, 3633 - struct cfq_io_context, 3634 - queue_list); 3635 - 3636 - __cfq_exit_single_io_context(cfqd, cic); 3637 - } 3638 - 3639 3944 cfq_put_async_queues(cfqd); 3640 3945 cfq_release_cfq_groups(cfqd); 3641 3946 ··· 3641 3962 spin_unlock_irq(q->queue_lock); 3642 3963 3643 3964 cfq_shutdown_timer_wq(cfqd); 3644 - 3645 - spin_lock(&cic_index_lock); 3646 - ida_remove(&cic_index_ida, cfqd->cic_index); 3647 - spin_unlock(&cic_index_lock); 3648 3965 3649 3966 /* 3650 3967 * Wait for cfqg->blkg->key accessors to exit their grace periods. 
··· 3663 3988 kfree(cfqd); 3664 3989 } 3665 3990 3666 - static int cfq_alloc_cic_index(void) 3667 - { 3668 - int index, error; 3669 - 3670 - do { 3671 - if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) 3672 - return -ENOMEM; 3673 - 3674 - spin_lock(&cic_index_lock); 3675 - error = ida_get_new(&cic_index_ida, &index); 3676 - spin_unlock(&cic_index_lock); 3677 - if (error && error != -EAGAIN) 3678 - return error; 3679 - } while (error); 3680 - 3681 - return index; 3682 - } 3683 - 3684 3991 static void *cfq_init_queue(struct request_queue *q) 3685 3992 { 3686 3993 struct cfq_data *cfqd; ··· 3670 4013 struct cfq_group *cfqg; 3671 4014 struct cfq_rb_root *st; 3672 4015 3673 - i = cfq_alloc_cic_index(); 3674 - if (i < 0) 3675 - return NULL; 3676 - 3677 4016 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3678 - if (!cfqd) { 3679 - spin_lock(&cic_index_lock); 3680 - ida_remove(&cic_index_ida, i); 3681 - spin_unlock(&cic_index_lock); 4017 + if (!cfqd) 3682 4018 return NULL; 3683 - } 3684 - 3685 - /* 3686 - * Don't need take queue_lock in the routine, since we are 3687 - * initializing the ioscheduler, and nobody is using cfqd 3688 - */ 3689 - cfqd->cic_index = i; 3690 4019 3691 4020 /* Init root service tree */ 3692 4021 cfqd->grp_service_tree = CFQ_RB_ROOT; ··· 3698 4055 3699 4056 if (blkio_alloc_blkg_stats(&cfqg->blkg)) { 3700 4057 kfree(cfqg); 3701 - 3702 - spin_lock(&cic_index_lock); 3703 - ida_remove(&cic_index_ida, cfqd->cic_index); 3704 - spin_unlock(&cic_index_lock); 3705 - 3706 4058 kfree(cfqd); 3707 4059 return NULL; 3708 4060 } ··· 3729 4091 cfqd->oom_cfqq.ref++; 3730 4092 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 3731 4093 3732 - INIT_LIST_HEAD(&cfqd->cic_list); 3733 - 3734 4094 cfqd->queue = q; 3735 4095 3736 4096 init_timer(&cfqd->idle_slice_timer); ··· 3755 4119 */ 3756 4120 cfqd->last_delayed_sync = jiffies - HZ; 3757 4121 return cfqd; 3758 - } 3759 - 3760 - static void cfq_slab_kill(void) 3761 - { 3762 - /* 3763 - * Caller 
already ensured that pending RCU callbacks are completed, 3764 - * so we should have no busy allocations at this point. 3765 - */ 3766 - if (cfq_pool) 3767 - kmem_cache_destroy(cfq_pool); 3768 - if (cfq_ioc_pool) 3769 - kmem_cache_destroy(cfq_ioc_pool); 3770 - } 3771 - 3772 - static int __init cfq_slab_setup(void) 3773 - { 3774 - cfq_pool = KMEM_CACHE(cfq_queue, 0); 3775 - if (!cfq_pool) 3776 - goto fail; 3777 - 3778 - cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0); 3779 - if (!cfq_ioc_pool) 3780 - goto fail; 3781 - 3782 - return 0; 3783 - fail: 3784 - cfq_slab_kill(); 3785 - return -ENOMEM; 3786 4122 } 3787 4123 3788 4124 /* ··· 3862 4254 .elevator_completed_req_fn = cfq_completed_request, 3863 4255 .elevator_former_req_fn = elv_rb_former_request, 3864 4256 .elevator_latter_req_fn = elv_rb_latter_request, 4257 + .elevator_init_icq_fn = cfq_init_icq, 4258 + .elevator_exit_icq_fn = cfq_exit_icq, 3865 4259 .elevator_set_req_fn = cfq_set_request, 3866 4260 .elevator_put_req_fn = cfq_put_request, 3867 4261 .elevator_may_queue_fn = cfq_may_queue, 3868 4262 .elevator_init_fn = cfq_init_queue, 3869 4263 .elevator_exit_fn = cfq_exit_queue, 3870 - .trim = cfq_free_io_context, 3871 4264 }, 4265 + .icq_size = sizeof(struct cfq_io_cq), 4266 + .icq_align = __alignof__(struct cfq_io_cq), 3872 4267 .elevator_attrs = cfq_attrs, 3873 - .elevator_name = "cfq", 4268 + .elevator_name = "cfq", 3874 4269 .elevator_owner = THIS_MODULE, 3875 4270 }; 3876 4271 ··· 3891 4280 3892 4281 static int __init cfq_init(void) 3893 4282 { 4283 + int ret; 4284 + 3894 4285 /* 3895 4286 * could be 0 on HZ < 1000 setups 3896 4287 */ ··· 3907 4294 #else 3908 4295 cfq_group_idle = 0; 3909 4296 #endif 3910 - if (cfq_slab_setup()) 4297 + cfq_pool = KMEM_CACHE(cfq_queue, 0); 4298 + if (!cfq_pool) 3911 4299 return -ENOMEM; 3912 4300 3913 - elv_register(&iosched_cfq); 4301 + ret = elv_register(&iosched_cfq); 4302 + if (ret) { 4303 + kmem_cache_destroy(cfq_pool); 4304 + return ret; 4305 + } 4306 + 3914 4307 
blkio_policy_register(&blkio_policy_cfq); 3915 4308 3916 4309 return 0; ··· 3924 4305 3925 4306 static void __exit cfq_exit(void) 3926 4307 { 3927 - DECLARE_COMPLETION_ONSTACK(all_gone); 3928 4308 blkio_policy_unregister(&blkio_policy_cfq); 3929 4309 elv_unregister(&iosched_cfq); 3930 - ioc_gone = &all_gone; 3931 - /* ioc_gone's update must be visible before reading ioc_count */ 3932 - smp_wmb(); 3933 - 3934 - /* 3935 - * this also protects us from entering cfq_slab_kill() with 3936 - * pending RCU callbacks 3937 - */ 3938 - if (elv_ioc_count_read(cfq_ioc_count)) 3939 - wait_for_completion(&all_gone); 3940 - ida_destroy(&cic_index_ida); 3941 - cfq_slab_kill(); 4310 + kmem_cache_destroy(cfq_pool); 3942 4311 } 3943 4312 3944 4313 module_init(cfq_init);

+3

block/compat_ioctl.c

··· 719 719 case BLKSECTGET: 720 720 return compat_put_ushort(arg, 721 721 queue_max_sectors(bdev_get_queue(bdev))); 722 + case BLKROTATIONAL: 723 + return compat_put_ushort(arg, 724 + !blk_queue_nonrot(bdev_get_queue(bdev))); 722 725 case BLKRASET: /* compatible, but no compat_ptr (!) */ 723 726 case BLKFRASET: 724 727 if (!capable(CAP_SYS_ADMIN))

+1 -3

block/deadline-iosched.c

··· 448 448 449 449 static int __init deadline_init(void) 450 450 { 451 - elv_register(&iosched_deadline); 452 - 453 - return 0; 451 + return elv_register(&iosched_deadline); 454 452 } 455 453 456 454 static void __exit deadline_exit(void)

+108 -111

block/elevator.c

··· 61 61 struct request_queue *q = rq->q; 62 62 struct elevator_queue *e = q->elevator; 63 63 64 - if (e->ops->elevator_allow_merge_fn) 65 - return e->ops->elevator_allow_merge_fn(q, rq, bio); 64 + if (e->type->ops.elevator_allow_merge_fn) 65 + return e->type->ops.elevator_allow_merge_fn(q, rq, bio); 66 66 67 67 return 1; 68 68 } ··· 168 168 return e; 169 169 } 170 170 171 - static void *elevator_init_queue(struct request_queue *q, 172 - struct elevator_queue *eq) 171 + static int elevator_init_queue(struct request_queue *q, 172 + struct elevator_queue *eq) 173 173 { 174 - return eq->ops->elevator_init_fn(q); 175 - } 176 - 177 - static void elevator_attach(struct request_queue *q, struct elevator_queue *eq, 178 - void *data) 179 - { 180 - q->elevator = eq; 181 - eq->elevator_data = data; 174 + eq->elevator_data = eq->type->ops.elevator_init_fn(q); 175 + if (eq->elevator_data) 176 + return 0; 177 + return -ENOMEM; 182 178 } 183 179 184 180 static char chosen_elevator[ELV_NAME_MAX]; ··· 203 207 if (unlikely(!eq)) 204 208 goto err; 205 209 206 - eq->ops = &e->ops; 207 - eq->elevator_type = e; 210 + eq->type = e; 208 211 kobject_init(&eq->kobj, &elv_ktype); 209 212 mutex_init(&eq->sysfs_lock); 210 213 ··· 227 232 struct elevator_queue *e; 228 233 229 234 e = container_of(kobj, struct elevator_queue, kobj); 230 - elevator_put(e->elevator_type); 235 + elevator_put(e->type); 231 236 kfree(e->hash); 232 237 kfree(e); 233 238 } ··· 236 241 { 237 242 struct elevator_type *e = NULL; 238 243 struct elevator_queue *eq; 239 - void *data; 244 + int err; 240 245 241 246 if (unlikely(q->elevator)) 242 247 return 0; ··· 273 278 if (!eq) 274 279 return -ENOMEM; 275 280 276 - data = elevator_init_queue(q, eq); 277 - if (!data) { 281 + err = elevator_init_queue(q, eq); 282 + if (err) { 278 283 kobject_put(&eq->kobj); 279 - return -ENOMEM; 284 + return err; 280 285 } 281 286 282 - elevator_attach(q, eq, data); 287 + q->elevator = eq; 283 288 return 0; 284 289 } 285 290 
EXPORT_SYMBOL(elevator_init); ··· 287 292 void elevator_exit(struct elevator_queue *e) 288 293 { 289 294 mutex_lock(&e->sysfs_lock); 290 - if (e->ops->elevator_exit_fn) 291 - e->ops->elevator_exit_fn(e); 292 - e->ops = NULL; 295 + if (e->type->ops.elevator_exit_fn) 296 + e->type->ops.elevator_exit_fn(e); 293 297 mutex_unlock(&e->sysfs_lock); 294 298 295 299 kobject_put(&e->kobj); ··· 498 504 return ELEVATOR_BACK_MERGE; 499 505 } 500 506 501 - if (e->ops->elevator_merge_fn) 502 - return e->ops->elevator_merge_fn(q, req, bio); 507 + if (e->type->ops.elevator_merge_fn) 508 + return e->type->ops.elevator_merge_fn(q, req, bio); 503 509 504 510 return ELEVATOR_NO_MERGE; 505 511 } ··· 542 548 { 543 549 struct elevator_queue *e = q->elevator; 544 550 545 - if (e->ops->elevator_merged_fn) 546 - e->ops->elevator_merged_fn(q, rq, type); 551 + if (e->type->ops.elevator_merged_fn) 552 + e->type->ops.elevator_merged_fn(q, rq, type); 547 553 548 554 if (type == ELEVATOR_BACK_MERGE) 549 555 elv_rqhash_reposition(q, rq); ··· 557 563 struct elevator_queue *e = q->elevator; 558 564 const int next_sorted = next->cmd_flags & REQ_SORTED; 559 565 560 - if (next_sorted && e->ops->elevator_merge_req_fn) 561 - e->ops->elevator_merge_req_fn(q, rq, next); 566 + if (next_sorted && e->type->ops.elevator_merge_req_fn) 567 + e->type->ops.elevator_merge_req_fn(q, rq, next); 562 568 563 569 elv_rqhash_reposition(q, rq); 564 570 ··· 575 581 { 576 582 struct elevator_queue *e = q->elevator; 577 583 578 - if (e->ops->elevator_bio_merged_fn) 579 - e->ops->elevator_bio_merged_fn(q, rq, bio); 584 + if (e->type->ops.elevator_bio_merged_fn) 585 + e->type->ops.elevator_bio_merged_fn(q, rq, bio); 580 586 } 581 587 582 588 void elv_requeue_request(struct request_queue *q, struct request *rq) ··· 602 608 603 609 lockdep_assert_held(q->queue_lock); 604 610 605 - while (q->elevator->ops->elevator_dispatch_fn(q, 1)) 611 + while (q->elevator->type->ops.elevator_dispatch_fn(q, 1)) 606 612 ; 607 613 if (q->nr_sorted 
&& printed++ < 10) { 608 614 printk(KERN_ERR "%s: forced dispatching is broken " 609 615 "(nr_sorted=%u), please report this\n", 610 - q->elevator->elevator_type->elevator_name, q->nr_sorted); 616 + q->elevator->type->elevator_name, q->nr_sorted); 611 617 } 612 618 } 613 619 ··· 696 702 * rq cannot be accessed after calling 697 703 * elevator_add_req_fn. 698 704 */ 699 - q->elevator->ops->elevator_add_req_fn(q, rq); 705 + q->elevator->type->ops.elevator_add_req_fn(q, rq); 700 706 break; 701 707 702 708 case ELEVATOR_INSERT_FLUSH: ··· 725 731 { 726 732 struct elevator_queue *e = q->elevator; 727 733 728 - if (e->ops->elevator_latter_req_fn) 729 - return e->ops->elevator_latter_req_fn(q, rq); 734 + if (e->type->ops.elevator_latter_req_fn) 735 + return e->type->ops.elevator_latter_req_fn(q, rq); 730 736 return NULL; 731 737 } 732 738 ··· 734 740 { 735 741 struct elevator_queue *e = q->elevator; 736 742 737 - if (e->ops->elevator_former_req_fn) 738 - return e->ops->elevator_former_req_fn(q, rq); 743 + if (e->type->ops.elevator_former_req_fn) 744 + return e->type->ops.elevator_former_req_fn(q, rq); 739 745 return NULL; 740 746 } 741 747 ··· 743 749 { 744 750 struct elevator_queue *e = q->elevator; 745 751 746 - if (e->ops->elevator_set_req_fn) 747 - return e->ops->elevator_set_req_fn(q, rq, gfp_mask); 748 - 749 - rq->elevator_private[0] = NULL; 752 + if (e->type->ops.elevator_set_req_fn) 753 + return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask); 750 754 return 0; 751 755 } 752 756 ··· 752 760 { 753 761 struct elevator_queue *e = q->elevator; 754 762 755 - if (e->ops->elevator_put_req_fn) 756 - e->ops->elevator_put_req_fn(rq); 763 + if (e->type->ops.elevator_put_req_fn) 764 + e->type->ops.elevator_put_req_fn(rq); 757 765 } 758 766 759 767 int elv_may_queue(struct request_queue *q, int rw) 760 768 { 761 769 struct elevator_queue *e = q->elevator; 762 770 763 - if (e->ops->elevator_may_queue_fn) 764 - return e->ops->elevator_may_queue_fn(q, rw); 771 + if 
(e->type->ops.elevator_may_queue_fn) 772 + return e->type->ops.elevator_may_queue_fn(q, rw); 765 773 766 774 return ELV_MQUEUE_MAY; 767 775 } ··· 796 804 if (blk_account_rq(rq)) { 797 805 q->in_flight[rq_is_sync(rq)]--; 798 806 if ((rq->cmd_flags & REQ_SORTED) && 799 - e->ops->elevator_completed_req_fn) 800 - e->ops->elevator_completed_req_fn(q, rq); 807 + e->type->ops.elevator_completed_req_fn) 808 + e->type->ops.elevator_completed_req_fn(q, rq); 801 809 } 802 810 } 803 811 ··· 815 823 816 824 e = container_of(kobj, struct elevator_queue, kobj); 817 825 mutex_lock(&e->sysfs_lock); 818 - error = e->ops ? entry->show(e, page) : -ENOENT; 826 + error = e->type ? entry->show(e, page) : -ENOENT; 819 827 mutex_unlock(&e->sysfs_lock); 820 828 return error; 821 829 } ··· 833 841 834 842 e = container_of(kobj, struct elevator_queue, kobj); 835 843 mutex_lock(&e->sysfs_lock); 836 - error = e->ops ? entry->store(e, page, length) : -ENOENT; 844 + error = e->type ? entry->store(e, page, length) : -ENOENT; 837 845 mutex_unlock(&e->sysfs_lock); 838 846 return error; 839 847 } ··· 848 856 .release = elevator_release, 849 857 }; 850 858 851 - int elv_register_queue(struct request_queue *q) 859 + int __elv_register_queue(struct request_queue *q, struct elevator_queue *e) 852 860 { 853 - struct elevator_queue *e = q->elevator; 854 861 int error; 855 862 856 863 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); 857 864 if (!error) { 858 - struct elv_fs_entry *attr = e->elevator_type->elevator_attrs; 865 + struct elv_fs_entry *attr = e->type->elevator_attrs; 859 866 if (attr) { 860 867 while (attr->attr.name) { 861 868 if (sysfs_create_file(&e->kobj, &attr->attr)) ··· 867 876 } 868 877 return error; 869 878 } 870 - EXPORT_SYMBOL(elv_register_queue); 871 879 872 - static void __elv_unregister_queue(struct elevator_queue *e) 880 + int elv_register_queue(struct request_queue *q) 873 881 { 874 - kobject_uevent(&e->kobj, KOBJ_REMOVE); 875 - kobject_del(&e->kobj); 876 - e->registered 
= 0; 882 + return __elv_register_queue(q, q->elevator); 877 883 } 884 + EXPORT_SYMBOL(elv_register_queue); 878 885 879 886 void elv_unregister_queue(struct request_queue *q) 880 887 { 881 - if (q) 882 - __elv_unregister_queue(q->elevator); 888 + if (q) { 889 + struct elevator_queue *e = q->elevator; 890 + 891 + kobject_uevent(&e->kobj, KOBJ_REMOVE); 892 + kobject_del(&e->kobj); 893 + e->registered = 0; 894 + } 883 895 } 884 896 EXPORT_SYMBOL(elv_unregister_queue); 885 897 886 - void elv_register(struct elevator_type *e) 898 + int elv_register(struct elevator_type *e) 887 899 { 888 900 char *def = ""; 889 901 902 + /* create icq_cache if requested */ 903 + if (e->icq_size) { 904 + if (WARN_ON(e->icq_size < sizeof(struct io_cq)) || 905 + WARN_ON(e->icq_align < __alignof__(struct io_cq))) 906 + return -EINVAL; 907 + 908 + snprintf(e->icq_cache_name, sizeof(e->icq_cache_name), 909 + "%s_io_cq", e->elevator_name); 910 + e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size, 911 + e->icq_align, 0, NULL); 912 + if (!e->icq_cache) 913 + return -ENOMEM; 914 + } 915 + 916 + /* register, don't allow duplicate names */ 890 917 spin_lock(&elv_list_lock); 891 - BUG_ON(elevator_find(e->elevator_name)); 918 + if (elevator_find(e->elevator_name)) { 919 + spin_unlock(&elv_list_lock); 920 + if (e->icq_cache) 921 + kmem_cache_destroy(e->icq_cache); 922 + return -EBUSY; 923 + } 892 924 list_add_tail(&e->list, &elv_list); 893 925 spin_unlock(&elv_list_lock); 894 926 927 + /* print pretty message */ 895 928 if (!strcmp(e->elevator_name, chosen_elevator) || 896 929 (!*chosen_elevator && 897 930 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) ··· 923 908 924 909 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, 925 910 def); 911 + return 0; 926 912 } 927 913 EXPORT_SYMBOL_GPL(elv_register); 928 914 929 915 void elv_unregister(struct elevator_type *e) 930 916 { 931 - struct task_struct *g, *p; 932 - 933 - /* 934 - * Iterate every thread in the process to 
remove the io contexts. 935 - */ 936 - if (e->ops.trim) { 937 - read_lock(&tasklist_lock); 938 - do_each_thread(g, p) { 939 - task_lock(p); 940 - if (p->io_context) 941 - e->ops.trim(p->io_context); 942 - task_unlock(p); 943 - } while_each_thread(g, p); 944 - read_unlock(&tasklist_lock); 945 - } 946 - 917 + /* unregister */ 947 918 spin_lock(&elv_list_lock); 948 919 list_del_init(&e->list); 949 920 spin_unlock(&elv_list_lock); 921 + 922 + /* 923 + * Destroy icq_cache if it exists. icq's are RCU managed. Make 924 + * sure all RCU operations are complete before proceeding. 925 + */ 926 + if (e->icq_cache) { 927 + rcu_barrier(); 928 + kmem_cache_destroy(e->icq_cache); 929 + e->icq_cache = NULL; 930 + } 950 931 } 951 932 EXPORT_SYMBOL_GPL(elv_unregister); 952 933 ··· 955 944 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 956 945 { 957 946 struct elevator_queue *old_elevator, *e; 958 - void *data; 959 947 int err; 960 948 961 - /* 962 - * Allocate new elevator 963 - */ 949 + /* allocate new elevator */ 964 950 e = elevator_alloc(q, new_e); 965 951 if (!e) 966 952 return -ENOMEM; 967 953 968 - data = elevator_init_queue(q, e); 969 - if (!data) { 954 + err = elevator_init_queue(q, e); 955 + if (err) { 970 956 kobject_put(&e->kobj); 971 - return -ENOMEM; 957 + return err; 972 958 } 973 959 974 - /* 975 - * Turn on BYPASS and drain all requests w/ elevator private data 976 - */ 960 + /* turn on BYPASS and drain all requests w/ elevator private data */ 977 961 elv_quiesce_start(q); 978 962 979 - /* 980 - * Remember old elevator. 
981 - */ 982 - old_elevator = q->elevator; 983 - 984 - /* 985 - * attach and start new elevator 986 - */ 987 - spin_lock_irq(q->queue_lock); 988 - elevator_attach(q, e, data); 989 - spin_unlock_irq(q->queue_lock); 990 - 991 - if (old_elevator->registered) { 992 - __elv_unregister_queue(old_elevator); 993 - 994 - err = elv_register_queue(q); 963 + /* unregister old queue, register new one and kill old elevator */ 964 + if (q->elevator->registered) { 965 + elv_unregister_queue(q); 966 + err = __elv_register_queue(q, e); 995 967 if (err) 996 968 goto fail_register; 997 969 } 998 970 999 - /* 1000 - * finally exit old elevator and turn off BYPASS. 1001 - */ 971 + /* done, clear io_cq's, switch elevators and turn off BYPASS */ 972 + spin_lock_irq(q->queue_lock); 973 + ioc_clear_queue(q); 974 + old_elevator = q->elevator; 975 + q->elevator = e; 976 + spin_unlock_irq(q->queue_lock); 977 + 1002 978 elevator_exit(old_elevator); 1003 979 elv_quiesce_end(q); 1004 980 1005 - blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); 981 + blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name); 1006 982 1007 983 return 0; 1008 984 ··· 999 1001 * one again (along with re-adding the sysfs dir) 1000 1002 */ 1001 1003 elevator_exit(e); 1002 - q->elevator = old_elevator; 1003 1004 elv_register_queue(q); 1004 1005 elv_quiesce_end(q); 1005 1006 ··· 1023 1026 return -EINVAL; 1024 1027 } 1025 1028 1026 - if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { 1029 + if (!strcmp(elevator_name, q->elevator->type->elevator_name)) { 1027 1030 elevator_put(e); 1028 1031 return 0; 1029 1032 } ··· 1058 1061 if (!q->elevator || !blk_queue_stackable(q)) 1059 1062 return sprintf(name, "none\n"); 1060 1063 1061 - elv = e->elevator_type; 1064 + elv = e->type; 1062 1065 1063 1066 spin_lock(&elv_list_lock); 1064 1067 list_for_each_entry(__e, &elv_list, list) {

+1 -1

block/genhd.c

··· 614 614 * Take an extra ref on queue which will be put on disk_release() 615 615 * so that it sticks around as long as @disk is there. 616 616 */ 617 - WARN_ON_ONCE(blk_get_queue(disk->queue)); 617 + WARN_ON_ONCE(!blk_get_queue(disk->queue)); 618 618 619 619 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, 620 620 "bdi");

+2

block/ioctl.c

··· 296 296 return put_uint(arg, bdev_discard_zeroes_data(bdev)); 297 297 case BLKSECTGET: 298 298 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); 299 + case BLKROTATIONAL: 300 + return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev))); 299 301 case BLKRASET: 300 302 case BLKFRASET: 301 303 if(!capable(CAP_SYS_ADMIN))

+1 -3

block/noop-iosched.c

··· 94 94 95 95 static int __init noop_init(void) 96 96 { 97 - elv_register(&elevator_noop); 98 - 99 - return 0; 97 + return elv_register(&elevator_noop); 100 98 } 101 99 102 100 static void __exit noop_exit(void)

+8 -4

drivers/block/sx8.c

··· 619 619 host->state == HST_DEV_SCAN); 620 620 spin_unlock_irq(&host->lock); 621 621 622 - DPRINTK("blk_insert_request, tag == %u\n", idx); 623 - blk_insert_request(host->oob_q, crq->rq, 1, crq); 622 + DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); 623 + crq->rq->cmd_type = REQ_TYPE_SPECIAL; 624 + crq->rq->special = crq; 625 + blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); 624 626 625 627 return 0; 626 628 ··· 660 658 BUG_ON(rc < 0); 661 659 crq->msg_bucket = (u32) rc; 662 660 663 - DPRINTK("blk_insert_request, tag == %u\n", idx); 664 - blk_insert_request(host->oob_q, crq->rq, 1, crq); 661 + DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); 662 + crq->rq->cmd_type = REQ_TYPE_SPECIAL; 663 + crq->rq->special = crq; 664 + blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); 665 665 666 666 return 0; 667 667 }

+3 -3

drivers/md/dm-table.c

··· 699 699 while (i < dm_table_get_num_targets(table)) { 700 700 ti = dm_table_get_target(table, i++); 701 701 702 - blk_set_default_limits(&ti_limits); 702 + blk_set_stacking_limits(&ti_limits); 703 703 704 704 /* combine all target devices' limits */ 705 705 if (ti->type->iterate_devices) ··· 1221 1221 struct queue_limits ti_limits; 1222 1222 unsigned i = 0; 1223 1223 1224 - blk_set_default_limits(limits); 1224 + blk_set_stacking_limits(limits); 1225 1225 1226 1226 while (i < dm_table_get_num_targets(table)) { 1227 - blk_set_default_limits(&ti_limits); 1227 + blk_set_stacking_limits(&ti_limits); 1228 1228 1229 1229 ti = dm_table_get_target(table, i++); 1230 1230

+1

drivers/md/md.c

··· 4666 4666 mddev->queue->queuedata = mddev; 4667 4667 4668 4668 blk_queue_make_request(mddev->queue, md_make_request); 4669 + blk_set_stacking_limits(&mddev->queue->limits); 4669 4670 4670 4671 disk = alloc_disk(1 << shift); 4671 4672 if (!disk) {

+1 -1

drivers/scsi/scsi_scan.c

··· 297 297 kfree(sdev); 298 298 goto out; 299 299 } 300 - blk_get_queue(sdev->request_queue); 300 + WARN_ON_ONCE(!blk_get_queue(sdev->request_queue)); 301 301 sdev->request_queue->queuedata = sdev; 302 302 scsi_adjust_queue_depth(sdev, 0, sdev->host->cmd_per_lun); 303 303

+4 -20

fs/ioprio.c

··· 48 48 if (err) 49 49 return err; 50 50 51 - task_lock(task); 52 - do { 53 - ioc = task->io_context; 54 - /* see wmb() in current_io_context() */ 55 - smp_read_barrier_depends(); 56 - if (ioc) 57 - break; 58 - 59 - ioc = alloc_io_context(GFP_ATOMIC, -1); 60 - if (!ioc) { 61 - err = -ENOMEM; 62 - break; 63 - } 64 - task->io_context = ioc; 65 - } while (1); 66 - 67 - if (!err) { 68 - ioc->ioprio = ioprio; 69 - ioc->ioprio_changed = 1; 51 + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); 52 + if (ioc) { 53 + ioc_ioprio_changed(ioc, ioprio); 54 + put_io_context(ioc, NULL); 70 55 } 71 56 72 - task_unlock(task); 73 57 return err; 74 58 } 75 59 EXPORT_SYMBOL_GPL(set_task_ioprio);

-4

fs/mpage.c

··· 371 371 sector_t last_block_in_bio = 0; 372 372 struct buffer_head map_bh; 373 373 unsigned long first_logical_block = 0; 374 - struct blk_plug plug; 375 - 376 - blk_start_plug(&plug); 377 374 378 375 map_bh.b_state = 0; 379 376 map_bh.b_size = 0; ··· 392 395 BUG_ON(!list_empty(pages)); 393 396 if (bio) 394 397 mpage_bio_submit(READ, bio); 395 - blk_finish_plug(&plug); 396 398 return 0; 397 399 } 398 400 EXPORT_SYMBOL(mpage_readpages);

+53 -13

include/linux/bio.h

··· 515 515 516 516 #else /* CONFIG_BLK_DEV_INTEGRITY */ 517 517 518 - #define bio_integrity(a) (0) 519 - #define bioset_integrity_create(a, b) (0) 520 - #define bio_integrity_prep(a) (0) 521 - #define bio_integrity_enabled(a) (0) 518 + static inline int bio_integrity(struct bio *bio) 519 + { 520 + return 0; 521 + } 522 + 523 + static inline int bio_integrity_enabled(struct bio *bio) 524 + { 525 + return 0; 526 + } 527 + 528 + static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) 529 + { 530 + return 0; 531 + } 532 + 533 + static inline void bioset_integrity_free (struct bio_set *bs) 534 + { 535 + return; 536 + } 537 + 538 + static inline int bio_integrity_prep(struct bio *bio) 539 + { 540 + return 0; 541 + } 542 + 543 + static inline void bio_integrity_free(struct bio *bio, struct bio_set *bs) 544 + { 545 + return; 546 + } 547 + 522 548 static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, 523 549 gfp_t gfp_mask, struct bio_set *bs) 524 550 { 525 551 return 0; 526 552 } 527 - #define bioset_integrity_free(a) do { } while (0) 528 - #define bio_integrity_free(a, b) do { } while (0) 529 - #define bio_integrity_endio(a, b) do { } while (0) 530 - #define bio_integrity_advance(a, b) do { } while (0) 531 - #define bio_integrity_trim(a, b, c) do { } while (0) 532 - #define bio_integrity_split(a, b, c) do { } while (0) 533 - #define bio_integrity_set_tag(a, b, c) do { } while (0) 534 - #define bio_integrity_get_tag(a, b, c) do { } while (0) 535 - #define bio_integrity_init(a) do { } while (0) 553 + 554 + static inline void bio_integrity_split(struct bio *bio, struct bio_pair *bp, 555 + int sectors) 556 + { 557 + return; 558 + } 559 + 560 + static inline void bio_integrity_advance(struct bio *bio, 561 + unsigned int bytes_done) 562 + { 563 + return; 564 + } 565 + 566 + static inline void bio_integrity_trim(struct bio *bio, unsigned int offset, 567 + unsigned int sectors) 568 + { 569 + return; 570 + } 571 + 572 + static inline 
void bio_integrity_init(void) 573 + { 574 + return; 575 + } 536 576 537 577 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 538 578

+84 -17

include/linux/blkdev.h

··· 111 111 * Three pointers are available for the IO schedulers, if they need 112 112 * more they have to dynamically allocate it. Flush requests are 113 113 * never put on the IO scheduler. So let the flush fields share 114 - * space with the three elevator_private pointers. 114 + * space with the elevator data. 115 115 */ 116 116 union { 117 - void *elevator_private[3]; 117 + struct { 118 + struct io_cq *icq; 119 + void *priv[2]; 120 + } elv; 121 + 118 122 struct { 119 123 unsigned int seq; 120 124 struct list_head list; ··· 315 311 unsigned long queue_flags; 316 312 317 313 /* 314 + * ida allocated id for this queue. Used to index queues from 315 + * ioctx. 316 + */ 317 + int id; 318 + 319 + /* 318 320 * queue needs bounce pages for pages above this limit 319 321 */ 320 322 gfp_t bounce_gfp; ··· 361 351 struct timer_list timeout; 362 352 struct list_head timeout_list; 363 353 354 + struct list_head icq_list; 355 + 364 356 struct queue_limits limits; 365 357 366 358 /* ··· 398 386 #ifdef CONFIG_BLK_DEV_THROTTLING 399 387 /* Throttle data */ 400 388 struct throtl_data *td; 389 + #endif 390 + #ifdef CONFIG_LOCKDEP 391 + int ioc_release_depth; 401 392 #endif 402 393 }; 403 394 ··· 496 481 497 482 #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) 498 483 #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) 484 + #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) 499 485 #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) 500 486 #define blk_queue_noxmerges(q) \ 501 487 test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) ··· 676 660 extern struct request *blk_get_request(struct request_queue *, int, gfp_t); 677 661 extern struct request *blk_make_request(struct request_queue *, struct bio *, 678 662 gfp_t); 679 - extern void blk_insert_request(struct request_queue *, struct request *, int, void *); 680 663 extern void blk_requeue_request(struct request_queue *, struct 
request *); 681 664 extern void blk_add_request_payload(struct request *rq, struct page *page, 682 665 unsigned int len); ··· 844 829 extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); 845 830 extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); 846 831 extern void blk_set_default_limits(struct queue_limits *lim); 832 + extern void blk_set_stacking_limits(struct queue_limits *lim); 847 833 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, 848 834 sector_t offset); 849 835 extern int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, ··· 875 859 extern void blk_dump_rq_flags(struct request *, char *); 876 860 extern long nr_blockdev_pages(void); 877 861 878 - int blk_get_queue(struct request_queue *); 862 + bool __must_check blk_get_queue(struct request_queue *); 879 863 struct request_queue *blk_alloc_queue(gfp_t); 880 864 struct request_queue *blk_alloc_queue_node(gfp_t, int); 881 865 extern void blk_put_queue(struct request_queue *); ··· 1298 1282 1299 1283 #else /* CONFIG_BLK_DEV_INTEGRITY */ 1300 1284 1301 - #define blk_integrity_rq(rq) (0) 1302 - #define blk_rq_count_integrity_sg(a, b) (0) 1303 - #define blk_rq_map_integrity_sg(a, b, c) (0) 1304 - #define bdev_get_integrity(a) (0) 1305 - #define blk_get_integrity(a) (0) 1306 - #define blk_integrity_compare(a, b) (0) 1307 - #define blk_integrity_register(a, b) (0) 1308 - #define blk_integrity_unregister(a) do { } while (0) 1309 - #define blk_queue_max_integrity_segments(a, b) do { } while (0) 1310 - #define queue_max_integrity_segments(a) (0) 1311 - #define blk_integrity_merge_rq(a, b, c) (0) 1312 - #define blk_integrity_merge_bio(a, b, c) (0) 1313 - #define blk_integrity_is_initialized(a) (0) 1285 + struct bio; 1286 + struct block_device; 1287 + struct gendisk; 1288 + struct blk_integrity; 1289 + 1290 + static inline int blk_integrity_rq(struct request *rq) 1291 + { 1292 + return 0; 1293 + } 1294 + static inline int 
blk_rq_count_integrity_sg(struct request_queue *q, 1295 + struct bio *b) 1296 + { 1297 + return 0; 1298 + } 1299 + static inline int blk_rq_map_integrity_sg(struct request_queue *q, 1300 + struct bio *b, 1301 + struct scatterlist *s) 1302 + { 1303 + return 0; 1304 + } 1305 + static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) 1306 + { 1307 + return 0; 1308 + } 1309 + static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) 1310 + { 1311 + return NULL; 1312 + } 1313 + static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) 1314 + { 1315 + return 0; 1316 + } 1317 + static inline int blk_integrity_register(struct gendisk *d, 1318 + struct blk_integrity *b) 1319 + { 1320 + return 0; 1321 + } 1322 + static inline void blk_integrity_unregister(struct gendisk *d) 1323 + { 1324 + } 1325 + static inline void blk_queue_max_integrity_segments(struct request_queue *q, 1326 + unsigned int segs) 1327 + { 1328 + } 1329 + static inline unsigned short queue_max_integrity_segments(struct request_queue *q) 1330 + { 1331 + return 0; 1332 + } 1333 + static inline int blk_integrity_merge_rq(struct request_queue *rq, 1334 + struct request *r1, 1335 + struct request *r2) 1336 + { 1337 + return 0; 1338 + } 1339 + static inline int blk_integrity_merge_bio(struct request_queue *rq, 1340 + struct request *r, 1341 + struct bio *b) 1342 + { 1343 + return 0; 1344 + } 1345 + static inline bool blk_integrity_is_initialized(struct gendisk *g) 1346 + { 1347 + return 0; 1348 + } 1314 1349 1315 1350 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 1316 1351

+19 -22

include/linux/elevator.h

··· 5 5 6 6 #ifdef CONFIG_BLOCK 7 7 8 + struct io_cq; 9 + 8 10 typedef int (elevator_merge_fn) (struct request_queue *, struct request **, 9 11 struct bio *); 10 12 ··· 26 24 typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *); 27 25 typedef int (elevator_may_queue_fn) (struct request_queue *, int); 28 26 27 + typedef void (elevator_init_icq_fn) (struct io_cq *); 28 + typedef void (elevator_exit_icq_fn) (struct io_cq *); 29 29 typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t); 30 30 typedef void (elevator_put_req_fn) (struct request *); 31 31 typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *); ··· 60 56 elevator_request_list_fn *elevator_former_req_fn; 61 57 elevator_request_list_fn *elevator_latter_req_fn; 62 58 59 + elevator_init_icq_fn *elevator_init_icq_fn; /* see iocontext.h */ 60 + elevator_exit_icq_fn *elevator_exit_icq_fn; /* ditto */ 61 + 63 62 elevator_set_req_fn *elevator_set_req_fn; 64 63 elevator_put_req_fn *elevator_put_req_fn; 65 64 ··· 70 63 71 64 elevator_init_fn *elevator_init_fn; 72 65 elevator_exit_fn *elevator_exit_fn; 73 - void (*trim)(struct io_context *); 74 66 }; 75 67 76 68 #define ELV_NAME_MAX (16) ··· 85 79 */ 86 80 struct elevator_type 87 81 { 88 - struct list_head list; 82 + /* managed by elevator core */ 83 + struct kmem_cache *icq_cache; 84 + 85 + /* fields provided by elevator implementation */ 89 86 struct elevator_ops ops; 87 + size_t icq_size; /* see iocontext.h */ 88 + size_t icq_align; /* ditto */ 90 89 struct elv_fs_entry *elevator_attrs; 91 90 char elevator_name[ELV_NAME_MAX]; 92 91 struct module *elevator_owner; 92 + 93 + /* managed by elevator core */ 94 + char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */ 95 + struct list_head list; 93 96 }; 94 97 95 98 /* ··· 106 91 */ 107 92 struct elevator_queue 108 93 { 109 - struct elevator_ops *ops; 94 + struct elevator_type *type; 110 95 void *elevator_data; 111 96 struct 
kobject kobj; 112 - struct elevator_type *elevator_type; 113 97 struct mutex sysfs_lock; 114 98 struct hlist_head *hash; 115 99 unsigned int registered:1; ··· 143 129 /* 144 130 * io scheduler registration 145 131 */ 146 - extern void elv_register(struct elevator_type *); 132 + extern int elv_register(struct elevator_type *); 147 133 extern void elv_unregister(struct elevator_type *); 148 134 149 135 /* ··· 210 196 list_del_init(&(rq)->queuelist); \ 211 197 INIT_LIST_HEAD(&(rq)->csd.list); \ 212 198 } while (0) 213 - 214 - /* 215 - * io context count accounting 216 - */ 217 - #define elv_ioc_count_mod(name, __val) this_cpu_add(name, __val) 218 - #define elv_ioc_count_inc(name) this_cpu_inc(name) 219 - #define elv_ioc_count_dec(name) this_cpu_dec(name) 220 - 221 - #define elv_ioc_count_read(name) \ 222 - ({ \ 223 - unsigned long __val = 0; \ 224 - int __cpu; \ 225 - smp_wmb(); \ 226 - for_each_possible_cpu(__cpu) \ 227 - __val += per_cpu(name, __cpu); \ 228 - __val; \ 229 - }) 230 199 231 200 #endif /* CONFIG_BLOCK */ 232 201 #endif

+1

include/linux/fs.h

··· 319 319 #define BLKPBSZGET _IO(0x12,123) 320 320 #define BLKDISCARDZEROES _IO(0x12,124) 321 321 #define BLKSECDISCARD _IO(0x12,125) 322 + #define BLKROTATIONAL _IO(0x12,126) 322 323 323 324 #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ 324 325 #define FIBMAP _IO(0x00,1) /* bmap access */

+94 -40

include/linux/iocontext.h

··· 3 3 4 4 #include <linux/radix-tree.h> 5 5 #include <linux/rcupdate.h> 6 + #include <linux/workqueue.h> 6 7 7 - struct cfq_queue; 8 - struct cfq_ttime { 9 - unsigned long last_end_request; 10 - 11 - unsigned long ttime_total; 12 - unsigned long ttime_samples; 13 - unsigned long ttime_mean; 8 + enum { 9 + ICQ_IOPRIO_CHANGED, 10 + ICQ_CGROUP_CHANGED, 14 11 }; 15 12 16 - struct cfq_io_context { 17 - void *key; 13 + /* 14 + * An io_cq (icq) is an association between an io_context (ioc) and a 15 + * request_queue (q). This is used by elevators which need to track 16 + * information per ioc - q pair. 17 + * 18 + * Elevator can request use of icq by setting elevator_type->icq_size and 19 + * ->icq_align. Both size and align must be at least as large as that of struct 20 + * io_cq and elevator can use the tail area for private information. The 21 + * recommended way to do this is defining a struct which contains io_cq as 22 + * the first member followed by private members and using its size and 23 + * align. For example, 24 + * 25 + * struct snail_io_cq { 26 + * struct io_cq icq; 27 + * int poke_snail; 28 + * int feed_snail; 29 + * }; 30 + * 31 + * struct elevator_type snail_elv_type { 32 + * .ops = { ... }, 33 + * .icq_size = sizeof(struct snail_io_cq), 34 + * .icq_align = __alignof__(struct snail_io_cq), 35 + * ... 36 + * }; 37 + * 38 + * If icq_size is set, block core will manage icq's. Each request will 39 + * have its ->elv.icq field set before elevator_ops->elevator_set_req_fn() 40 + * is called and be holding a reference to the associated io_context. 41 + * 42 + * Whenever a new icq is created, elevator_ops->elevator_init_icq_fn() is 43 + * called and, on destruction, ->elevator_exit_icq_fn(). Both functions 44 + * are called with both the associated io_context and queue locks held. 45 + * 46 + * Elevator is allowed to look up icq using ioc_lookup_icq() while holding 47 + * queue lock but the returned icq is valid only until the queue lock is 48 + * released. 
Elevators cannot and should not try to create or destroy 49 + * icq's. 50 + * 51 + * As icq's are linked from both ioc and q, the locking rules are a bit 52 + * complex. 53 + * 54 + * - ioc lock nests inside q lock. 55 + * 56 + * - ioc->icq_list and icq->ioc_node are protected by ioc lock. 57 + * q->icq_list and icq->q_node by q lock. 58 + * 59 + * - ioc->icq_tree and ioc->icq_hint are protected by ioc lock, while icq 60 + * itself is protected by q lock. However, both the indexes and icq 61 + * itself are also RCU managed and lookup can be performed holding only 62 + * the q lock. 63 + * 64 + * - icq's are not reference counted. They are destroyed when either the 65 + * ioc or q goes away. Each request with icq set holds an extra 66 + * reference to ioc to ensure it stays until the request is completed. 67 + * 68 + * - Linking and unlinking icq's are performed while holding both ioc and q 69 + * locks. Due to the lock ordering, q exit is simple but ioc exit 70 + * requires a reverse-order double lock dance. 71 + */ 72 + struct io_cq { 73 + struct request_queue *q; 74 + struct io_context *ioc; 18 75 19 - struct cfq_queue *cfqq[2]; 76 + /* 77 + * q_node and ioc_node link io_cq through icq_list of q and ioc 78 + * respectively. Both fields are unused once ioc_exit_icq() is 79 + * called and shared with __rcu_icq_cache and __rcu_head which are 80 + * used for RCU free of io_cq. 
81 + */ 82 + union { 83 + struct list_head q_node; 84 + struct kmem_cache *__rcu_icq_cache; 85 + }; 86 + union { 87 + struct hlist_node ioc_node; 88 + struct rcu_head __rcu_head; 89 + }; 20 90 21 - struct io_context *ioc; 22 - 23 - struct cfq_ttime ttime; 24 - 25 - struct list_head queue_list; 26 - struct hlist_node cic_list; 27 - 28 - void (*dtor)(struct io_context *); /* destructor */ 29 - void (*exit)(struct io_context *); /* called on task exit */ 30 - 31 - struct rcu_head rcu_head; 91 + unsigned long changed; 32 92 }; 33 93 34 94 /* ··· 103 43 spinlock_t lock; 104 44 105 45 unsigned short ioprio; 106 - unsigned short ioprio_changed; 107 - 108 - #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) 109 - unsigned short cgroup_changed; 110 - #endif 111 46 112 47 /* 113 48 * For request batching ··· 110 55 int nr_batch_requests; /* Number of requests left in the batch */ 111 56 unsigned long last_waited; /* Time last woken after wait for request */ 112 57 113 - struct radix_tree_root radix_root; 114 - struct hlist_head cic_list; 115 - void __rcu *ioc_data; 58 + struct radix_tree_root icq_tree; 59 + struct io_cq __rcu *icq_hint; 60 + struct hlist_head icq_list; 61 + 62 + struct work_struct release_work; 116 63 }; 117 64 118 65 static inline struct io_context *ioc_task_link(struct io_context *ioc) ··· 133 76 134 77 struct task_struct; 135 78 #ifdef CONFIG_BLOCK 136 - int put_io_context(struct io_context *ioc); 79 + void put_io_context(struct io_context *ioc, struct request_queue *locked_q); 137 80 void exit_io_context(struct task_struct *task); 138 - struct io_context *get_io_context(gfp_t gfp_flags, int node); 139 - struct io_context *alloc_io_context(gfp_t gfp_flags, int node); 81 + struct io_context *get_task_io_context(struct task_struct *task, 82 + gfp_t gfp_flags, int node); 83 + void ioc_ioprio_changed(struct io_context *ioc, int ioprio); 84 + void ioc_cgroup_changed(struct io_context *ioc); 140 85 #else 141 - static inline void 
exit_io_context(struct task_struct *task) 142 - { 143 - } 144 - 145 86 struct io_context; 146 - static inline int put_io_context(struct io_context *ioc) 147 - { 148 - return 1; 149 - } 87 + static inline void put_io_context(struct io_context *ioc, 88 + struct request_queue *locked_q) { } 89 + static inline void exit_io_context(struct task_struct *task) { } 150 90 #endif 151 91 152 92 #endif

+5 -3

kernel/fork.c

··· 873 873 { 874 874 #ifdef CONFIG_BLOCK 875 875 struct io_context *ioc = current->io_context; 876 + struct io_context *new_ioc; 876 877 877 878 if (!ioc) 878 879 return 0; ··· 885 884 if (unlikely(!tsk->io_context)) 886 885 return -ENOMEM; 887 886 } else if (ioprio_valid(ioc->ioprio)) { 888 - tsk->io_context = alloc_io_context(GFP_KERNEL, -1); 889 - if (unlikely(!tsk->io_context)) 887 + new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); 888 + if (unlikely(!new_ioc)) 890 889 return -ENOMEM; 891 890 892 - tsk->io_context->ioprio = ioc->ioprio; 891 + new_ioc->ioprio = ioc->ioprio; 892 + put_io_context(new_ioc, NULL); 893 893 } 894 894 #endif 895 895 return 0;