Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-3.6/core' of git://git.kernel.dk/linux-block

Pull core block IO bits from Jens Axboe:
"The most complicated part if this is the request allocation rework by
Tejun, which has been queued up for a long time and has been in
for-next ditto as well.

There are a few commits from yesterday and today, mostly trivial and
obvious fixes. So I'm pretty confident that it is sound. It's also
smaller than usual."

* 'for-3.6/core' of git://git.kernel.dk/linux-block:
block: remove dead func declaration
block: add partition resize function to blkpg ioctl
block: uninitialized ioc->nr_tasks triggers WARN_ON
block: do not artificially constrain max_sectors for stacking drivers
blkcg: implement per-blkg request allocation
block: prepare for multiple request_lists
block: add q->nr_rqs[] and move q->rq.elvpriv to q->nr_rqs_elvpriv
blkcg: inline bio_blkcg() and friends
block: allocate io_context upfront
block: refactor get_request[_wait]()
block: drop custom queue draining used by scsi_transport_{iscsi|fc}
mempool: add @gfp_mask to mempool_create_node()
blkcg: make root blkcg allocation use %GFP_KERNEL
blkcg: __blkg_lookup_create() doesn't need radix preload

+530 -301
+7
Documentation/block/queue-sysfs.txt
··· 38 38 this amount, since it applies only to reads or writes (not the accumulated 39 39 sum). 40 40 41 + To avoid priority inversion through request starvation, a request 42 + queue maintains a separate request pool per each cgroup when 43 + CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such 44 + per-block-cgroup request pool. IOW, if there are N block cgroups, 45 + each request queue may have upto N request pools, each independently 46 + regulated by nr_requests. 47 + 41 48 read_ahead_kb (RW) 42 49 ------------------ 43 50 Maximum number of kilobytes to read-ahead for filesystems on this block
+90 -49
block/blk-cgroup.c
··· 31 31 32 32 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; 33 33 34 - struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) 35 - { 36 - return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), 37 - struct blkcg, css); 38 - } 39 - EXPORT_SYMBOL_GPL(cgroup_to_blkcg); 40 - 41 - static struct blkcg *task_blkcg(struct task_struct *tsk) 42 - { 43 - return container_of(task_subsys_state(tsk, blkio_subsys_id), 44 - struct blkcg, css); 45 - } 46 - 47 - struct blkcg *bio_blkcg(struct bio *bio) 48 - { 49 - if (bio && bio->bi_css) 50 - return container_of(bio->bi_css, struct blkcg, css); 51 - return task_blkcg(current); 52 - } 53 - EXPORT_SYMBOL_GPL(bio_blkcg); 54 - 55 34 static bool blkcg_policy_enabled(struct request_queue *q, 56 35 const struct blkcg_policy *pol) 57 36 { ··· 63 84 kfree(pd); 64 85 } 65 86 87 + blk_exit_rl(&blkg->rl); 66 88 kfree(blkg); 67 89 } 68 90 ··· 71 91 * blkg_alloc - allocate a blkg 72 92 * @blkcg: block cgroup the new blkg is associated with 73 93 * @q: request_queue the new blkg is associated with 94 + * @gfp_mask: allocation mask to use 74 95 * 75 96 * Allocate a new blkg assocating @blkcg and @q. 
76 97 */ 77 - static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q) 98 + static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, 99 + gfp_t gfp_mask) 78 100 { 79 101 struct blkcg_gq *blkg; 80 102 int i; 81 103 82 104 /* alloc and init base part */ 83 - blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node); 105 + blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); 84 106 if (!blkg) 85 107 return NULL; 86 108 ··· 90 108 INIT_LIST_HEAD(&blkg->q_node); 91 109 blkg->blkcg = blkcg; 92 110 blkg->refcnt = 1; 111 + 112 + /* root blkg uses @q->root_rl, init rl only for !root blkgs */ 113 + if (blkcg != &blkcg_root) { 114 + if (blk_init_rl(&blkg->rl, q, gfp_mask)) 115 + goto err_free; 116 + blkg->rl.blkg = blkg; 117 + } 93 118 94 119 for (i = 0; i < BLKCG_MAX_POLS; i++) { 95 120 struct blkcg_policy *pol = blkcg_policy[i]; ··· 106 117 continue; 107 118 108 119 /* alloc per-policy data and attach it to blkg */ 109 - pd = kzalloc_node(pol->pd_size, GFP_ATOMIC, q->node); 110 - if (!pd) { 111 - blkg_free(blkg); 112 - return NULL; 113 - } 120 + pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); 121 + if (!pd) 122 + goto err_free; 114 123 115 124 blkg->pd[i] = pd; 116 125 pd->blkg = blkg; ··· 119 132 } 120 133 121 134 return blkg; 135 + 136 + err_free: 137 + blkg_free(blkg); 138 + return NULL; 122 139 } 123 140 124 141 static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, ··· 166 175 } 167 176 EXPORT_SYMBOL_GPL(blkg_lookup); 168 177 178 + /* 179 + * If @new_blkg is %NULL, this function tries to allocate a new one as 180 + * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. 
181 + */ 169 182 static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, 170 - struct request_queue *q) 171 - __releases(q->queue_lock) __acquires(q->queue_lock) 183 + struct request_queue *q, 184 + struct blkcg_gq *new_blkg) 172 185 { 173 186 struct blkcg_gq *blkg; 174 187 int ret; ··· 184 189 blkg = __blkg_lookup(blkcg, q); 185 190 if (blkg) { 186 191 rcu_assign_pointer(blkcg->blkg_hint, blkg); 187 - return blkg; 192 + goto out_free; 188 193 } 189 194 190 195 /* blkg holds a reference to blkcg */ 191 - if (!css_tryget(&blkcg->css)) 192 - return ERR_PTR(-EINVAL); 196 + if (!css_tryget(&blkcg->css)) { 197 + blkg = ERR_PTR(-EINVAL); 198 + goto out_free; 199 + } 193 200 194 201 /* allocate */ 195 - ret = -ENOMEM; 196 - blkg = blkg_alloc(blkcg, q); 197 - if (unlikely(!blkg)) 198 - goto err_put; 202 + if (!new_blkg) { 203 + new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); 204 + if (unlikely(!new_blkg)) { 205 + blkg = ERR_PTR(-ENOMEM); 206 + goto out_put; 207 + } 208 + } 209 + blkg = new_blkg; 199 210 200 211 /* insert */ 201 - ret = radix_tree_preload(GFP_ATOMIC); 202 - if (ret) 203 - goto err_free; 204 - 205 212 spin_lock(&blkcg->lock); 206 213 ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); 207 214 if (likely(!ret)) { ··· 212 215 } 213 216 spin_unlock(&blkcg->lock); 214 217 215 - radix_tree_preload_end(); 216 - 217 218 if (!ret) 218 219 return blkg; 219 - err_free: 220 - blkg_free(blkg); 221 - err_put: 220 + 221 + blkg = ERR_PTR(ret); 222 + out_put: 222 223 css_put(&blkcg->css); 223 - return ERR_PTR(ret); 224 + out_free: 225 + blkg_free(new_blkg); 226 + return blkg; 224 227 } 225 228 226 229 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, ··· 232 235 */ 233 236 if (unlikely(blk_queue_bypass(q))) 234 237 return ERR_PTR(blk_queue_dead(q) ? 
-EINVAL : -EBUSY); 235 - return __blkg_lookup_create(blkcg, q); 238 + return __blkg_lookup_create(blkcg, q, NULL); 236 239 } 237 240 EXPORT_SYMBOL_GPL(blkg_lookup_create); 238 241 ··· 309 312 call_rcu(&blkg->rcu_head, blkg_rcu_free); 310 313 } 311 314 EXPORT_SYMBOL_GPL(__blkg_release); 315 + 316 + /* 317 + * The next function used by blk_queue_for_each_rl(). It's a bit tricky 318 + * because the root blkg uses @q->root_rl instead of its own rl. 319 + */ 320 + struct request_list *__blk_queue_next_rl(struct request_list *rl, 321 + struct request_queue *q) 322 + { 323 + struct list_head *ent; 324 + struct blkcg_gq *blkg; 325 + 326 + /* 327 + * Determine the current blkg list_head. The first entry is 328 + * root_rl which is off @q->blkg_list and mapped to the head. 329 + */ 330 + if (rl == &q->root_rl) { 331 + ent = &q->blkg_list; 332 + } else { 333 + blkg = container_of(rl, struct blkcg_gq, rl); 334 + ent = &blkg->q_node; 335 + } 336 + 337 + /* walk to the next list_head, skip root blkcg */ 338 + ent = ent->next; 339 + if (ent == &q->root_blkg->q_node) 340 + ent = ent->next; 341 + if (ent == &q->blkg_list) 342 + return NULL; 343 + 344 + blkg = container_of(ent, struct blkcg_gq, q_node); 345 + return &blkg->rl; 346 + } 312 347 313 348 static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, 314 349 u64 val) ··· 763 734 struct blkcg_gq *blkg; 764 735 struct blkg_policy_data *pd, *n; 765 736 int cnt = 0, ret; 737 + bool preloaded; 766 738 767 739 if (blkcg_policy_enabled(q, pol)) 768 740 return 0; 741 + 742 + /* preallocations for root blkg */ 743 + blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); 744 + if (!blkg) 745 + return -ENOMEM; 746 + 747 + preloaded = !radix_tree_preload(GFP_KERNEL); 769 748 770 749 blk_queue_bypass_start(q); 771 750 ··· 781 744 spin_lock_irq(q->queue_lock); 782 745 783 746 rcu_read_lock(); 784 - blkg = __blkg_lookup_create(&blkcg_root, q); 747 + blkg = __blkg_lookup_create(&blkcg_root, q, blkg); 785 748 rcu_read_unlock(); 749 + 
750 + if (preloaded) 751 + radix_tree_preload_end(); 786 752 787 753 if (IS_ERR(blkg)) { 788 754 ret = PTR_ERR(blkg); 789 755 goto out_unlock; 790 756 } 791 757 q->root_blkg = blkg; 758 + q->root_rl.blkg = blkg; 792 759 793 760 list_for_each_entry(blkg, &q->blkg_list, q_node) 794 761 cnt++;
+124 -4
block/blk-cgroup.h
··· 17 17 #include <linux/u64_stats_sync.h> 18 18 #include <linux/seq_file.h> 19 19 #include <linux/radix-tree.h> 20 + #include <linux/blkdev.h> 20 21 21 22 /* Max limits for throttle policy */ 22 23 #define THROTL_IOPS_MAX UINT_MAX ··· 94 93 struct list_head q_node; 95 94 struct hlist_node blkcg_node; 96 95 struct blkcg *blkcg; 96 + /* request allocation list for this blkcg-q pair */ 97 + struct request_list rl; 97 98 /* reference count */ 98 99 int refcnt; 99 100 ··· 123 120 124 121 extern struct blkcg blkcg_root; 125 122 126 - struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup); 127 - struct blkcg *bio_blkcg(struct bio *bio); 128 123 struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); 129 124 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, 130 125 struct request_queue *q); ··· 160 159 const char *input, struct blkg_conf_ctx *ctx); 161 160 void blkg_conf_finish(struct blkg_conf_ctx *ctx); 162 161 162 + 163 + static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) 164 + { 165 + return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), 166 + struct blkcg, css); 167 + } 168 + 169 + static inline struct blkcg *task_blkcg(struct task_struct *tsk) 170 + { 171 + return container_of(task_subsys_state(tsk, blkio_subsys_id), 172 + struct blkcg, css); 173 + } 174 + 175 + static inline struct blkcg *bio_blkcg(struct bio *bio) 176 + { 177 + if (bio && bio->bi_css) 178 + return container_of(bio->bi_css, struct blkcg, css); 179 + return task_blkcg(current); 180 + } 163 181 164 182 /** 165 183 * blkg_to_pdata - get policy private data ··· 252 232 if (!--blkg->refcnt) 253 233 __blkg_release(blkg); 254 234 } 235 + 236 + /** 237 + * blk_get_rl - get request_list to use 238 + * @q: request_queue of interest 239 + * @bio: bio which will be attached to the allocated request (may be %NULL) 240 + * 241 + * The caller wants to allocate a request from @q to use for @bio. Find 242 + * the request_list to use and obtain a reference on it. 
Should be called 243 + * under queue_lock. This function is guaranteed to return non-%NULL 244 + * request_list. 245 + */ 246 + static inline struct request_list *blk_get_rl(struct request_queue *q, 247 + struct bio *bio) 248 + { 249 + struct blkcg *blkcg; 250 + struct blkcg_gq *blkg; 251 + 252 + rcu_read_lock(); 253 + 254 + blkcg = bio_blkcg(bio); 255 + 256 + /* bypass blkg lookup and use @q->root_rl directly for root */ 257 + if (blkcg == &blkcg_root) 258 + goto root_rl; 259 + 260 + /* 261 + * Try to use blkg->rl. blkg lookup may fail under memory pressure 262 + * or if either the blkcg or queue is going away. Fall back to 263 + * root_rl in such cases. 264 + */ 265 + blkg = blkg_lookup_create(blkcg, q); 266 + if (unlikely(IS_ERR(blkg))) 267 + goto root_rl; 268 + 269 + blkg_get(blkg); 270 + rcu_read_unlock(); 271 + return &blkg->rl; 272 + root_rl: 273 + rcu_read_unlock(); 274 + return &q->root_rl; 275 + } 276 + 277 + /** 278 + * blk_put_rl - put request_list 279 + * @rl: request_list to put 280 + * 281 + * Put the reference acquired by blk_get_rl(). Should be called under 282 + * queue_lock. 283 + */ 284 + static inline void blk_put_rl(struct request_list *rl) 285 + { 286 + /* root_rl may not have blkg set */ 287 + if (rl->blkg && rl->blkg->blkcg != &blkcg_root) 288 + blkg_put(rl->blkg); 289 + } 290 + 291 + /** 292 + * blk_rq_set_rl - associate a request with a request_list 293 + * @rq: request of interest 294 + * @rl: target request_list 295 + * 296 + * Associate @rq with @rl so that accounting and freeing can know the 297 + * request_list @rq came from. 298 + */ 299 + static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) 300 + { 301 + rq->rl = rl; 302 + } 303 + 304 + /** 305 + * blk_rq_rl - return the request_list a request came from 306 + * @rq: request of interest 307 + * 308 + * Return the request_list @rq is allocated from. 
309 + */ 310 + static inline struct request_list *blk_rq_rl(struct request *rq) 311 + { 312 + return rq->rl; 313 + } 314 + 315 + struct request_list *__blk_queue_next_rl(struct request_list *rl, 316 + struct request_queue *q); 317 + /** 318 + * blk_queue_for_each_rl - iterate through all request_lists of a request_queue 319 + * 320 + * Should be used under queue_lock. 321 + */ 322 + #define blk_queue_for_each_rl(rl, q) \ 323 + for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) 255 324 256 325 /** 257 326 * blkg_stat_add - add a value to a blkg_stat ··· 460 351 #else /* CONFIG_BLK_CGROUP */ 461 352 462 353 struct cgroup; 354 + struct blkcg; 463 355 464 356 struct blkg_policy_data { 465 357 }; ··· 471 361 struct blkcg_policy { 472 362 }; 473 363 474 - static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } 475 - static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } 476 364 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } 477 365 static inline int blkcg_init_queue(struct request_queue *q) { return 0; } 478 366 static inline void blkcg_drain_queue(struct request_queue *q) { } ··· 482 374 static inline void blkcg_deactivate_policy(struct request_queue *q, 483 375 const struct blkcg_policy *pol) { } 484 376 377 + static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } 378 + static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } 379 + 485 380 static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, 486 381 struct blkcg_policy *pol) { return NULL; } 487 382 static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } 488 383 static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } 489 384 static inline void blkg_get(struct blkcg_gq *blkg) { } 490 385 static inline void blkg_put(struct blkcg_gq *blkg) { } 386 + 387 + static inline struct request_list 
*blk_get_rl(struct request_queue *q, 388 + struct bio *bio) { return &q->root_rl; } 389 + static inline void blk_put_rl(struct request_list *rl) { } 390 + static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } 391 + static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } 392 + 393 + #define blk_queue_for_each_rl(rl, q) \ 394 + for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) 491 395 492 396 #endif /* CONFIG_BLK_CGROUP */ 493 397 #endif /* _BLK_CGROUP_H */
+111 -98
block/blk-core.c
··· 387 387 if (!list_empty(&q->queue_head) && q->request_fn) 388 388 __blk_run_queue(q); 389 389 390 - drain |= q->rq.elvpriv; 390 + drain |= q->nr_rqs_elvpriv; 391 391 392 392 /* 393 393 * Unfortunately, requests are queued at and tracked from ··· 397 397 if (drain_all) { 398 398 drain |= !list_empty(&q->queue_head); 399 399 for (i = 0; i < 2; i++) { 400 - drain |= q->rq.count[i]; 400 + drain |= q->nr_rqs[i]; 401 401 drain |= q->in_flight[i]; 402 402 drain |= !list_empty(&q->flush_queue[i]); 403 403 } ··· 416 416 * left with hung waiters. We need to wake up those waiters. 417 417 */ 418 418 if (q->request_fn) { 419 + struct request_list *rl; 420 + 419 421 spin_lock_irq(q->queue_lock); 420 - for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++) 421 - wake_up_all(&q->rq.wait[i]); 422 + 423 + blk_queue_for_each_rl(rl, q) 424 + for (i = 0; i < ARRAY_SIZE(rl->wait); i++) 425 + wake_up_all(&rl->wait[i]); 426 + 422 427 spin_unlock_irq(q->queue_lock); 423 428 } 424 429 } ··· 522 517 } 523 518 EXPORT_SYMBOL(blk_cleanup_queue); 524 519 525 - static int blk_init_free_list(struct request_queue *q) 520 + int blk_init_rl(struct request_list *rl, struct request_queue *q, 521 + gfp_t gfp_mask) 526 522 { 527 - struct request_list *rl = &q->rq; 528 - 529 523 if (unlikely(rl->rq_pool)) 530 524 return 0; 531 525 526 + rl->q = q; 532 527 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; 533 528 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; 534 - rl->elvpriv = 0; 535 529 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); 536 530 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); 537 531 538 532 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 539 - mempool_free_slab, request_cachep, q->node); 540 - 533 + mempool_free_slab, request_cachep, 534 + gfp_mask, q->node); 541 535 if (!rl->rq_pool) 542 536 return -ENOMEM; 543 537 544 538 return 0; 539 + } 540 + 541 + void blk_exit_rl(struct request_list *rl) 542 + { 543 + if (rl->rq_pool) 544 + mempool_destroy(rl->rq_pool); 545 
545 } 546 546 547 547 struct request_queue *blk_alloc_queue(gfp_t gfp_mask) ··· 690 680 if (!q) 691 681 return NULL; 692 682 693 - if (blk_init_free_list(q)) 683 + if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) 694 684 return NULL; 695 685 696 686 q->request_fn = rfn; ··· 732 722 } 733 723 EXPORT_SYMBOL(blk_get_queue); 734 724 735 - static inline void blk_free_request(struct request_queue *q, struct request *rq) 725 + static inline void blk_free_request(struct request_list *rl, struct request *rq) 736 726 { 737 727 if (rq->cmd_flags & REQ_ELVPRIV) { 738 - elv_put_request(q, rq); 728 + elv_put_request(rl->q, rq); 739 729 if (rq->elv.icq) 740 730 put_io_context(rq->elv.icq->ioc); 741 731 } 742 732 743 - mempool_free(rq, q->rq.rq_pool); 733 + mempool_free(rq, rl->rq_pool); 744 734 } 745 735 746 736 /* ··· 777 767 ioc->last_waited = jiffies; 778 768 } 779 769 780 - static void __freed_request(struct request_queue *q, int sync) 770 + static void __freed_request(struct request_list *rl, int sync) 781 771 { 782 - struct request_list *rl = &q->rq; 772 + struct request_queue *q = rl->q; 783 773 784 - if (rl->count[sync] < queue_congestion_off_threshold(q)) 774 + /* 775 + * bdi isn't aware of blkcg yet. As all async IOs end up root 776 + * blkcg anyway, just use root blkcg state. 777 + */ 778 + if (rl == &q->root_rl && 779 + rl->count[sync] < queue_congestion_off_threshold(q)) 785 780 blk_clear_queue_congested(q, sync); 786 781 787 782 if (rl->count[sync] + 1 <= q->nr_requests) { 788 783 if (waitqueue_active(&rl->wait[sync])) 789 784 wake_up(&rl->wait[sync]); 790 785 791 - blk_clear_queue_full(q, sync); 786 + blk_clear_rl_full(rl, sync); 792 787 } 793 788 } 794 789 ··· 801 786 * A request has just been released. Account for it, update the full and 802 787 * congestion status, wake up any waiters. Called under q->queue_lock. 
803 788 */ 804 - static void freed_request(struct request_queue *q, unsigned int flags) 789 + static void freed_request(struct request_list *rl, unsigned int flags) 805 790 { 806 - struct request_list *rl = &q->rq; 791 + struct request_queue *q = rl->q; 807 792 int sync = rw_is_sync(flags); 808 793 794 + q->nr_rqs[sync]--; 809 795 rl->count[sync]--; 810 796 if (flags & REQ_ELVPRIV) 811 - rl->elvpriv--; 797 + q->nr_rqs_elvpriv--; 812 798 813 - __freed_request(q, sync); 799 + __freed_request(rl, sync); 814 800 815 801 if (unlikely(rl->starved[sync ^ 1])) 816 - __freed_request(q, sync ^ 1); 802 + __freed_request(rl, sync ^ 1); 817 803 } 818 804 819 805 /* ··· 853 837 } 854 838 855 839 /** 856 - * get_request - get a free request 857 - * @q: request_queue to allocate request from 840 + * __get_request - get a free request 841 + * @rl: request list to allocate from 858 842 * @rw_flags: RW and SYNC flags 859 843 * @bio: bio to allocate request for (can be %NULL) 860 844 * @gfp_mask: allocation mask ··· 866 850 * Returns %NULL on failure, with @q->queue_lock held. 867 851 * Returns !%NULL on success, with @q->queue_lock *not held*. 
868 852 */ 869 - static struct request *get_request(struct request_queue *q, int rw_flags, 870 - struct bio *bio, gfp_t gfp_mask) 853 + static struct request *__get_request(struct request_list *rl, int rw_flags, 854 + struct bio *bio, gfp_t gfp_mask) 871 855 { 856 + struct request_queue *q = rl->q; 872 857 struct request *rq; 873 - struct request_list *rl = &q->rq; 874 - struct elevator_type *et; 875 - struct io_context *ioc; 858 + struct elevator_type *et = q->elevator->type; 859 + struct io_context *ioc = rq_ioc(bio); 876 860 struct io_cq *icq = NULL; 877 861 const bool is_sync = rw_is_sync(rw_flags) != 0; 878 - bool retried = false; 879 862 int may_queue; 880 - retry: 881 - et = q->elevator->type; 882 - ioc = rq_ioc(bio); 883 863 884 864 if (unlikely(blk_queue_dead(q))) 885 865 return NULL; ··· 887 875 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { 888 876 if (rl->count[is_sync]+1 >= q->nr_requests) { 889 877 /* 890 - * We want ioc to record batching state. If it's 891 - * not already there, creating a new one requires 892 - * dropping queue_lock, which in turn requires 893 - * retesting conditions to avoid queue hang. 894 - */ 895 - if (!ioc && !retried) { 896 - spin_unlock_irq(q->queue_lock); 897 - create_io_context(gfp_mask, q->node); 898 - spin_lock_irq(q->queue_lock); 899 - retried = true; 900 - goto retry; 901 - } 902 - 903 - /* 904 878 * The queue will fill after this allocation, so set 905 879 * it as full, and mark this process as "batching". 906 880 * This process will be allowed to complete a batch of 907 881 * requests, others will be blocked. 
908 882 */ 909 - if (!blk_queue_full(q, is_sync)) { 883 + if (!blk_rl_full(rl, is_sync)) { 910 884 ioc_set_batching(q, ioc); 911 - blk_set_queue_full(q, is_sync); 885 + blk_set_rl_full(rl, is_sync); 912 886 } else { 913 887 if (may_queue != ELV_MQUEUE_MUST 914 888 && !ioc_batching(q, ioc)) { ··· 907 909 } 908 910 } 909 911 } 910 - blk_set_queue_congested(q, is_sync); 912 + /* 913 + * bdi isn't aware of blkcg yet. As all async IOs end up 914 + * root blkcg anyway, just use root blkcg state. 915 + */ 916 + if (rl == &q->root_rl) 917 + blk_set_queue_congested(q, is_sync); 911 918 } 912 919 913 920 /* ··· 923 920 if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) 924 921 return NULL; 925 922 923 + q->nr_rqs[is_sync]++; 926 924 rl->count[is_sync]++; 927 925 rl->starved[is_sync] = 0; 928 926 ··· 939 935 */ 940 936 if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { 941 937 rw_flags |= REQ_ELVPRIV; 942 - rl->elvpriv++; 938 + q->nr_rqs_elvpriv++; 943 939 if (et->icq_cache && ioc) 944 940 icq = ioc_lookup_icq(ioc, q); 945 941 } ··· 949 945 spin_unlock_irq(q->queue_lock); 950 946 951 947 /* allocate and init request */ 952 - rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 948 + rq = mempool_alloc(rl->rq_pool, gfp_mask); 953 949 if (!rq) 954 950 goto fail_alloc; 955 951 956 952 blk_rq_init(q, rq); 953 + blk_rq_set_rl(rq, rl); 957 954 rq->cmd_flags = rw_flags | REQ_ALLOCED; 958 955 959 956 /* init elvpriv */ 960 957 if (rw_flags & REQ_ELVPRIV) { 961 958 if (unlikely(et->icq_cache && !icq)) { 962 - create_io_context(gfp_mask, q->node); 963 - ioc = rq_ioc(bio); 964 - if (!ioc) 965 - goto fail_elvpriv; 966 - 967 - icq = ioc_create_icq(ioc, q, gfp_mask); 959 + if (ioc) 960 + icq = ioc_create_icq(ioc, q, gfp_mask); 968 961 if (!icq) 969 962 goto fail_elvpriv; 970 963 } ··· 1001 1000 rq->elv.icq = NULL; 1002 1001 1003 1002 spin_lock_irq(q->queue_lock); 1004 - rl->elvpriv--; 1003 + q->nr_rqs_elvpriv--; 1005 1004 spin_unlock_irq(q->queue_lock); 1006 1005 goto out; 1007 1006 
··· 1014 1013 * queue, but this is pretty rare. 1015 1014 */ 1016 1015 spin_lock_irq(q->queue_lock); 1017 - freed_request(q, rw_flags); 1016 + freed_request(rl, rw_flags); 1018 1017 1019 1018 /* 1020 1019 * in the very unlikely event that allocation failed and no ··· 1030 1029 } 1031 1030 1032 1031 /** 1033 - * get_request_wait - get a free request with retry 1032 + * get_request - get a free request 1034 1033 * @q: request_queue to allocate request from 1035 1034 * @rw_flags: RW and SYNC flags 1036 1035 * @bio: bio to allocate request for (can be %NULL) 1036 + * @gfp_mask: allocation mask 1037 1037 * 1038 - * Get a free request from @q. This function keeps retrying under memory 1039 - * pressure and fails iff @q is dead. 1038 + * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this 1039 + * function keeps retrying under memory pressure and fails iff @q is dead. 1040 1040 * 1041 1041 * Must be callled with @q->queue_lock held and, 1042 1042 * Returns %NULL on failure, with @q->queue_lock held. 1043 1043 * Returns !%NULL on success, with @q->queue_lock *not held*. 
1044 1044 */ 1045 - static struct request *get_request_wait(struct request_queue *q, int rw_flags, 1046 - struct bio *bio) 1045 + static struct request *get_request(struct request_queue *q, int rw_flags, 1046 + struct bio *bio, gfp_t gfp_mask) 1047 1047 { 1048 1048 const bool is_sync = rw_is_sync(rw_flags) != 0; 1049 + DEFINE_WAIT(wait); 1050 + struct request_list *rl; 1049 1051 struct request *rq; 1050 1052 1051 - rq = get_request(q, rw_flags, bio, GFP_NOIO); 1052 - while (!rq) { 1053 - DEFINE_WAIT(wait); 1054 - struct request_list *rl = &q->rq; 1053 + rl = blk_get_rl(q, bio); /* transferred to @rq on success */ 1054 + retry: 1055 + rq = __get_request(rl, rw_flags, bio, gfp_mask); 1056 + if (rq) 1057 + return rq; 1055 1058 1056 - if (unlikely(blk_queue_dead(q))) 1057 - return NULL; 1059 + if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) { 1060 + blk_put_rl(rl); 1061 + return NULL; 1062 + } 1058 1063 1059 - prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, 1060 - TASK_UNINTERRUPTIBLE); 1064 + /* wait on @rl and retry */ 1065 + prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, 1066 + TASK_UNINTERRUPTIBLE); 1061 1067 1062 - trace_block_sleeprq(q, bio, rw_flags & 1); 1068 + trace_block_sleeprq(q, bio, rw_flags & 1); 1063 1069 1064 - spin_unlock_irq(q->queue_lock); 1065 - io_schedule(); 1070 + spin_unlock_irq(q->queue_lock); 1071 + io_schedule(); 1066 1072 1067 - /* 1068 - * After sleeping, we become a "batching" process and 1069 - * will be able to allocate at least one request, and 1070 - * up to a big batch of them for a small period time. 1071 - * See ioc_batching, ioc_set_batching 1072 - */ 1073 - create_io_context(GFP_NOIO, q->node); 1074 - ioc_set_batching(q, current->io_context); 1073 + /* 1074 + * After sleeping, we become a "batching" process and will be able 1075 + * to allocate at least one request, and up to a big batch of them 1076 + * for a small period time. 
See ioc_batching, ioc_set_batching 1077 + */ 1078 + ioc_set_batching(q, current->io_context); 1075 1079 1076 - spin_lock_irq(q->queue_lock); 1077 - finish_wait(&rl->wait[is_sync], &wait); 1080 + spin_lock_irq(q->queue_lock); 1081 + finish_wait(&rl->wait[is_sync], &wait); 1078 1082 1079 - rq = get_request(q, rw_flags, bio, GFP_NOIO); 1080 - }; 1081 - 1082 - return rq; 1083 + goto retry; 1083 1084 } 1084 1085 1085 1086 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) ··· 1090 1087 1091 1088 BUG_ON(rw != READ && rw != WRITE); 1092 1089 1090 + /* create ioc upfront */ 1091 + create_io_context(gfp_mask, q->node); 1092 + 1093 1093 spin_lock_irq(q->queue_lock); 1094 - if (gfp_mask & __GFP_WAIT) 1095 - rq = get_request_wait(q, rw, NULL); 1096 - else 1097 - rq = get_request(q, rw, NULL, gfp_mask); 1094 + rq = get_request(q, rw, NULL, gfp_mask); 1098 1095 if (!rq) 1099 1096 spin_unlock_irq(q->queue_lock); 1100 1097 /* q->queue_lock is unlocked at this point */ ··· 1251 1248 */ 1252 1249 if (req->cmd_flags & REQ_ALLOCED) { 1253 1250 unsigned int flags = req->cmd_flags; 1251 + struct request_list *rl = blk_rq_rl(req); 1254 1252 1255 1253 BUG_ON(!list_empty(&req->queuelist)); 1256 1254 BUG_ON(!hlist_unhashed(&req->hash)); 1257 1255 1258 - blk_free_request(q, req); 1259 - freed_request(q, flags); 1256 + blk_free_request(rl, req); 1257 + freed_request(rl, flags); 1258 + blk_put_rl(rl); 1260 1259 } 1261 1260 } 1262 1261 EXPORT_SYMBOL_GPL(__blk_put_request); ··· 1486 1481 * Grab a free request. This is might sleep but can not fail. 1487 1482 * Returns with the queue unlocked. 
1488 1483 */ 1489 - req = get_request_wait(q, rw_flags, bio); 1484 + req = get_request(q, rw_flags, bio, GFP_NOIO); 1490 1485 if (unlikely(!req)) { 1491 1486 bio_endio(bio, -ENODEV); /* @q is dead */ 1492 1487 goto out_unlock; ··· 1706 1701 err = -EOPNOTSUPP; 1707 1702 goto end_io; 1708 1703 } 1704 + 1705 + /* 1706 + * Various block parts want %current->io_context and lazy ioc 1707 + * allocation ends up trading a lot of pain for a small amount of 1708 + * memory. Just allocate it upfront. This may fail and block 1709 + * layer knows how to live with it. 1710 + */ 1711 + create_io_context(GFP_ATOMIC, q->node); 1709 1712 1710 1713 if (blk_throtl_bio(q, bio)) 1711 1714 return false; /* throttled, will be resubmitted later */
+1
block/blk-ioc.c
··· 244 244 245 245 /* initialize */ 246 246 atomic_long_set(&ioc->refcount, 1); 247 + atomic_set(&ioc->nr_tasks, 1); 247 248 atomic_set(&ioc->active_ref, 1); 248 249 spin_lock_init(&ioc->lock); 249 250 INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
+1 -2
block/blk-settings.c
··· 143 143 lim->discard_zeroes_data = 1; 144 144 lim->max_segments = USHRT_MAX; 145 145 lim->max_hw_sectors = UINT_MAX; 146 - 147 - lim->max_sectors = BLK_DEF_MAX_SECTORS; 146 + lim->max_sectors = UINT_MAX; 148 147 } 149 148 EXPORT_SYMBOL(blk_set_stacking_limits); 150 149
+19 -15
block/blk-sysfs.c
··· 40 40 static ssize_t 41 41 queue_requests_store(struct request_queue *q, const char *page, size_t count) 42 42 { 43 - struct request_list *rl = &q->rq; 43 + struct request_list *rl; 44 44 unsigned long nr; 45 45 int ret; 46 46 ··· 55 55 q->nr_requests = nr; 56 56 blk_queue_congestion_threshold(q); 57 57 58 + /* congestion isn't cgroup aware and follows root blkcg for now */ 59 + rl = &q->root_rl; 60 + 58 61 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) 59 62 blk_set_queue_congested(q, BLK_RW_SYNC); 60 63 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) ··· 68 65 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) 69 66 blk_clear_queue_congested(q, BLK_RW_ASYNC); 70 67 71 - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { 72 - blk_set_queue_full(q, BLK_RW_SYNC); 73 - } else { 74 - blk_clear_queue_full(q, BLK_RW_SYNC); 75 - wake_up(&rl->wait[BLK_RW_SYNC]); 68 + blk_queue_for_each_rl(rl, q) { 69 + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { 70 + blk_set_rl_full(rl, BLK_RW_SYNC); 71 + } else { 72 + blk_clear_rl_full(rl, BLK_RW_SYNC); 73 + wake_up(&rl->wait[BLK_RW_SYNC]); 74 + } 75 + 76 + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { 77 + blk_set_rl_full(rl, BLK_RW_ASYNC); 78 + } else { 79 + blk_clear_rl_full(rl, BLK_RW_ASYNC); 80 + wake_up(&rl->wait[BLK_RW_ASYNC]); 81 + } 76 82 } 77 83 78 - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { 79 - blk_set_queue_full(q, BLK_RW_ASYNC); 80 - } else { 81 - blk_clear_queue_full(q, BLK_RW_ASYNC); 82 - wake_up(&rl->wait[BLK_RW_ASYNC]); 83 - } 84 84 spin_unlock_irq(q->queue_lock); 85 85 return ret; 86 86 } ··· 482 476 { 483 477 struct request_queue *q = 484 478 container_of(kobj, struct request_queue, kobj); 485 - struct request_list *rl = &q->rq; 486 479 487 480 blk_sync_queue(q); 488 481 ··· 494 489 elevator_exit(q->elevator); 495 490 } 496 491 497 - if (rl->rq_pool) 498 - mempool_destroy(rl->rq_pool); 492 + blk_exit_rl(&q->root_rl); 499 493 500 494 if 
(q->queue_tags) 501 495 __blk_queue_free_tags(q);
-3
block/blk-throttle.c
··· 1123 1123 goto out; 1124 1124 } 1125 1125 1126 - /* bio_associate_current() needs ioc, try creating */ 1127 - create_io_context(GFP_ATOMIC, q->node); 1128 - 1129 1126 /* 1130 1127 * A throtl_grp pointer retrieved under rcu can be used to access 1131 1128 * basic fields like stats and io rates. If a group has no rules,
+3 -1
block/blk.h
··· 18 18 kobject_get(&q->kobj); 19 19 } 20 20 21 + int blk_init_rl(struct request_list *rl, struct request_queue *q, 22 + gfp_t gfp_mask); 23 + void blk_exit_rl(struct request_list *rl); 21 24 void init_request_from_bio(struct request *req, struct bio *bio); 22 25 void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 23 26 struct bio *bio); ··· 36 33 void blk_rq_timed_out_timer(unsigned long data); 37 34 void blk_delete_timer(struct request *); 38 35 void blk_add_timer(struct request *); 39 - void __generic_unplug_device(struct request_queue *); 40 36 41 37 /* 42 38 * Internal atomic flags for request handling
-53
block/bsg-lib.c
··· 243 243 return 0; 244 244 } 245 245 EXPORT_SYMBOL_GPL(bsg_setup_queue); 246 - 247 - /** 248 - * bsg_remove_queue - Deletes the bsg dev from the q 249 - * @q: the request_queue that is to be torn down. 250 - * 251 - * Notes: 252 - * Before unregistering the queue empty any requests that are blocked 253 - */ 254 - void bsg_remove_queue(struct request_queue *q) 255 - { 256 - struct request *req; /* block request */ 257 - int counts; /* totals for request_list count and starved */ 258 - 259 - if (!q) 260 - return; 261 - 262 - /* Stop taking in new requests */ 263 - spin_lock_irq(q->queue_lock); 264 - blk_stop_queue(q); 265 - 266 - /* drain all requests in the queue */ 267 - while (1) { 268 - /* need the lock to fetch a request 269 - * this may fetch the same reqeust as the previous pass 270 - */ 271 - req = blk_fetch_request(q); 272 - /* save requests in use and starved */ 273 - counts = q->rq.count[0] + q->rq.count[1] + 274 - q->rq.starved[0] + q->rq.starved[1]; 275 - spin_unlock_irq(q->queue_lock); 276 - /* any requests still outstanding? */ 277 - if (counts == 0) 278 - break; 279 - 280 - /* This may be the same req as the previous iteration, 281 - * always send the blk_end_request_all after a prefetch. 282 - * It is not okay to not end the request because the 283 - * prefetch started the request. 284 - */ 285 - if (req) { 286 - /* return -ENXIO to indicate that this queue is 287 - * going away 288 - */ 289 - req->errors = -ENXIO; 290 - blk_end_request_all(req, -ENXIO); 291 - } 292 - 293 - msleep(200); /* allow bsg to possibly finish */ 294 - spin_lock_irq(q->queue_lock); 295 - } 296 - bsg_unregister_queue(q); 297 - } 298 - EXPORT_SYMBOL_GPL(bsg_remove_queue);
+15 -5
block/genhd.c
··· 154 154 part = rcu_dereference(ptbl->part[piter->idx]); 155 155 if (!part) 156 156 continue; 157 - if (!part->nr_sects && 157 + if (!part_nr_sects_read(part) && 158 158 !(piter->flags & DISK_PITER_INCL_EMPTY) && 159 159 !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && 160 160 piter->idx == 0)) ··· 191 191 static inline int sector_in_part(struct hd_struct *part, sector_t sector) 192 192 { 193 193 return part->start_sect <= sector && 194 - sector < part->start_sect + part->nr_sects; 194 + sector < part->start_sect + part_nr_sects_read(part); 195 195 } 196 196 197 197 /** ··· 769 769 770 770 printk("%s%s %10llu %s %s", is_part0 ? "" : " ", 771 771 bdevt_str(part_devt(part), devt_buf), 772 - (unsigned long long)part->nr_sects >> 1, 773 - disk_name(disk, part->partno, name_buf), 772 + (unsigned long long)part_nr_sects_read(part) >> 1 773 + , disk_name(disk, part->partno, name_buf), 774 774 uuid_buf); 775 775 if (is_part0) { 776 776 if (disk->driverfs_dev != NULL && ··· 862 862 while ((part = disk_part_iter_next(&piter))) 863 863 seq_printf(seqf, "%4d %7d %10llu %s\n", 864 864 MAJOR(part_devt(part)), MINOR(part_devt(part)), 865 - (unsigned long long)part->nr_sects >> 1, 865 + (unsigned long long)part_nr_sects_read(part) >> 1, 866 866 disk_name(sgp, part->partno, buf)); 867 867 disk_part_iter_exit(&piter); 868 868 ··· 1268 1268 } 1269 1269 disk->part_tbl->part[0] = &disk->part0; 1270 1270 1271 + /* 1272 + * set_capacity() and get_capacity() currently don't use 1273 + * seqcounter to read/update the part0->nr_sects. Still init 1274 + * the counter as we can read the sectors in IO submission 1275 + * path using sequence counters. 1276 + * 1277 + * TODO: Ideally set_capacity() and get_capacity() should be 1278 + * converted to make use of bd_mutex and sequence counters. 1279 + */ 1280 + seqcount_init(&disk->part0.nr_sects_seq); 1271 1281 hd_ref_init(&disk->part0); 1272 1282 1273 1283 disk->minors = minors;
+56 -3
block/ioctl.c
··· 13 13 { 14 14 struct block_device *bdevp; 15 15 struct gendisk *disk; 16 - struct hd_struct *part; 16 + struct hd_struct *part, *lpart; 17 17 struct blkpg_ioctl_arg a; 18 18 struct blkpg_partition p; 19 19 struct disk_part_iter piter; ··· 36 36 case BLKPG_ADD_PARTITION: 37 37 start = p.start >> 9; 38 38 length = p.length >> 9; 39 - /* check for fit in a hd_struct */ 40 - if (sizeof(sector_t) == sizeof(long) && 39 + /* check for fit in a hd_struct */ 40 + if (sizeof(sector_t) == sizeof(long) && 41 41 sizeof(long long) > sizeof(long)) { 42 42 long pstart = start, plength = length; 43 43 if (pstart != start || plength != length ··· 91 91 mutex_unlock(&bdevp->bd_mutex); 92 92 bdput(bdevp); 93 93 94 + return 0; 95 + case BLKPG_RESIZE_PARTITION: 96 + start = p.start >> 9; 97 + /* new length of partition in bytes */ 98 + length = p.length >> 9; 99 + /* check for fit in a hd_struct */ 100 + if (sizeof(sector_t) == sizeof(long) && 101 + sizeof(long long) > sizeof(long)) { 102 + long pstart = start, plength = length; 103 + if (pstart != start || plength != length 104 + || pstart < 0 || plength < 0) 105 + return -EINVAL; 106 + } 107 + part = disk_get_part(disk, partno); 108 + if (!part) 109 + return -ENXIO; 110 + bdevp = bdget(part_devt(part)); 111 + if (!bdevp) { 112 + disk_put_part(part); 113 + return -ENOMEM; 114 + } 115 + mutex_lock(&bdevp->bd_mutex); 116 + mutex_lock_nested(&bdev->bd_mutex, 1); 117 + if (start != part->start_sect) { 118 + mutex_unlock(&bdevp->bd_mutex); 119 + mutex_unlock(&bdev->bd_mutex); 120 + bdput(bdevp); 121 + disk_put_part(part); 122 + return -EINVAL; 123 + } 124 + /* overlap? 
*/ 125 + disk_part_iter_init(&piter, disk, 126 + DISK_PITER_INCL_EMPTY); 127 + while ((lpart = disk_part_iter_next(&piter))) { 128 + if (lpart->partno != partno && 129 + !(start + length <= lpart->start_sect || 130 + start >= lpart->start_sect + lpart->nr_sects) 131 + ) { 132 + disk_part_iter_exit(&piter); 133 + mutex_unlock(&bdevp->bd_mutex); 134 + mutex_unlock(&bdev->bd_mutex); 135 + bdput(bdevp); 136 + disk_put_part(part); 137 + return -EBUSY; 138 + } 139 + } 140 + disk_part_iter_exit(&piter); 141 + part_nr_sects_write(part, (sector_t)length); 142 + i_size_write(bdevp->bd_inode, p.length); 143 + mutex_unlock(&bdevp->bd_mutex); 144 + mutex_unlock(&bdev->bd_mutex); 145 + bdput(bdevp); 146 + disk_put_part(part); 94 147 return 0; 95 148 default: 96 149 return -EINVAL;
+3 -1
block/partition-generic.c
··· 84 84 struct device_attribute *attr, char *buf) 85 85 { 86 86 struct hd_struct *p = dev_to_part(dev); 87 - return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); 87 + return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p)); 88 88 } 89 89 90 90 static ssize_t part_ro_show(struct device *dev, ··· 294 294 err = -ENOMEM; 295 295 goto out_free; 296 296 } 297 + 298 + seqcount_init(&p->nr_sects_seq); 297 299 pdev = part_to_dev(p); 298 300 299 301 p->start_sect = start;
-38
drivers/scsi/scsi_transport_fc.c
··· 4146 4146 static void 4147 4147 fc_bsg_remove(struct request_queue *q) 4148 4148 { 4149 - struct request *req; /* block request */ 4150 - int counts; /* totals for request_list count and starved */ 4151 - 4152 4149 if (q) { 4153 - /* Stop taking in new requests */ 4154 - spin_lock_irq(q->queue_lock); 4155 - blk_stop_queue(q); 4156 - 4157 - /* drain all requests in the queue */ 4158 - while (1) { 4159 - /* need the lock to fetch a request 4160 - * this may fetch the same reqeust as the previous pass 4161 - */ 4162 - req = blk_fetch_request(q); 4163 - /* save requests in use and starved */ 4164 - counts = q->rq.count[0] + q->rq.count[1] + 4165 - q->rq.starved[0] + q->rq.starved[1]; 4166 - spin_unlock_irq(q->queue_lock); 4167 - /* any requests still outstanding? */ 4168 - if (counts == 0) 4169 - break; 4170 - 4171 - /* This may be the same req as the previous iteration, 4172 - * always send the blk_end_request_all after a prefetch. 4173 - * It is not okay to not end the request because the 4174 - * prefetch started the request. 4175 - */ 4176 - if (req) { 4177 - /* return -ENXIO to indicate that this queue is 4178 - * going away 4179 - */ 4180 - req->errors = -ENXIO; 4181 - blk_end_request_all(req, -ENXIO); 4182 - } 4183 - 4184 - msleep(200); /* allow bsg to possibly finish */ 4185 - spin_lock_irq(q->queue_lock); 4186 - } 4187 - 4188 4150 bsg_unregister_queue(q); 4189 4151 blk_cleanup_queue(q); 4190 4152 }
+1 -1
drivers/scsi/scsi_transport_iscsi.c
··· 575 575 struct iscsi_cls_host *ihost = shost->shost_data; 576 576 577 577 if (ihost->bsg_q) { 578 - bsg_remove_queue(ihost->bsg_q); 578 + bsg_unregister_queue(ihost->bsg_q); 579 579 blk_cleanup_queue(ihost->bsg_q); 580 580 } 581 581 return 0;
+32 -21
include/linux/blkdev.h
··· 46 46 struct request; 47 47 typedef void (rq_end_io_fn)(struct request *, int); 48 48 49 + #define BLK_RL_SYNCFULL (1U << 0) 50 + #define BLK_RL_ASYNCFULL (1U << 1) 51 + 49 52 struct request_list { 53 + struct request_queue *q; /* the queue this rl belongs to */ 54 + #ifdef CONFIG_BLK_CGROUP 55 + struct blkcg_gq *blkg; /* blkg this request pool belongs to */ 56 + #endif 50 57 /* 51 58 * count[], starved[], and wait[] are indexed by 52 59 * BLK_RW_SYNC/BLK_RW_ASYNC 53 60 */ 54 - int count[2]; 55 - int starved[2]; 56 - int elvpriv; 57 - mempool_t *rq_pool; 58 - wait_queue_head_t wait[2]; 61 + int count[2]; 62 + int starved[2]; 63 + mempool_t *rq_pool; 64 + wait_queue_head_t wait[2]; 65 + unsigned int flags; 59 66 }; 60 67 61 68 /* ··· 145 138 struct hd_struct *part; 146 139 unsigned long start_time; 147 140 #ifdef CONFIG_BLK_CGROUP 141 + struct request_list *rl; /* rl this rq is alloced from */ 148 142 unsigned long long start_time_ns; 149 143 unsigned long long io_start_time_ns; /* when passed to hardware */ 150 144 #endif ··· 290 282 struct list_head queue_head; 291 283 struct request *last_merge; 292 284 struct elevator_queue *elevator; 285 + int nr_rqs[2]; /* # allocated [a]sync rqs */ 286 + int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ 293 287 294 288 /* 295 - * the queue request freelist, one for reads and one for writes 289 + * If blkcg is not used, @q->root_rl serves all requests. If blkcg 290 + * is used, root blkg allocates from @q->root_rl and all other 291 + * blkgs from their own blkg->rl. Which one to use should be 292 + * determined using bio_request_list(). 
296 293 */ 297 - struct request_list rq; 294 + struct request_list root_rl; 298 295 299 296 request_fn_proc *request_fn; 300 297 make_request_fn *make_request_fn; ··· 574 561 return rw_is_sync(rq->cmd_flags); 575 562 } 576 563 577 - static inline int blk_queue_full(struct request_queue *q, int sync) 564 + static inline bool blk_rl_full(struct request_list *rl, bool sync) 578 565 { 579 - if (sync) 580 - return test_bit(QUEUE_FLAG_SYNCFULL, &q->queue_flags); 581 - return test_bit(QUEUE_FLAG_ASYNCFULL, &q->queue_flags); 566 + unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; 567 + 568 + return rl->flags & flag; 582 569 } 583 570 584 - static inline void blk_set_queue_full(struct request_queue *q, int sync) 571 + static inline void blk_set_rl_full(struct request_list *rl, bool sync) 585 572 { 586 - if (sync) 587 - queue_flag_set(QUEUE_FLAG_SYNCFULL, q); 588 - else 589 - queue_flag_set(QUEUE_FLAG_ASYNCFULL, q); 573 + unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; 574 + 575 + rl->flags |= flag; 590 576 } 591 577 592 - static inline void blk_clear_queue_full(struct request_queue *q, int sync) 578 + static inline void blk_clear_rl_full(struct request_list *rl, bool sync) 593 579 { 594 - if (sync) 595 - queue_flag_clear(QUEUE_FLAG_SYNCFULL, q); 596 - else 597 - queue_flag_clear(QUEUE_FLAG_ASYNCFULL, q); 580 + unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; 581 + 582 + rl->flags &= ~flag; 598 583 } 599 584 600 585
+1
include/linux/blkpg.h
··· 40 40 /* The subfunctions (for the op field) */ 41 41 #define BLKPG_ADD_PARTITION 1 42 42 #define BLKPG_DEL_PARTITION 2 43 + #define BLKPG_RESIZE_PARTITION 3 43 44 44 45 /* Sizes of name fields. Unused at present. */ 45 46 #define BLKPG_DEVNAMELTH 64
-1
include/linux/bsg-lib.h
··· 67 67 int bsg_setup_queue(struct device *dev, struct request_queue *q, char *name, 68 68 bsg_job_fn *job_fn, int dd_job_size); 69 69 void bsg_request_fn(struct request_queue *q); 70 - void bsg_remove_queue(struct request_queue *q); 71 70 void bsg_goose_queue(struct request_queue *q); 72 71 73 72 #endif
+57
include/linux/genhd.h
··· 97 97 98 98 struct hd_struct { 99 99 sector_t start_sect; 100 + /* 101 + * nr_sects is protected by sequence counter. One might extend a 102 + * partition while IO is happening to it and update of nr_sects 103 + * can be non-atomic on 32bit machines with 64bit sector_t. 104 + */ 100 105 sector_t nr_sects; 106 + seqcount_t nr_sects_seq; 101 107 sector_t alignment_offset; 102 108 unsigned int discard_alignment; 103 109 struct device __dev; ··· 651 645 { 652 646 if (atomic_dec_and_test(&part->ref)) 653 647 __delete_partition(part); 648 + } 649 + 650 + /* 651 + * Any access of part->nr_sects which is not protected by partition 652 + * bd_mutex or gendisk bdev bd_mutex, should be done using this 653 + * accessor function. 654 + * 655 + * Code written along the lines of i_size_read() and i_size_write(). 656 + * CONFIG_PREEMPT case optimizes the case of UP kernel with preemption 657 + * on. 658 + */ 659 + static inline sector_t part_nr_sects_read(struct hd_struct *part) 660 + { 661 + #if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP) 662 + sector_t nr_sects; 663 + unsigned seq; 664 + do { 665 + seq = read_seqcount_begin(&part->nr_sects_seq); 666 + nr_sects = part->nr_sects; 667 + } while (read_seqcount_retry(&part->nr_sects_seq, seq)); 668 + return nr_sects; 669 + #elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT) 670 + sector_t nr_sects; 671 + 672 + preempt_disable(); 673 + nr_sects = part->nr_sects; 674 + preempt_enable(); 675 + return nr_sects; 676 + #else 677 + return part->nr_sects; 678 + #endif 679 + } 680 + 681 + /* 682 + * Should be called with mutex lock held (typically bd_mutex) of partition 683 + * to provide mutual exclusion among writers otherwise seqcount might be 684 + * left in wrong state leaving the readers spinning infinitely. 
685 + */ 686 + static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) 687 + { 688 + #if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP) 689 + write_seqcount_begin(&part->nr_sects_seq); 690 + part->nr_sects = size; 691 + write_seqcount_end(&part->nr_sects_seq); 692 + #elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT) 693 + preempt_disable(); 694 + part->nr_sects = size; 695 + preempt_enable(); 696 + #else 697 + part->nr_sects = size; 698 + #endif 654 699 } 655 700 656 701 #else /* CONFIG_BLOCK */
+2 -1
include/linux/mempool.h
··· 26 26 extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 27 27 mempool_free_t *free_fn, void *pool_data); 28 28 extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, 29 - mempool_free_t *free_fn, void *pool_data, int nid); 29 + mempool_free_t *free_fn, void *pool_data, 30 + gfp_t gfp_mask, int nid); 30 31 31 32 extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask); 32 33 extern void mempool_destroy(mempool_t *pool);
+7 -5
mm/mempool.c
··· 63 63 mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 64 64 mempool_free_t *free_fn, void *pool_data) 65 65 { 66 - return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1); 66 + return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data, 67 + GFP_KERNEL, NUMA_NO_NODE); 67 68 } 68 69 EXPORT_SYMBOL(mempool_create); 69 70 70 71 mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, 71 - mempool_free_t *free_fn, void *pool_data, int node_id) 72 + mempool_free_t *free_fn, void *pool_data, 73 + gfp_t gfp_mask, int node_id) 72 74 { 73 75 mempool_t *pool; 74 - pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id); 76 + pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); 75 77 if (!pool) 76 78 return NULL; 77 79 pool->elements = kmalloc_node(min_nr * sizeof(void *), 78 - GFP_KERNEL, node_id); 80 + gfp_mask, node_id); 79 81 if (!pool->elements) { 80 82 kfree(pool); 81 83 return NULL; ··· 95 93 while (pool->curr_nr < pool->min_nr) { 96 94 void *element; 97 95 98 - element = pool->alloc(GFP_KERNEL, pool->pool_data); 96 + element = pool->alloc(gfp_mask, pool->pool_data); 99 97 if (unlikely(!element)) { 100 98 mempool_destroy(pool); 101 99 return NULL;