Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

blk-mq: fix elevator depth_updated method

Current depth_updated has some problems:

1) depth_updated() will be called for each hctx, while all elevators
will update async_depth for the disk level, this is not related to hctx;
2) In blk_mq_update_nr_requests(), if a previous hctx update succeeded and
this hctx update failed, q->nr_requests will not be updated, while
async_depth is already updated with the new nr_requests in the previous
depth_updated();
3) All elevators are using q->nr_requests to calculate async_depth now;
however, q->nr_requests is still the old value when depth_updated() is
called from blk_mq_update_nr_requests();

These problems were introduced first in the error path, then in mq-deadline,
and recently in bfq and kyber. Fix them by:

- pass in request_queue instead of hctx;
- move depth_updated() after q->nr_requests is updated in
blk_mq_update_nr_requests();
- add depth_updated() call inside init_sched() method to initialize
async_depth;
- remove init_hctx() method for mq-deadline and bfq that is useless now;

Fixes: 77f1e0a52d26 ("bfq: update internal depth state when queue depth changes")
Fixes: 39823b47bbd4 ("block/mq-deadline: Fix the tag reservation code")
Fixes: 42e6c6ce03fd ("lib/sbitmap: convert shallow_depth from one word to the whole sbitmap")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Li Nan <linan122@huawei.com>
Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
Link: https://lore.kernel.org/r/20250821060612.1729939-2-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Yu Kuai and committed by
Jens Axboe
7d337eef 225dc96f

+42 -53
+5 -17
block/bfq-iosched.c
··· 7109 7109 * See the comments on bfq_limit_depth for the purpose of 7110 7110 * the depths set in the function. Return minimum shallow depth we'll use. 7111 7111 */ 7112 - static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) 7112 + static void bfq_depth_updated(struct request_queue *q) 7113 7113 { 7114 - unsigned int nr_requests = bfqd->queue->nr_requests; 7114 + struct bfq_data *bfqd = q->elevator->elevator_data; 7115 + unsigned int nr_requests = q->nr_requests; 7115 7116 7116 7117 /* 7117 7118 * In-word depths if no bfq_queue is being weight-raised: ··· 7144 7143 bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U); 7145 7144 /* no more than ~37% of tags for sync writes (~20% extra tags) */ 7146 7145 bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U); 7147 - } 7148 7146 7149 - static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) 7150 - { 7151 - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; 7152 - struct blk_mq_tags *tags = hctx->sched_tags; 7153 - 7154 - bfq_update_depths(bfqd, &tags->bitmap_tags); 7155 - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); 7156 - } 7157 - 7158 - static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) 7159 - { 7160 - bfq_depth_updated(hctx); 7161 - return 0; 7147 + blk_mq_set_min_shallow_depth(q, 1); 7162 7148 } 7163 7149 7164 7150 static void bfq_exit_queue(struct elevator_queue *e) ··· 7357 7369 goto out_free; 7358 7370 bfq_init_root_group(bfqd->root_group, bfqd); 7359 7371 bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); 7372 + bfq_depth_updated(q); 7360 7373 7361 7374 /* We dispatch from request queue wide instead of hw queue */ 7362 7375 blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); ··· 7617 7628 .request_merged = bfq_request_merged, 7618 7629 .has_work = bfq_has_work, 7619 7630 .depth_updated = bfq_depth_updated, 7620 - .init_hctx = bfq_init_hctx, 7621 7631 .init_sched = bfq_init_queue, 7622 7632 .exit_sched = bfq_exit_queue, 7623 
7633 },
+11
block/blk-mq-sched.h
··· 92 92 return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); 93 93 } 94 94 95 + static inline void blk_mq_set_min_shallow_depth(struct request_queue *q, 96 + unsigned int depth) 97 + { 98 + struct blk_mq_hw_ctx *hctx; 99 + unsigned long i; 100 + 101 + queue_for_each_hw_ctx(q, hctx, i) 102 + sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags, 103 + depth); 104 + } 105 + 95 106 #endif
+13 -12
block/blk-mq.c
··· 4951 4951 false); 4952 4952 } 4953 4953 if (ret) 4954 - break; 4955 - if (q->elevator && q->elevator->type->ops.depth_updated) 4956 - q->elevator->type->ops.depth_updated(hctx); 4957 - } 4958 - if (!ret) { 4959 - q->nr_requests = nr; 4960 - if (blk_mq_is_shared_tags(set->flags)) { 4961 - if (q->elevator) 4962 - blk_mq_tag_update_sched_shared_tags(q); 4963 - else 4964 - blk_mq_tag_resize_shared_tags(set, nr); 4965 - } 4954 + goto out; 4966 4955 } 4967 4956 4957 + q->nr_requests = nr; 4958 + if (q->elevator && q->elevator->type->ops.depth_updated) 4959 + q->elevator->type->ops.depth_updated(q); 4960 + 4961 + if (blk_mq_is_shared_tags(set->flags)) { 4962 + if (q->elevator) 4963 + blk_mq_tag_update_sched_shared_tags(q); 4964 + else 4965 + blk_mq_tag_resize_shared_tags(set, nr); 4966 + } 4967 + 4968 + out: 4968 4969 blk_mq_unquiesce_queue(q); 4969 4970 4970 4971 return ret;
+1 -1
block/elevator.h
··· 37 37 void (*exit_sched)(struct elevator_queue *); 38 38 int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); 39 39 void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); 40 - void (*depth_updated)(struct blk_mq_hw_ctx *); 40 + void (*depth_updated)(struct request_queue *); 41 41 42 42 bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); 43 43 bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int);
+9 -10
block/kyber-iosched.c
··· 399 399 return ERR_PTR(ret); 400 400 } 401 401 402 + static void kyber_depth_updated(struct request_queue *q) 403 + { 404 + struct kyber_queue_data *kqd = q->elevator->elevator_data; 405 + 406 + kqd->async_depth = q->nr_requests * KYBER_ASYNC_PERCENT / 100U; 407 + blk_mq_set_min_shallow_depth(q, kqd->async_depth); 408 + } 409 + 402 410 static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) 403 411 { 404 412 struct kyber_queue_data *kqd; ··· 421 413 422 414 eq->elevator_data = kqd; 423 415 q->elevator = eq; 416 + kyber_depth_updated(q); 424 417 425 418 return 0; 426 419 } ··· 447 438 spin_lock_init(&kcq->lock); 448 439 for (i = 0; i < KYBER_NUM_DOMAINS; i++) 449 440 INIT_LIST_HEAD(&kcq->rq_list[i]); 450 - } 451 - 452 - static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) 453 - { 454 - struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; 455 - struct blk_mq_tags *tags = hctx->sched_tags; 456 - 457 - kqd->async_depth = hctx->queue->nr_requests * KYBER_ASYNC_PERCENT / 100U; 458 - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); 459 441 } 460 442 461 443 static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) ··· 493 493 khd->batching = 0; 494 494 495 495 hctx->sched_data = khd; 496 - kyber_depth_updated(hctx); 497 496 498 497 return 0; 499 498
+3 -13
block/mq-deadline.c
··· 507 507 } 508 508 509 509 /* Called by blk_mq_update_nr_requests(). */ 510 - static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) 510 + static void dd_depth_updated(struct request_queue *q) 511 511 { 512 - struct request_queue *q = hctx->queue; 513 512 struct deadline_data *dd = q->elevator->elevator_data; 514 - struct blk_mq_tags *tags = hctx->sched_tags; 515 513 516 514 dd->async_depth = q->nr_requests; 517 - 518 - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); 519 - } 520 - 521 - /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ 522 - static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 523 - { 524 - dd_depth_updated(hctx); 525 - return 0; 515 + blk_mq_set_min_shallow_depth(q, 1); 526 516 } 527 517 528 518 static void dd_exit_sched(struct elevator_queue *e) ··· 577 587 blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); 578 588 579 589 q->elevator = eq; 590 + dd_depth_updated(q); 580 591 return 0; 581 592 } 582 593 ··· 1039 1048 .has_work = dd_has_work, 1040 1049 .init_sched = dd_init_sched, 1041 1050 .exit_sched = dd_exit_sched, 1042 - .init_hctx = dd_init_hctx, 1043 1051 }, 1044 1052 1045 1053 #ifdef CONFIG_BLK_DEBUG_FS