Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

blk-mq: use array to manage hctx map instead of xarray

After commit 4e5cc99e1e48 ("blk-mq: manage hctx map via xarray"), we use
an xarray instead of an array to store hctx, but in poll mode, each time
in blk_mq_poll, we need to use xa_load to find the corresponding hctx,
which introduces some cost. In my test, xa_load may cost 3.8% CPU.

This patch reverts the previous change, eliminating the overhead of
xa_load, and can result in a 3% performance improvement.

Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Fengnan Chang and committed by
Jens Axboe
d0c98769 c6a45ee7

+42 -25
+1 -1
block/blk-mq-tag.c
··· 499 499 int srcu_idx; 500 500 501 501 /* 502 - * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table 502 + * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx 503 503 * while the queue is frozen. So we can use q_usage_counter to avoid 504 504 * racing with it. 505 505 */
+37 -21
block/blk-mq.c
··· 730 730 * If not tell the caller that it should skip this queue. 731 731 */ 732 732 ret = -EXDEV; 733 - data.hctx = xa_load(&q->hctx_table, hctx_idx); 733 + data.hctx = q->queue_hw_ctx[hctx_idx]; 734 734 if (!blk_mq_hw_queue_mapped(data.hctx)) 735 735 goto out_queue_exit; 736 736 cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); ··· 3946 3946 blk_free_flush_queue_callback); 3947 3947 hctx->fq = NULL; 3948 3948 3949 - xa_erase(&q->hctx_table, hctx_idx); 3950 - 3951 3949 spin_lock(&q->unused_hctx_lock); 3952 3950 list_add(&hctx->hctx_list, &q->unused_hctx_list); 3953 3951 spin_unlock(&q->unused_hctx_lock); ··· 3987 3989 hctx->numa_node)) 3988 3990 goto exit_hctx; 3989 3991 3990 - if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) 3991 - goto exit_flush_rq; 3992 - 3993 3992 return 0; 3994 3993 3995 - exit_flush_rq: 3996 - if (set->ops->exit_request) 3997 - set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); 3998 3994 exit_hctx: 3999 3995 if (set->ops->exit_hctx) 4000 3996 set->ops->exit_hctx(hctx, hctx_idx); ··· 4377 4385 kobject_put(&hctx->kobj); 4378 4386 } 4379 4387 4380 - xa_destroy(&q->hctx_table); 4388 + kfree(q->queue_hw_ctx); 4381 4389 4382 4390 /* 4383 4391 * release .mq_kobj and sw queue's kobject now because ··· 4521 4529 static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 4522 4530 struct request_queue *q) 4523 4531 { 4524 - struct blk_mq_hw_ctx *hctx; 4525 - unsigned long i, j; 4532 + int i, j, end; 4533 + struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; 4534 + 4535 + if (q->nr_hw_queues < set->nr_hw_queues) { 4536 + struct blk_mq_hw_ctx **new_hctxs; 4537 + 4538 + new_hctxs = kcalloc_node(set->nr_hw_queues, 4539 + sizeof(*new_hctxs), GFP_KERNEL, 4540 + set->numa_node); 4541 + if (!new_hctxs) 4542 + return; 4543 + if (hctxs) 4544 + memcpy(new_hctxs, hctxs, q->nr_hw_queues * 4545 + sizeof(*hctxs)); 4546 + q->queue_hw_ctx = new_hctxs; 4547 + kfree(hctxs); 4548 + hctxs = new_hctxs; 4549 + } 4526 4550 4527 4551 for (i 
= 0; i < set->nr_hw_queues; i++) { 4528 4552 int old_node; 4529 4553 int node = blk_mq_get_hctx_node(set, i); 4530 - struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); 4554 + struct blk_mq_hw_ctx *old_hctx = hctxs[i]; 4531 4555 4532 4556 if (old_hctx) { 4533 4557 old_node = old_hctx->numa_node; 4534 4558 blk_mq_exit_hctx(q, set, old_hctx, i); 4535 4559 } 4536 4560 4537 - if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { 4561 + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node); 4562 + if (!hctxs[i]) { 4538 4563 if (!old_hctx) 4539 4564 break; 4540 4565 pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", 4541 4566 node, old_node); 4542 - hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); 4543 - WARN_ON_ONCE(!hctx); 4567 + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, 4568 + old_node); 4569 + WARN_ON_ONCE(!hctxs[i]); 4544 4570 } 4545 4571 } 4546 4572 /* ··· 4567 4557 */ 4568 4558 if (i != set->nr_hw_queues) { 4569 4559 j = q->nr_hw_queues; 4560 + end = i; 4570 4561 } else { 4571 4562 j = i; 4563 + end = q->nr_hw_queues; 4572 4564 q->nr_hw_queues = set->nr_hw_queues; 4573 4565 } 4574 4566 4575 - xa_for_each_start(&q->hctx_table, j, hctx, j) 4576 - blk_mq_exit_hctx(q, set, hctx, j); 4567 + for (; j < end; j++) { 4568 + struct blk_mq_hw_ctx *hctx = hctxs[j]; 4569 + 4570 + if (hctx) { 4571 + blk_mq_exit_hctx(q, set, hctx, j); 4572 + hctxs[j] = NULL; 4573 + } 4574 + } 4577 4575 } 4578 4576 4579 4577 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, ··· 4616 4598 4617 4599 INIT_LIST_HEAD(&q->unused_hctx_list); 4618 4600 spin_lock_init(&q->unused_hctx_lock); 4619 - 4620 - xa_init(&q->hctx_table); 4621 4601 4622 4602 blk_mq_realloc_hw_ctxs(set, q); 4623 4603 if (!q->nr_hw_queues) ··· 5203 5187 { 5204 5188 if (!blk_mq_can_poll(q)) 5205 5189 return 0; 5206 - return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags); 5190 + return blk_hctx_poll(q, q->queue_hw_ctx[cookie], iob, flags); 5207 5191 } 
5208 5192 5209 5193 int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
+1 -1
block/blk-mq.h
··· 84 84 enum hctx_type type, 85 85 unsigned int cpu) 86 86 { 87 - return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); 87 + return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; 88 88 } 89 89 90 90 static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf)
+2 -1
include/linux/blk-mq.h
··· 1016 1016 } 1017 1017 1018 1018 #define queue_for_each_hw_ctx(q, hctx, i) \ 1019 - xa_for_each(&(q)->hctx_table, (i), (hctx)) 1019 + for ((i) = 0; (i) < (q)->nr_hw_queues && \ 1020 + ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) 1020 1021 1021 1022 #define hctx_for_each_ctx(hctx, ctx, i) \ 1022 1023 for ((i) = 0; (i) < (hctx)->nr_ctx && \
+1 -1
include/linux/blkdev.h
··· 503 503 504 504 /* hw dispatch queues */ 505 505 unsigned int nr_hw_queues; 506 - struct xarray hctx_table; 506 + struct blk_mq_hw_ctx **queue_hw_ctx; 507 507 508 508 struct percpu_ref q_usage_counter; 509 509 struct lock_class_key io_lock_cls_key;