Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

selftests: ublk: kublk: decouple ublk_queues from ublk server threads

Add support in kublk for decoupled ublk_queues and ublk server threads.
kublk now has two modes of operation:

- (preexisting mode) threads and queues are paired 1:1, and each thread
services all the I/Os of one queue
- (new mode) thread and queue counts are independently configurable.
threads service I/Os in a way that balances load across threads even
if load is not balanced over queues.

The default is the preexisting mode. The new mode is activated by
passing the --per_io_tasks flag.

Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20250529-ublk_task_per_io-v8-6-e9d3b119336a@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Authored by Uday Shankar; committed by Jens Axboe.
Commit: abe54c16, parent: b9848ca7.

+100 -24
+2 -2
tools/testing/selftests/ublk/file_backed.c
··· 54 54 55 55 ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3); 56 56 57 - io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); 57 + io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index); 58 58 sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; 59 59 sqe[0]->user_data = build_user_data(tag, 60 60 ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); ··· 66 66 sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; 67 67 sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); 68 68 69 - io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag); 69 + io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index); 70 70 sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); 71 71 72 72 return 2;
+88 -17
tools/testing/selftests/ublk/kublk.c
··· 505 505 } 506 506 507 507 if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) { 508 + unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues; 509 + unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads; 510 + max_nr_ios_per_thread += !!(nr_ios % dev->nthreads); 508 511 ret = io_uring_register_buffers_sparse( 509 - &t->ring, dev->dev_info.queue_depth); 512 + &t->ring, max_nr_ios_per_thread); 510 513 if (ret) { 511 514 ublk_err("ublk dev %d thread %d register spare buffers failed %d", 512 515 dev->dev_info.dev_id, t->idx, ret); ··· 581 578 if (q->tgt_ops->buf_index) 582 579 buf.index = q->tgt_ops->buf_index(q, tag); 583 580 else 584 - buf.index = tag; 581 + buf.index = q->ios[tag].buf_index; 585 582 586 583 if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK) 587 584 buf.flags = UBLK_AUTO_BUF_REG_FALLBACK; ··· 663 660 664 661 static void ublk_submit_fetch_commands(struct ublk_thread *t) 665 662 { 666 - /* 667 - * Service exclusively the queue whose q_id matches our thread 668 - * index. This may change in the future. 669 - */ 670 - struct ublk_queue *q = &t->dev->q[t->idx]; 663 + struct ublk_queue *q; 671 664 struct ublk_io *io; 672 - int i = 0; 665 + int i = 0, j = 0; 673 666 674 - for (i = 0; i < q->q_depth; i++) { 675 - io = &q->ios[i]; 676 - io->t = t; 677 - ublk_queue_io_cmd(io); 667 + if (t->dev->per_io_tasks) { 668 + /* 669 + * Lexicographically order all the (qid,tag) pairs, with 670 + * qid taking priority (so (1,0) > (0,1)). Then make 671 + * this thread the daemon for every Nth entry in this 672 + * list (N is the number of threads), starting at this 673 + * thread's index. This ensures that each queue is 674 + * handled by as many ublk server threads as possible, 675 + * so that load that is concentrated on one or a few 676 + * queues can make use of all ublk server threads. 
677 + */ 678 + const struct ublksrv_ctrl_dev_info *dinfo = &t->dev->dev_info; 679 + int nr_ios = dinfo->nr_hw_queues * dinfo->queue_depth; 680 + for (i = t->idx; i < nr_ios; i += t->dev->nthreads) { 681 + int q_id = i / dinfo->queue_depth; 682 + int tag = i % dinfo->queue_depth; 683 + q = &t->dev->q[q_id]; 684 + io = &q->ios[tag]; 685 + io->t = t; 686 + io->buf_index = j++; 687 + ublk_queue_io_cmd(io); 688 + } 689 + } else { 690 + /* 691 + * Service exclusively the queue whose q_id matches our 692 + * thread index. 693 + */ 694 + struct ublk_queue *q = &t->dev->q[t->idx]; 695 + for (i = 0; i < q->q_depth; i++) { 696 + io = &q->ios[i]; 697 + io->t = t; 698 + io->buf_index = i; 699 + ublk_queue_io_cmd(io); 700 + } 678 701 } 679 702 } 680 703 ··· 855 826 return NULL; 856 827 } 857 828 /* IO perf is sensitive with queue pthread affinity on NUMA machine*/ 858 - ublk_thread_set_sched_affinity(t, info->affinity); 829 + if (info->affinity) 830 + ublk_thread_set_sched_affinity(t, info->affinity); 859 831 sem_post(info->ready); 860 832 861 833 ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", ··· 923 893 924 894 ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__); 925 895 926 - tinfo = calloc(sizeof(struct ublk_thread_info), dinfo->nr_hw_queues); 896 + tinfo = calloc(sizeof(struct ublk_thread_info), dev->nthreads); 927 897 if (!tinfo) 928 898 return -ENOMEM; 929 899 ··· 949 919 dinfo->dev_id, i); 950 920 goto fail; 951 921 } 922 + } 952 923 924 + for (i = 0; i < dev->nthreads; i++) { 953 925 tinfo[i].dev = dev; 954 926 tinfo[i].idx = i; 955 927 tinfo[i].ready = &ready; 956 - tinfo[i].affinity = &affinity_buf[i]; 928 + 929 + /* 930 + * If threads are not tied 1:1 to queues, setting thread 931 + * affinity based on queue affinity makes little sense. 932 + * However, thread CPU affinity has significant impact 933 + * on performance, so to compare fairly, we'll still set 934 + * thread CPU affinity based on queue affinity where 935 + * possible. 
936 + */ 937 + if (dev->nthreads == dinfo->nr_hw_queues) 938 + tinfo[i].affinity = &affinity_buf[i]; 957 939 pthread_create(&dev->threads[i].thread, NULL, 958 940 ublk_io_handler_fn, 959 941 &tinfo[i]); 960 942 } 961 943 962 - for (i = 0; i < dinfo->nr_hw_queues; i++) 944 + for (i = 0; i < dev->nthreads; i++) 963 945 sem_wait(&ready); 964 946 free(tinfo); 965 947 free(affinity_buf); ··· 995 953 ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id); 996 954 997 955 /* wait until we are terminated */ 998 - for (i = 0; i < dinfo->nr_hw_queues; i++) 956 + for (i = 0; i < dev->nthreads; i++) 999 957 pthread_join(dev->threads[i].thread, &thread_ret); 1000 958 fail: 1001 959 for (i = 0; i < dinfo->nr_hw_queues; i++) ··· 1105 1063 1106 1064 static int __cmd_dev_add(const struct dev_ctx *ctx) 1107 1065 { 1066 + unsigned nthreads = ctx->nthreads; 1108 1067 unsigned nr_queues = ctx->nr_hw_queues; 1109 1068 const char *tgt_type = ctx->tgt_type; 1110 1069 unsigned depth = ctx->queue_depth; ··· 1126 1083 if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) { 1127 1084 ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n", 1128 1085 __func__, nr_queues, depth); 1086 + return -EINVAL; 1087 + } 1088 + 1089 + /* default to 1:1 threads:queues if nthreads is unspecified */ 1090 + if (!nthreads) 1091 + nthreads = nr_queues; 1092 + 1093 + if (nthreads > UBLK_MAX_THREADS) { 1094 + ublk_err("%s: %u is too many threads (max %u)\n", 1095 + __func__, nthreads, UBLK_MAX_THREADS); 1096 + return -EINVAL; 1097 + } 1098 + 1099 + if (nthreads != nr_queues && !ctx->per_io_tasks) { 1100 + ublk_err("%s: threads %u must be same as queues %u if " 1101 + "not using per_io_tasks\n", 1102 + __func__, nthreads, nr_queues); 1129 1103 return -EINVAL; 1130 1104 } 1131 1105 ··· 1169 1109 if ((features & UBLK_F_QUIESCE) && 1170 1110 (info->flags & UBLK_F_USER_RECOVERY)) 1171 1111 info->flags |= UBLK_F_QUIESCE; 1112 + dev->nthreads = nthreads; 1113 + dev->per_io_tasks = ctx->per_io_tasks; 1172 
1114 dev->tgt.ops = ops; 1173 1115 dev->tgt.sq_depth = depth; 1174 1116 dev->tgt.cq_depth = depth; ··· 1369 1307 [const_ilog2(UBLK_F_UPDATE_SIZE)] = "UPDATE_SIZE", 1370 1308 [const_ilog2(UBLK_F_AUTO_BUF_REG)] = "AUTO_BUF_REG", 1371 1309 [const_ilog2(UBLK_F_QUIESCE)] = "QUIESCE", 1310 + [const_ilog2(UBLK_F_PER_IO_DAEMON)] = "PER_IO_DAEMON", 1372 1311 }; 1373 1312 struct ublk_dev *dev; 1374 1313 __u64 features = 0; ··· 1464 1401 exe, recovery ? "recover" : "add"); 1465 1402 printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1 ] [-g]\n"); 1466 1403 printf("\t[-e 0|1 ] [-i 0|1]\n"); 1404 + printf("\t[--nthreads threads] [--per_io_tasks]\n"); 1467 1405 printf("\t[target options] [backfile1] [backfile2] ...\n"); 1468 1406 printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); 1407 + printf("\tdefault: nthreads=nr_queues"); 1469 1408 1470 1409 for (i = 0; i < sizeof(tgt_ops_list) / sizeof(tgt_ops_list[0]); i++) { 1471 1410 const struct ublk_tgt_ops *ops = tgt_ops_list[i]; ··· 1524 1459 { "auto_zc", 0, NULL, 0 }, 1525 1460 { "auto_zc_fallback", 0, NULL, 0 }, 1526 1461 { "size", 1, NULL, 's'}, 1462 + { "nthreads", 1, NULL, 0 }, 1463 + { "per_io_tasks", 0, NULL, 0 }, 1527 1464 { 0, 0, 0, 0 } 1528 1465 }; 1529 1466 const struct ublk_tgt_ops *ops = NULL; ··· 1601 1534 ctx.flags |= UBLK_F_AUTO_BUF_REG; 1602 1535 if (!strcmp(longopts[option_idx].name, "auto_zc_fallback")) 1603 1536 ctx.auto_zc_fallback = 1; 1537 + if (!strcmp(longopts[option_idx].name, "nthreads")) 1538 + ctx.nthreads = strtol(optarg, NULL, 10); 1539 + if (!strcmp(longopts[option_idx].name, "per_io_tasks")) 1540 + ctx.per_io_tasks = 1; 1604 1541 break; 1605 1542 case '?': 1606 1543 /*
+5
tools/testing/selftests/ublk/kublk.h
··· 80 80 char tgt_type[16]; 81 81 unsigned long flags; 82 82 unsigned nr_hw_queues; 83 + unsigned short nthreads; 83 84 unsigned queue_depth; 84 85 int dev_id; 85 86 int nr_files; ··· 90 89 unsigned int fg:1; 91 90 unsigned int recovery:1; 92 91 unsigned int auto_zc_fallback:1; 92 + unsigned int per_io_tasks:1; 93 93 94 94 int _evtfd; 95 95 int _shmid; ··· 133 131 134 132 int result; 135 133 134 + unsigned short buf_index; 136 135 unsigned short tgt_ios; 137 136 void *private_data; 138 137 struct ublk_thread *t; ··· 206 203 struct ublksrv_ctrl_dev_info dev_info; 207 204 struct ublk_queue q[UBLK_MAX_QUEUES]; 208 205 struct ublk_thread threads[UBLK_MAX_THREADS]; 206 + unsigned nthreads; 207 + unsigned per_io_tasks; 209 208 210 209 int fds[MAX_BACK_FILES + 1]; /* fds[0] points to /dev/ublkcN */ 211 210 int nr_fds;
+3 -3
tools/testing/selftests/ublk/null.c
··· 62 62 63 63 ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, 3); 64 64 65 - io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); 65 + io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index); 66 66 sqe[0]->user_data = build_user_data(tag, 67 67 ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); 68 68 sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; ··· 70 70 __setup_nop_io(tag, iod, sqe[1], q->q_id); 71 71 sqe[1]->flags |= IOSQE_IO_HARDLINK; 72 72 73 - io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag); 73 + io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, ublk_get_io(q, tag)->buf_index); 74 74 sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); 75 75 76 76 // buf register is marked as IOSQE_CQE_SKIP_SUCCESS ··· 136 136 { 137 137 if (q->state & UBLKSRV_AUTO_BUF_REG_FALLBACK) 138 138 return (unsigned short)-1; 139 - return tag; 139 + return q->ios[tag].buf_index; 140 140 } 141 141 142 142 const struct ublk_tgt_ops null_tgt_ops = {
+2 -2
tools/testing/selftests/ublk/stripe.c
··· 141 141 ublk_io_alloc_sqes(ublk_get_io(q, tag), sqe, s->nr + extra); 142 142 143 143 if (zc) { 144 - io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); 144 + io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, io->buf_index); 145 145 sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; 146 146 sqe[0]->user_data = build_user_data(tag, 147 147 ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); ··· 167 167 if (zc) { 168 168 struct io_uring_sqe *unreg = sqe[s->nr + 1]; 169 169 170 - io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, tag); 170 + io_uring_prep_buf_unregister(unreg, 0, tag, q->q_id, io->buf_index); 171 171 unreg->user_data = build_user_data( 172 172 tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1); 173 173 }