Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net/smc: make wr buffer count configurable

Think SMC_WR_BUF_CNT_SEND := SMC_WR_BUF_CNT used in send context and
SMC_WR_BUF_CNT_RECV := 3 * SMC_WR_BUF_CNT used in recv context. Those
get replaced with lgr->max_send_wr and lgr->max_recv_wr respectively.

Please note that although with the default sysctl values
qp_attr.cap.max_send_wr == qp_attr.cap.max_recv_wr is maintained, this
cannot be assumed to be generally true any more. I see no downside to
that, but my confidence level is rather modest.

Signed-off-by: Halil Pasic <pasic@linux.ibm.com>
Reviewed-by: Sidraya Jayagond <sidraya@linux.ibm.com>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Tested-by: Mahanta Jambigi <mjambigi@linux.ibm.com>
Link: https://patch.msgid.link/20251027224856.2970019-2-pasic@linux.ibm.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Halil Pasic and committed by
Paolo Abeni
aef3cdb4 ea7d0d60

+91 -22
+36
Documentation/networking/smc-sysctl.rst
··· 71 71 acceptable value ranges from 16 to 255. Only for SMC-R v2.1 and later. 72 72 73 73 Default: 255 74 + 75 + smcr_max_send_wr - INTEGER 76 + So-called work request buffers are SMCR link (and RDMA queue pair) level 77 + resources necessary for performing RDMA operations. Since up to 255 78 + connections can share a link group and thus also a link and the number 79 + of the work request buffers is decided when the link is allocated, 80 + depending on the workload it can be a bottleneck in a sense that threads 81 + have to wait for work request buffers to become available. Before the 82 + introduction of this control the maximal number of work request buffers 83 + available on the send path used to be hard coded to 16. With this control 84 + it becomes configurable. The acceptable range is between 2 and 2048. 85 + 86 + Please be aware that all the buffers need to be allocated as a physically 87 + contiguous array in which each element is a single buffer and has the size 88 + of SMC_WR_BUF_SIZE (48) bytes. If the allocation fails we give up much 89 + like before having this control. 90 + 91 + Default: 16 92 + 93 + smcr_max_recv_wr - INTEGER 94 + So-called work request buffers are SMCR link (and RDMA queue pair) level 95 + resources necessary for performing RDMA operations. Since up to 255 96 + connections can share a link group and thus also a link and the number 97 + of the work request buffers is decided when the link is allocated, 98 + depending on the workload it can be a bottleneck in a sense that threads 99 + have to wait for work request buffers to become available. Before the 100 + introduction of this control the maximal number of work request buffers 101 + available on the receive path used to be hard coded to 48. With this control 102 + it becomes configurable. The acceptable range is between 2 and 2048. 
103 + 104 + Please be aware that all the buffers need to be allocated as a physically 105 + contiguous array in which each element is a single buffer and has the size 106 + of SMC_WR_BUF_SIZE (48) bytes. If the allocation fails we give up much 107 + like before having this control. 108 + 109 + Default: 48
+2
include/net/netns/smc.h
··· 24 24 int sysctl_rmem; 25 25 int sysctl_max_links_per_lgr; 26 26 int sysctl_max_conns_per_lgr; 27 + unsigned int sysctl_smcr_max_send_wr; 28 + unsigned int sysctl_smcr_max_recv_wr; 27 29 }; 28 30 #endif
+6
net/smc/smc_core.h
··· 34 34 * distributions may modify it to a value between 35 35 * 16-255 as needed. 36 36 */ 37 + #define SMCR_MAX_SEND_WR_DEF 16 /* Default number of work requests per send queue */ 38 + #define SMCR_MAX_RECV_WR_DEF 48 /* Default number of work requests per recv queue */ 37 39 38 40 struct smc_lgr_list { /* list of link group definition */ 39 41 struct list_head list; ··· 368 366 /* max conn can be assigned to lgr */ 369 367 u8 max_links; 370 368 /* max links can be added in lgr */ 369 + u16 max_send_wr; 370 + /* number of WR buffers on send */ 371 + u16 max_recv_wr; 372 + /* number of WR buffers on recv */ 371 373 }; 372 374 struct { /* SMC-D */ 373 375 struct smcd_gid peer_gid;
+5 -5
net/smc/smc_ib.c
··· 669 669 .recv_cq = lnk->smcibdev->roce_cq_recv, 670 670 .srq = NULL, 671 671 .cap = { 672 - /* include unsolicited rdma_writes as well, 673 - * there are max. 2 RDMA_WRITE per 1 WR_SEND 674 - */ 675 - .max_send_wr = SMC_WR_BUF_CNT * 3, 676 - .max_recv_wr = SMC_WR_BUF_CNT * 3, 677 672 .max_send_sge = SMC_IB_MAX_SEND_SGE, 678 673 .max_recv_sge = lnk->wr_rx_sge_cnt, 679 674 .max_inline_data = 0, ··· 678 683 }; 679 684 int rc; 680 685 686 + /* include unsolicited rdma_writes as well, 687 + * there are max. 2 RDMA_WRITE per 1 WR_SEND 688 + */ 689 + qp_attr.cap.max_send_wr = 3 * lnk->lgr->max_send_wr; 690 + qp_attr.cap.max_recv_wr = lnk->lgr->max_recv_wr; 681 691 lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); 682 692 rc = PTR_ERR_OR_ZERO(lnk->roce_qp); 683 693 if (IS_ERR(lnk->roce_qp))
+2
net/smc/smc_llc.c
··· 2157 2157 init_waitqueue_head(&lgr->llc_msg_waiter); 2158 2158 init_rwsem(&lgr->llc_conf_mutex); 2159 2159 lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time); 2160 + lgr->max_send_wr = (u16)(READ_ONCE(net->smc.sysctl_smcr_max_send_wr)); 2161 + lgr->max_recv_wr = (u16)(READ_ONCE(net->smc.sysctl_smcr_max_recv_wr)); 2160 2162 } 2161 2163 2162 2164 /* called after lgr was removed from lgr_list */
+22
net/smc/smc_sysctl.c
··· 29 29 static int links_per_lgr_max = SMC_LINKS_ADD_LNK_MAX; 30 30 static int conns_per_lgr_min = SMC_CONN_PER_LGR_MIN; 31 31 static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX; 32 + static unsigned int smcr_max_wr_min = 2; 33 + static unsigned int smcr_max_wr_max = 2048; 32 34 33 35 static struct ctl_table smc_table[] = { 34 36 { ··· 101 99 .extra1 = SYSCTL_ZERO, 102 100 .extra2 = SYSCTL_ONE, 103 101 }, 102 + { 103 + .procname = "smcr_max_send_wr", 104 + .data = &init_net.smc.sysctl_smcr_max_send_wr, 105 + .maxlen = sizeof(int), 106 + .mode = 0644, 107 + .proc_handler = proc_dointvec_minmax, 108 + .extra1 = &smcr_max_wr_min, 109 + .extra2 = &smcr_max_wr_max, 110 + }, 111 + { 112 + .procname = "smcr_max_recv_wr", 113 + .data = &init_net.smc.sysctl_smcr_max_recv_wr, 114 + .maxlen = sizeof(int), 115 + .mode = 0644, 116 + .proc_handler = proc_dointvec_minmax, 117 + .extra1 = &smcr_max_wr_min, 118 + .extra2 = &smcr_max_wr_max, 119 + }, 104 120 }; 105 121 106 122 int __net_init smc_sysctl_net_init(struct net *net) ··· 150 130 WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init); 151 131 net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER; 152 132 net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER; 133 + net->smc.sysctl_smcr_max_send_wr = SMCR_MAX_SEND_WR_DEF; 134 + net->smc.sysctl_smcr_max_recv_wr = SMCR_MAX_RECV_WR_DEF; 153 135 /* disable handshake limitation by default */ 154 136 net->smc.limit_smc_hs = 0; 155 137
+2
net/smc/smc_sysctl.h
··· 25 25 net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; 26 26 net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER; 27 27 net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER; 28 + net->smc.sysctl_smcr_max_send_wr = SMCR_MAX_SEND_WR_DEF; 29 + net->smc.sysctl_smcr_max_recv_wr = SMCR_MAX_RECV_WR_DEF; 28 30 return 0; 29 31 } 30 32
+16 -15
net/smc/smc_wr.c
··· 547 547 IB_QP_DEST_QPN, 548 548 &init_attr); 549 549 550 - lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, 550 + lnk->wr_tx_cnt = min_t(size_t, lnk->lgr->max_send_wr, 551 551 lnk->qp_attr.cap.max_send_wr); 552 - lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, 552 + lnk->wr_rx_cnt = min_t(size_t, lnk->lgr->max_recv_wr, 553 553 lnk->qp_attr.cap.max_recv_wr); 554 554 } 555 555 ··· 741 741 int smc_wr_alloc_link_mem(struct smc_link *link) 742 742 { 743 743 /* allocate link related memory */ 744 - link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); 744 + link->wr_tx_bufs = kcalloc(link->lgr->max_send_wr, 745 + SMC_WR_BUF_SIZE, GFP_KERNEL); 745 746 if (!link->wr_tx_bufs) 746 747 goto no_mem; 747 - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, link->wr_rx_buflen, 748 + link->wr_rx_bufs = kcalloc(link->lgr->max_recv_wr, link->wr_rx_buflen, 748 749 GFP_KERNEL); 749 750 if (!link->wr_rx_bufs) 750 751 goto no_mem_wr_tx_bufs; 751 - link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]), 752 - GFP_KERNEL); 752 + link->wr_tx_ibs = kcalloc(link->lgr->max_send_wr, 753 + sizeof(link->wr_tx_ibs[0]), GFP_KERNEL); 753 754 if (!link->wr_tx_ibs) 754 755 goto no_mem_wr_rx_bufs; 755 - link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, 756 + link->wr_rx_ibs = kcalloc(link->lgr->max_recv_wr, 756 757 sizeof(link->wr_rx_ibs[0]), 757 758 GFP_KERNEL); 758 759 if (!link->wr_rx_ibs) 759 760 goto no_mem_wr_tx_ibs; 760 - link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT, 761 + link->wr_tx_rdmas = kcalloc(link->lgr->max_send_wr, 761 762 sizeof(link->wr_tx_rdmas[0]), 762 763 GFP_KERNEL); 763 764 if (!link->wr_tx_rdmas) 764 765 goto no_mem_wr_rx_ibs; 765 - link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT, 766 + link->wr_tx_rdma_sges = kcalloc(link->lgr->max_send_wr, 766 767 sizeof(link->wr_tx_rdma_sges[0]), 767 768 GFP_KERNEL); 768 769 if (!link->wr_tx_rdma_sges) 769 770 goto no_mem_wr_tx_rdmas; 770 - link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]), 
771 + link->wr_tx_sges = kcalloc(link->lgr->max_send_wr, sizeof(link->wr_tx_sges[0]), 771 772 GFP_KERNEL); 772 773 if (!link->wr_tx_sges) 773 774 goto no_mem_wr_tx_rdma_sges; 774 - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, 775 + link->wr_rx_sges = kcalloc(link->lgr->max_recv_wr, 775 776 sizeof(link->wr_rx_sges[0]) * link->wr_rx_sge_cnt, 776 777 GFP_KERNEL); 777 778 if (!link->wr_rx_sges) 778 779 goto no_mem_wr_tx_sges; 779 - link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL); 780 + link->wr_tx_mask = bitmap_zalloc(link->lgr->max_send_wr, GFP_KERNEL); 780 781 if (!link->wr_tx_mask) 781 782 goto no_mem_wr_rx_sges; 782 - link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT, 783 + link->wr_tx_pends = kcalloc(link->lgr->max_send_wr, 783 784 sizeof(link->wr_tx_pends[0]), 784 785 GFP_KERNEL); 785 786 if (!link->wr_tx_pends) 786 787 goto no_mem_wr_tx_mask; 787 - link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT, 788 + link->wr_tx_compl = kcalloc(link->lgr->max_send_wr, 788 789 sizeof(link->wr_tx_compl[0]), 789 790 GFP_KERNEL); 790 791 if (!link->wr_tx_compl) ··· 906 905 goto dma_unmap; 907 906 } 908 907 smc_wr_init_sge(lnk); 909 - bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT); 908 + bitmap_zero(lnk->wr_tx_mask, lnk->lgr->max_send_wr); 910 909 init_waitqueue_head(&lnk->wr_tx_wait); 911 910 rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL); 912 911 if (rc)
-2
net/smc/smc_wr.h
··· 19 19 #include "smc.h" 20 20 #include "smc_core.h" 21 21 22 - #define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ 23 - 24 22 #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) 25 23 26 24 #define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */