Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

staging: lustre: o2iblnd: per NI map-on-demand value

Enables support of different map-on-demand values per NI. This is
required to support OPA coexistence with MLX5 cards. MLX5 does not
support FMR, which is enabled via map-on-demand. However OPA's
performance is greatly enhanced when FMR is enabled. In order
to enable coexistence of these two types of cards we
need to be able to set different map-on-demand values for both NIs.

This patch also lays the ground work for other per NI tunables to
be added in future patches.

Signed-off-by: Amir Shehata <amir.shehata@intel.com>
Signed-off-by: James Simmons <uja.ornl@yahoo.com>
Intel-bug-id: https://jira.hpdd.intel.com/browse/LU-7101
Reviewed-on: http://review.whamcloud.com/16367
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: Olaf Weber <olaf@sgi.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
Signed-off-by: James Simmons <jsimmons@infradead.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

authored by

Amir Shehata and committed by
Greg Kroah-Hartman
32c8deb8 f6e50066

+126 -74
+41 -25
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
··· 1283 1283 } 1284 1284 } 1285 1285 1286 - struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd, 1286 + struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd, 1287 1287 int negotiated_nfrags) 1288 1288 { 1289 - __u16 nfrags = (negotiated_nfrags != -1) ? 1290 - negotiated_nfrags : *kiblnd_tunables.kib_map_on_demand; 1289 + kib_net_t *net = ni->ni_data; 1290 + kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev; 1291 + struct lnet_ioctl_config_o2iblnd_tunables *tunables; 1292 + __u16 nfrags; 1293 + int mod; 1294 + 1295 + tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; 1296 + mod = tunables->lnd_map_on_demand; 1297 + nfrags = (negotiated_nfrags != -1) ? negotiated_nfrags : mod; 1291 1298 1292 1299 LASSERT(hdev->ibh_mrs); 1293 1300 1294 - if (*kiblnd_tunables.kib_map_on_demand > 0 && 1295 - nfrags <= rd->rd_nfrags) 1301 + if (mod > 0 && nfrags <= rd->rd_nfrags) 1296 1302 return NULL; 1297 1303 1298 1304 return hdev->ibh_mrs; ··· 1343 1337 } 1344 1338 } 1345 1339 1346 - static int kiblnd_fmr_pool_size(int ncpts) 1340 + static int 1341 + kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables, 1342 + int ncpts) 1347 1343 { 1348 - int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts; 1344 + int size = tunables->lnd_fmr_pool_size / ncpts; 1349 1345 1350 1346 return max(IBLND_FMR_POOL, size); 1351 1347 } 1352 1348 1353 - static int kiblnd_fmr_flush_trigger(int ncpts) 1349 + static int 1350 + kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables, 1351 + int ncpts) 1354 1352 { 1355 - int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts; 1353 + int size = tunables->lnd_fmr_flush_trigger / ncpts; 1356 1354 1357 1355 return max(IBLND_FMR_POOL_FLUSH, size); 1358 1356 } ··· 1372 1362 .dirty_watermark = fps->fps_flush_trigger, 1373 1363 .flush_function = NULL, 1374 1364 .flush_arg = NULL, 1375 - .cache = !!*kiblnd_tunables.kib_fmr_cache}; 1365 + .cache = !!fps->fps_cache }; 1376 1366 int rc = 
0; 1377 1367 1378 1368 fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, ··· 1518 1508 } 1519 1509 } 1520 1510 1521 - static int kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, 1522 - kib_net_t *net, int pool_size, 1523 - int flush_trigger) 1511 + static int 1512 + kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, int ncpts, 1513 + kib_net_t *net, 1514 + struct lnet_ioctl_config_o2iblnd_tunables *tunables) 1524 1515 { 1525 1516 kib_fmr_pool_t *fpo; 1526 1517 int rc; ··· 1530 1519 1531 1520 fps->fps_net = net; 1532 1521 fps->fps_cpt = cpt; 1533 - fps->fps_pool_size = pool_size; 1534 - fps->fps_flush_trigger = flush_trigger; 1522 + 1523 + fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts); 1524 + fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts); 1525 + fps->fps_cache = tunables->lnd_fmr_cache; 1526 + 1535 1527 spin_lock_init(&fps->fps_lock); 1536 1528 INIT_LIST_HEAD(&fps->fps_pool_list); 1537 1529 INIT_LIST_HEAD(&fps->fps_failed_pool_list); ··· 2164 2150 } 2165 2151 } 2166 2152 2167 - static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts) 2153 + static int kiblnd_net_init_pools(kib_net_t *net, lnet_ni_t *ni, __u32 *cpts, 2154 + int ncpts) 2168 2155 { 2156 + struct lnet_ioctl_config_o2iblnd_tunables *tunables; 2169 2157 unsigned long flags; 2170 2158 int cpt; 2171 - int rc = 0; 2159 + int rc; 2172 2160 int i; 2173 2161 2162 + tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; 2163 + 2174 2164 read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); 2175 - if (!*kiblnd_tunables.kib_map_on_demand) { 2165 + if (!tunables->lnd_map_on_demand) { 2176 2166 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); 2177 2167 goto create_tx_pool; 2178 2168 } 2179 2169 2180 2170 read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); 2181 2171 2182 - if (*kiblnd_tunables.kib_fmr_pool_size < 2183 - *kiblnd_tunables.kib_ntx / 4) { 2172 + if (tunables->lnd_fmr_pool_size < *kiblnd_tunables.kib_ntx / 
4) { 2184 2173 CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", 2185 - *kiblnd_tunables.kib_fmr_pool_size, 2174 + tunables->lnd_fmr_pool_size, 2186 2175 *kiblnd_tunables.kib_ntx / 4); 2187 2176 rc = -EINVAL; 2188 2177 goto failed; ··· 2215 2198 2216 2199 for (i = 0; i < ncpts; i++) { 2217 2200 cpt = !cpts ? i : cpts[i]; 2218 - rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net, 2219 - kiblnd_fmr_pool_size(ncpts), 2220 - kiblnd_fmr_flush_trigger(ncpts)); 2201 + rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts, 2202 + net, tunables); 2221 2203 if (rc) { 2222 2204 CERROR("Can't initialize FMR pool for CPT %d: %d\n", 2223 2205 cpt, rc); ··· 2977 2961 if (rc) 2978 2962 goto failed; 2979 2963 2980 - rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts); 2964 + rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts); 2981 2965 if (rc) { 2982 2966 CERROR("Failed to initialize NI pools: %d\n", rc); 2983 2967 goto failed;
+19 -16
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
··· 87 87 int *kib_timeout; /* comms timeout (seconds) */ 88 88 int *kib_keepalive; /* keepalive timeout (seconds) */ 89 89 int *kib_ntx; /* # tx descs */ 90 - int *kib_peercredits_hiw; /* # when eagerly to return credits */ 91 90 char **kib_default_ipif; /* default IPoIB interface */ 92 91 int *kib_retry_count; 93 92 int *kib_rnr_retry_count; 94 - int *kib_concurrent_sends; /* send work queue sizing */ 95 93 int *kib_ib_mtu; /* IB MTU */ 96 - int *kib_map_on_demand; /* map-on-demand if RD has more */ 97 - /* fragments than this value, 0 */ 98 - /* disable map-on-demand */ 99 - int *kib_fmr_pool_size; /* # FMRs in pool */ 100 - int *kib_fmr_flush_trigger; /* When to trigger FMR flush */ 101 - int *kib_fmr_cache; /* enable FMR pool cache? */ 102 94 int *kib_require_priv_port; /* accept only privileged ports */ 103 95 int *kib_use_priv_port; /* use privileged port for active connect */ 104 96 int *kib_nscheds; /* # threads on each CPT */ ··· 104 112 #define IBLND_CREDITS_DEFAULT 8 /* default # of peer credits */ 105 113 #define IBLND_CREDITS_MAX ((typeof(((kib_msg_t *) 0)->ibm_credits)) - 1) /* Max # of peer credits */ 106 114 107 - #define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \ 108 - IBLND_CREDIT_HIGHWATER_V1 : \ 109 - *kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */ 115 + /* when eagerly to return credits */ 116 + #define IBLND_CREDITS_HIGHWATER(t, v) ((v) == IBLND_MSG_VERSION_1 ? 
\ 117 + IBLND_CREDIT_HIGHWATER_V1 : \ 118 + t->lnd_peercredits_hiw) 110 119 111 120 #define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(&init_net, \ 112 121 cb, dev, \ ··· 253 260 int fps_cpt; /* CPT id */ 254 261 int fps_pool_size; 255 262 int fps_flush_trigger; 263 + int fps_cache; 256 264 int fps_increasing; /* is allocating new pool */ 257 265 unsigned long fps_next_retry; /* time stamp for retry if*/ 258 266 /* failed to allocate */ ··· 608 614 static inline int 609 615 kiblnd_cfg_rdma_frags(struct lnet_ni *ni) 610 616 { 611 - int mod = *kiblnd_tunables.kib_map_on_demand; 617 + struct lnet_ioctl_config_o2iblnd_tunables *tunables; 618 + int mod; 619 + 620 + tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; 621 + mod = tunables->lnd_map_on_demand; 612 622 return mod ? mod : IBLND_MAX_RDMA_FRAGS; 613 623 } 614 624 ··· 627 629 static inline int 628 630 kiblnd_concurrent_sends(int version, struct lnet_ni *ni) 629 631 { 632 + struct lnet_ioctl_config_o2iblnd_tunables *tunables; 630 633 int concurrent_sends; 631 634 632 - concurrent_sends = *kiblnd_tunables.kib_concurrent_sends; 635 + tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; 636 + concurrent_sends = tunables->lnd_concurrent_sends; 633 637 634 638 if (version == IBLND_MSG_VERSION_1) { 635 639 if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2) ··· 766 766 static inline int 767 767 kiblnd_need_noop(kib_conn_t *conn) 768 768 { 769 + struct lnet_ioctl_config_o2iblnd_tunables *tunables; 770 + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; 771 + 769 772 LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); 773 + tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; 770 774 771 775 if (conn->ibc_outstanding_credits < 772 - IBLND_CREDITS_HIGHWATER(conn->ibc_version) && 776 + IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) && 773 777 !kiblnd_send_keepalive(conn)) 774 778 return 0; /* No need to send NOOP */ 775 779 ··· 981 977 #define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data) 982 978 #define 
KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len) 983 979 984 - struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, 985 - kib_rdma_desc_t *rd, 980 + struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd, 986 981 int negotiated_nfrags); 987 982 void kiblnd_map_rx_descs(kib_conn_t *conn); 988 983 void kiblnd_unmap_rx_descs(kib_conn_t *conn);
+8 -5
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
··· 612 612 static int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, 613 613 int nfrags) 614 614 { 615 - kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev; 616 615 kib_net_t *net = ni->ni_data; 616 + kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev; 617 617 struct ib_mr *mr = NULL; 618 618 __u32 nob; 619 619 int i; ··· 636 636 nob += rd->rd_frags[i].rf_nob; 637 637 } 638 638 639 - mr = kiblnd_find_rd_dma_mr(hdev, rd, tx->tx_conn ? 639 + mr = kiblnd_find_rd_dma_mr(ni, rd, tx->tx_conn ? 640 640 tx->tx_conn->ibc_max_frags : -1); 641 641 if (mr) { 642 642 /* found pre-mapping MR */ ··· 2577 2577 reason = "Unknown"; 2578 2578 break; 2579 2579 2580 - case IBLND_REJECT_RDMA_FRAGS: 2580 + case IBLND_REJECT_RDMA_FRAGS: { 2581 + struct lnet_ioctl_config_lnd_tunables *tunables; 2582 + 2581 2583 if (!cp) { 2582 2584 reason = "can't negotiate max frags"; 2583 2585 goto out; 2584 2586 } 2585 - if (!*kiblnd_tunables.kib_map_on_demand) { 2587 + tunables = peer->ibp_ni->ni_lnd_tunables; 2588 + if (!tunables->lt_tun_u.lt_o2ib.lnd_map_on_demand) { 2586 2589 reason = "map_on_demand must be enabled"; 2587 2590 goto out; 2588 2591 } ··· 2597 2594 peer->ibp_max_frags = frag_num; 2598 2595 reason = "rdma fragments"; 2599 2596 break; 2600 - 2597 + } 2601 2598 case IBLND_REJECT_MSG_QUEUE_SIZE: 2602 2599 if (!cp) { 2603 2600 reason = "can't negotiate queue depth";
+58 -28
drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
··· 152 152 .kib_timeout = &timeout, 153 153 .kib_keepalive = &keepalive, 154 154 .kib_ntx = &ntx, 155 - .kib_peercredits_hiw = &peer_credits_hiw, 156 155 .kib_default_ipif = &ipif_name, 157 156 .kib_retry_count = &retry_count, 158 157 .kib_rnr_retry_count = &rnr_retry_count, 159 - .kib_concurrent_sends = &concurrent_sends, 160 158 .kib_ib_mtu = &ib_mtu, 161 - .kib_map_on_demand = &map_on_demand, 162 - .kib_fmr_pool_size = &fmr_pool_size, 163 - .kib_fmr_flush_trigger = &fmr_flush_trigger, 164 - .kib_fmr_cache = &fmr_cache, 165 159 .kib_require_priv_port = &require_privileged_port, 166 160 .kib_use_priv_port = &use_privileged_port, 167 161 .kib_nscheds = &nscheds ··· 176 182 177 183 int kiblnd_tunables_setup(struct lnet_ni *ni) 178 184 { 185 + struct lnet_ioctl_config_o2iblnd_tunables *tunables; 186 + 187 + /* 188 + * if there was no tunables specified, setup the tunables to be 189 + * defaulted 190 + */ 191 + if (!ni->ni_lnd_tunables) { 192 + LIBCFS_ALLOC(ni->ni_lnd_tunables, 193 + sizeof(*ni->ni_lnd_tunables)); 194 + if (!ni->ni_lnd_tunables) 195 + return -ENOMEM; 196 + 197 + memcpy(&ni->ni_lnd_tunables->lt_tun_u.lt_o2ib, 198 + &default_tunables, sizeof(*tunables)); 199 + } 200 + tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; 201 + 202 + /* Current API version */ 203 + tunables->lnd_version = 0; 204 + 179 205 if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) { 180 206 CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n", 181 207 *kiblnd_tunables.kib_ib_mtu); ··· 223 209 if (ni->ni_peertxcredits > credits) 224 210 ni->ni_peertxcredits = credits; 225 211 226 - if (*kiblnd_tunables.kib_peercredits_hiw < ni->ni_peertxcredits / 2) 227 - *kiblnd_tunables.kib_peercredits_hiw = ni->ni_peertxcredits / 2; 212 + if (!tunables->lnd_peercredits_hiw) 213 + tunables->lnd_peercredits_hiw = peer_credits_hiw; 228 214 229 - if (*kiblnd_tunables.kib_peercredits_hiw >= ni->ni_peertxcredits) 230 - *kiblnd_tunables.kib_peercredits_hiw = ni->ni_peertxcredits - 1; 
215 + if (tunables->lnd_peercredits_hiw < ni->ni_peertxcredits / 2) 216 + tunables->lnd_peercredits_hiw = ni->ni_peertxcredits / 2; 231 217 232 - if (*kiblnd_tunables.kib_map_on_demand < 0 || 233 - *kiblnd_tunables.kib_map_on_demand > IBLND_MAX_RDMA_FRAGS) 234 - *kiblnd_tunables.kib_map_on_demand = 0; /* disable map-on-demand */ 218 + if (tunables->lnd_peercredits_hiw >= ni->ni_peertxcredits) 219 + tunables->lnd_peercredits_hiw = ni->ni_peertxcredits - 1; 235 220 236 - if (*kiblnd_tunables.kib_map_on_demand == 1) 237 - *kiblnd_tunables.kib_map_on_demand = 2; /* don't make sense to create map if only one fragment */ 238 - 239 - if (!*kiblnd_tunables.kib_concurrent_sends) { 240 - if (*kiblnd_tunables.kib_map_on_demand > 0 && 241 - *kiblnd_tunables.kib_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) 242 - *kiblnd_tunables.kib_concurrent_sends = ni->ni_peertxcredits * 2; 243 - else 244 - *kiblnd_tunables.kib_concurrent_sends = ni->ni_peertxcredits; 221 + if (tunables->lnd_map_on_demand < 0 || 222 + tunables->lnd_map_on_demand > IBLND_MAX_RDMA_FRAGS) { 223 + /* disable map-on-demand */ 224 + tunables->lnd_map_on_demand = 0; 245 225 } 246 226 247 - if (*kiblnd_tunables.kib_concurrent_sends > ni->ni_peertxcredits * 2) 248 - *kiblnd_tunables.kib_concurrent_sends = ni->ni_peertxcredits * 2; 227 + if (tunables->lnd_map_on_demand == 1) { 228 + /* don't make sense to create map if only one fragment */ 229 + tunables->lnd_map_on_demand = 2; 230 + } 249 231 250 - if (*kiblnd_tunables.kib_concurrent_sends < ni->ni_peertxcredits / 2) 251 - *kiblnd_tunables.kib_concurrent_sends = ni->ni_peertxcredits / 2; 232 + if (!tunables->lnd_concurrent_sends) { 233 + if (tunables->lnd_map_on_demand > 0 && 234 + tunables->lnd_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) { 235 + tunables->lnd_concurrent_sends = 236 + ni->ni_peertxcredits * 2; 237 + } else { 238 + tunables->lnd_concurrent_sends = ni->ni_peertxcredits; 239 + } 240 + } 252 241 253 - if (*kiblnd_tunables.kib_concurrent_sends < 
ni->ni_peertxcredits) { 242 + if (tunables->lnd_concurrent_sends > ni->ni_peertxcredits * 2) 243 + tunables->lnd_concurrent_sends = ni->ni_peertxcredits * 2; 244 + 245 + if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits / 2) 246 + tunables->lnd_concurrent_sends = ni->ni_peertxcredits / 2; 247 + 248 + if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits) { 254 249 CWARN("Concurrent sends %d is lower than message queue size: %d, performance may drop slightly.\n", 255 - *kiblnd_tunables.kib_concurrent_sends, ni->ni_peertxcredits); 250 + tunables->lnd_concurrent_sends, ni->ni_peertxcredits); 256 251 } 252 + 253 + if (!tunables->lnd_fmr_pool_size) 254 + tunables->lnd_fmr_pool_size = fmr_pool_size; 255 + if (!tunables->lnd_fmr_flush_trigger) 256 + tunables->lnd_fmr_flush_trigger = fmr_flush_trigger; 257 + if (!tunables->lnd_fmr_cache) 258 + tunables->lnd_fmr_cache = fmr_cache; 257 259 258 260 return 0; 259 261 }