Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

RDMA/mlx5: Add support for DMABUF MR registrations with Data-direct

Add support for DMABUF MR registrations with a Data-direct device.

When userspace asks to register a DMABUF MR with the data-direct bit set,
the driver follows the algorithm below; a condensed code sketch of this
flow appears after the steps.

1) Obtain a pinned DMABUF umem from the IB core using the user-supplied
parameters (FD, offset, length) and the DMA PF device. The DMA PF device
is needed so that the IOMMU will allow the DMA PF to access the user
buffer over PCI.

2) Create a KSM MKEY whose entries are set according to the user buffer
VA-to-IOVA mapping; this MKEY is the data direct device's crossed MKEY.
The KSM MKEY is umrable and is used as part of the MR cache. It is
created with the device's internal 'data direct' kernel PD.

3) Create a crossing MKEY that points to the KSM MKEY using the crossing
access mode.

4) Manage the KSM MKEY by adding it to a list of 'data direct' MKEYs
managed on the mlx5_ib device.

5) Return the crossing MKEY to the user, created with its supplied PD.

Upon the DMA PF unbind flow, the driver revokes the KSM entries.
The final deregistration happens under the hood once the application
deregisters its MKEY.
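
For orientation, the registration path condenses to roughly the following
sketch (locking, error handling and debug prints trimmed; the names are the
ones used in the mr.c hunks below, so this is illustrative rather than the
literal patch text):

static struct ib_mr *
reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset, u64 length,
                                  u64 virt_addr, int fd, int access_flags)
{
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct ib_mr *crossed_mr, *crossing_mr;

        /* KSM translation requires a page-aligned IOVA and pinned pages */
        if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
                return ERR_PTR(-EOPNOTSUPP);

        /* Steps 1+2: pinned DMABUF umem against the DMA PF, exposed via a KSM MKEY */
        crossed_mr = reg_user_mr_dmabuf(pd, &dev->data_direct_dev->pdev->dev,
                                        offset, length, virt_addr, fd,
                                        access_flags, MLX5_MKC_ACCESS_MODE_KSM);

        /* Step 3: crossing MKEY in the caller's PD, pointing at the KSM MKEY */
        crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length,
                                                  access_flags, crossed_mr->lkey);

        /* Step 4: track the KSM MKEY so a DMA PF unbind can revoke it */
        list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
        to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
        to_mmr(crossing_mr)->data_direct = true;

        /* Step 5: the crossing MKEY is what the application gets back */
        return crossing_mr;
}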

Notes:
- This version supports only the PINNED UMEM mode, so there is no
dependency on ODP.
- The IOVA supplied by the application must be system page aligned due to
the HW's KSM translation scheme.
- The crossing MKEY will not be umrable or part of the MR cache, as we
cannot change its crossed (i.e. KSM) MKEY over UMR.
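
For completeness, a minimal userspace sketch of how the new flag is meant to
be used, assuming the rdma-core counterpart mlx5dv_reg_dmabuf_mr() and its
MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT flag (both live outside this patch, so
treat the exact names and signature as an assumption):

#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>

/* Hypothetical helper: register a dma-buf so that the data direct (DMA PF)
 * device performs the PCI transfers. iova must be system page aligned, as
 * noted above.
 */
static struct ibv_mr *reg_data_direct_mr(struct ibv_pd *pd, int dmabuf_fd,
                                         size_t length, uint64_t iova)
{
        return mlx5dv_reg_dmabuf_mr(pd, 0 /* offset */, length, iova, dmabuf_fd,
                                    IBV_ACCESS_LOCAL_WRITE |
                                    IBV_ACCESS_REMOTE_READ |
                                    IBV_ACCESS_REMOTE_WRITE,
                                    MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT);
}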

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Link: https://patch.msgid.link/1f99d8020ed540d9702b9e2252a145a439609ba6.1722512548.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>

Authored by Yishai Hadas and committed by Leon Romanovsky
de8f847a 3aa73c6b

+358 -72
+11
drivers/infiniband/hw/mlx5/main.c
···
        if (ret)
                return ret;

+       INIT_LIST_HEAD(&dev->data_direct_mr_list);
        ret = mlx5_data_direct_ib_reg(dev, vuid);
        if (ret)
                mlx5_ib_free_data_direct_resources(dev);
···
                            dump_fill_mkey),
                UA_MANDATORY));

+ADD_UVERBS_ATTRIBUTES_SIMPLE(
+       mlx5_ib_reg_dmabuf_mr,
+       UVERBS_OBJECT_MR,
+       UVERBS_METHOD_REG_DMABUF_MR,
+       UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
+                            enum mlx5_ib_uapi_reg_dmabuf_flags,
+                            UA_OPTIONAL));
+
static const struct uapi_definition mlx5_ib_defs[] = {
        UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
        UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
···
        UAPI_DEF_CHAIN(mlx5_ib_create_cq_defs),

        UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context),
+       UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, &mlx5_ib_reg_dmabuf_mr),
        UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
                                      UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
        UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR),
···
void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev)
{
        mutex_lock(&ibdev->data_direct_lock);
+       mlx5_ib_revoke_data_direct_mrs(ibdev);
        ibdev->data_direct_dev = NULL;
        mutex_unlock(&ibdev->data_direct_lock);
}
+8
drivers/infiniband/hw/mlx5/mlx5_ib.h
···
        struct mlx5_ib_mkey mmkey;

        struct ib_umem *umem;
+       /* The mr is data direct related */
+       u8 data_direct :1;

        union {
                /* Used only by kernel MRs (umem == NULL) */
···
                } odp_destroy;
                struct ib_odp_counters odp_stats;
                bool is_odp_implicit;
+               /* The affilated data direct crossed mr */
+               struct mlx5_ib_mr *dd_crossed_mr;
+               struct list_head dd_node;
+               u8 revoked :1;
                };
        };
};
···
        /* protect resources needed as part of reset flow */
        spinlock_t reset_flow_resource_lock;
        struct list_head qp_list;
+       struct list_head data_direct_mr_list;
        /* Array with num_ports elements */
        struct mlx5_ib_port *port;
        struct mlx5_sq_bfreg bfreg;
···
void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev,
                              struct mlx5_data_direct_dev *dev);
void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev);
+void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev);

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
+263 -41
drivers/infiniband/hw/mlx5/mr.c
···
#include "dm.h"
#include "mlx5_ib.h"
#include "umr.h"
+#include "data_direct.h"

enum {
        MAX_PENDING_REG_MR = 8,
···
        create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
                                     u64 iova, int access_flags,
-                                    unsigned int page_size, bool populate);
+                                    unsigned int page_size, bool populate,
+                                    int access_mode);
+static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);

static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
                                          struct ib_pd *pd)
···

static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
                                             struct ib_umem *umem, u64 iova,
-                                            int access_flags)
+                                            int access_flags, int access_mode)
{
-       struct mlx5r_cache_rb_key rb_key = {
-               .access_mode = MLX5_MKC_ACCESS_MODE_MTT,
-       };
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       struct mlx5r_cache_rb_key rb_key = {};
        struct mlx5_cache_ent *ent;
        struct mlx5_ib_mr *mr;
        unsigned int page_size;
···
        if (WARN_ON(!page_size))
                return ERR_PTR(-EINVAL);

+       rb_key.access_mode = access_mode;
        rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
        rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
        rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
···
         */
        if (!ent) {
                mutex_lock(&dev->slow_path_mutex);
-               mr = reg_create(pd, umem, iova, access_flags, page_size, false);
+               mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode);
                mutex_unlock(&dev->slow_path_mutex);
                if (IS_ERR(mr))
                        return mr;
···
        return mr;
}

+static struct ib_mr *
+reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
+                           u32 crossed_lkey)
+{
+       struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
+       struct mlx5_ib_mr *mr;
+       void *mkc;
+       int inlen;
+       u32 *in;
+       int err;
+
+       if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+       if (!mr)
+               return ERR_PTR(-ENOMEM);
+
+       inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+       in = kvzalloc(inlen, GFP_KERNEL);
+       if (!in) {
+               err = -ENOMEM;
+               goto err_1;
+       }
+
+       mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+       MLX5_SET(mkc, mkc, crossing_target_vhca_id,
+                MLX5_CAP_GEN(dev->mdev, vhca_id));
+       MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
+       MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
+       MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
+
+       /* for this crossing mkey IOVA should be 0 and len should be IOVA + len */
+       set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
+       MLX5_SET64(mkc, mkc, len, iova + length);
+
+       MLX5_SET(mkc, mkc, free, 0);
+       MLX5_SET(mkc, mkc, umr_en, 0);
+       err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
+       if (err)
+               goto err_2;
+
+       mr->mmkey.type = MLX5_MKEY_MR;
+       set_mr_fields(dev, mr, length, access_flags, iova);
+       mr->ibmr.pd = pd;
+       kvfree(in);
+       mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);
+
+       return &mr->ibmr;
+err_2:
+       kvfree(in);
+err_1:
+       kfree(mr);
+       return ERR_PTR(err);
+}
+
/*
 * If ibmr is NULL it will be allocated by reg_create.
 * Else, the given ibmr will be used.
 */
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
                                     u64 iova, int access_flags,
-                                    unsigned int page_size, bool populate)
+                                    unsigned int page_size, bool populate,
+                                    int access_mode)
{
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct mlx5_ib_mr *mr;
···
        int inlen;
        u32 *in;
        int err;
-       bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
+       bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
+                     (access_mode == MLX5_MKC_ACCESS_MODE_MTT);
+       bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);

        if (!page_size)
                return ERR_PTR(-EINVAL);
···
        }
        pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
        if (populate) {
-               if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
+               if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
                        err = -EINVAL;
                        goto err_2;
                }
···
        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
        set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
                                      populate ? pd : dev->umrc.pd);
+       /* In case a data direct flow, overwrite the pdn field by its internal kernel PD */
+       if (umem->is_dmabuf && ksm_mode)
+               MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);
+
        MLX5_SET(mkc, mkc, free, !populate);
-       MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
+       MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
        MLX5_SET(mkc, mkc, umr_en, 1);

        MLX5_SET64(mkc, mkc, len, umem->length);
        MLX5_SET(mkc, mkc, bsf_octword_size, 0);
-       MLX5_SET(mkc, mkc, translations_octword_size,
-                get_octo_len(iova, umem->length, mr->page_shift));
+       if (ksm_mode)
+               MLX5_SET(mkc, mkc, translations_octword_size,
+                        get_octo_len(iova, umem->length, mr->page_shift) * 2);
+       else
+               MLX5_SET(mkc, mkc, translations_octword_size,
+                        get_octo_len(iova, umem->length, mr->page_shift));
        MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
        if (mlx5_umem_needs_ats(dev, umem, access_flags))
                MLX5_SET(mkc, mkc, ma_translation_mode, 1);
···

        xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
        if (xlt_with_umr) {
-               mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
+               mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
+                                       MLX5_MKC_ACCESS_MODE_MTT);
        } else {
                unsigned int page_size = mlx5_umem_find_best_pgsz(
                        umem, mkc, log_page_size, 0, iova);

                mutex_lock(&dev->slow_path_mutex);
-               mr = reg_create(pd, umem, iova, access_flags, page_size, true);
+               mr = reg_create(pd, umem, iova, access_flags, page_size,
+                               true, MLX5_MKC_ACCESS_MODE_MTT);
                mutex_unlock(&dev->slow_path_mutex);
        }
        if (IS_ERR(mr)) {
···
        if (IS_ERR(odp))
                return ERR_CAST(odp);

-       mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
+       mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
+                               MLX5_MKC_ACCESS_MODE_MTT);
        if (IS_ERR(mr)) {
                ib_umem_release(&odp->umem);
                return ERR_CAST(mr);
···
        .move_notify = mlx5_ib_dmabuf_invalidate_cb,
};

-struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
-                                        u64 length, u64 virt_addr,
-                                        int fd, int access_flags,
-                                        struct uverbs_attr_bundle *attrs)
+static struct ib_mr *
+reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
+                  u64 offset, u64 length, u64 virt_addr,
+                  int fd, int access_flags, int access_mode)
{
+       bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct mlx5_ib_mr *mr = NULL;
        struct ib_umem_dmabuf *umem_dmabuf;
        int err;

-       if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
-           !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
-               return ERR_PTR(-EOPNOTSUPP);
-
-       mlx5_ib_dbg(dev,
-                   "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
-                   offset, virt_addr, length, fd, access_flags);
-
        err = mlx5r_umr_resource_init(dev);
        if (err)
                return ERR_PTR(err);

-       /* dmabuf requires xlt update via umr to work. */
-       if (!mlx5r_umr_can_load_pas(dev, length))
-               return ERR_PTR(-EINVAL);
+       if (!pinned_mode)
+               umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
+                                                offset, length, fd,
+                                                access_flags,
+                                                &mlx5_ib_dmabuf_attach_ops);
+       else
+               umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
+                               dma_device, offset, length,
+                               fd, access_flags);

-       umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
-                                        access_flags,
-                                        &mlx5_ib_dmabuf_attach_ops);
        if (IS_ERR(umem_dmabuf)) {
                mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
                            PTR_ERR(umem_dmabuf));
···
        }

        mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
-                               access_flags);
+                               access_flags, access_mode);
        if (IS_ERR(mr)) {
                ib_umem_release(&umem_dmabuf->umem);
                return ERR_CAST(mr);
···

        atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
        umem_dmabuf->private = mr;
-       err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
-       if (err)
-               goto err_dereg_mr;
+       if (!pinned_mode) {
+               err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
+               if (err)
+                       goto err_dereg_mr;
+       } else {
+               mr->data_direct = true;
+       }

        err = mlx5_ib_init_dmabuf_mr(mr);
        if (err)
···
        return &mr->ibmr;

err_dereg_mr:
-       mlx5_ib_dereg_mr(&mr->ibmr, NULL);
+       __mlx5_ib_dereg_mr(&mr->ibmr);
        return ERR_PTR(err);
+}
+
+static struct ib_mr *
+reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
+                                 u64 length, u64 virt_addr,
+                                 int fd, int access_flags)
+{
+       struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       struct mlx5_data_direct_dev *data_direct_dev;
+       struct ib_mr *crossing_mr;
+       struct ib_mr *crossed_mr;
+       int ret = 0;
+
+       /* As of HW behaviour the IOVA must be page aligned in KSM mode */
+       if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       mutex_lock(&dev->data_direct_lock);
+       data_direct_dev = dev->data_direct_dev;
+       if (!data_direct_dev) {
+               ret = -EINVAL;
+               goto end;
+       }
+
+       /* The device's 'data direct mkey' was created without RO flags to
+        * simplify things and allow for a single mkey per device.
+        * Since RO is not a must, mask it out accordingly.
+        */
+       access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
+       crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
+                                       offset, length, virt_addr, fd,
+                                       access_flags, MLX5_MKC_ACCESS_MODE_KSM);
+       if (IS_ERR(crossed_mr)) {
+               ret = PTR_ERR(crossed_mr);
+               goto end;
+       }
+
+       mutex_lock(&dev->slow_path_mutex);
+       crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
+                                                 crossed_mr->lkey);
+       mutex_unlock(&dev->slow_path_mutex);
+       if (IS_ERR(crossing_mr)) {
+               __mlx5_ib_dereg_mr(crossed_mr);
+               ret = PTR_ERR(crossing_mr);
+               goto end;
+       }
+
+       list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
+       to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
+       to_mmr(crossing_mr)->data_direct = true;
+end:
+       mutex_unlock(&dev->data_direct_lock);
+       return ret ? ERR_PTR(ret) : crossing_mr;
+}
+
+struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
+                                        u64 length, u64 virt_addr,
+                                        int fd, int access_flags,
+                                        struct uverbs_attr_bundle *attrs)
+{
+       struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       int mlx5_access_flags = 0;
+       int err;
+
+       if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
+           !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+               return ERR_PTR(-EOPNOTSUPP);
+
+       if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
+               err = uverbs_get_flags32(&mlx5_access_flags, attrs,
+                                        MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
+                                        MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
+               if (err)
+                       return ERR_PTR(err);
+       }
+
+       mlx5_ib_dbg(dev,
+                   "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
+                   offset, virt_addr, length, fd, access_flags, mlx5_access_flags);
+
+       /* dmabuf requires xlt update via umr to work. */
+       if (!mlx5r_umr_can_load_pas(dev, length))
+               return ERR_PTR(-EINVAL);
+
+       if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
+               return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
+                                                        fd, access_flags);
+
+       return reg_user_mr_dmabuf(pd, pd->device->dma_device,
+                                 offset, length, virt_addr,
+                                 fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
}

/*
···
        struct mlx5_ib_mr *mr = to_mmr(ib_mr);
        int err;

-       if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
+       if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
                return ERR_PTR(-EOPNOTSUPP);

        mlx5_ib_dbg(
···
static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
-       if (!mr->umem && mr->descs) {
+       if (!mr->umem && !mr->data_direct && mr->descs) {
                struct ib_device *device = mr->ibmr.device;
                int size = mr->max_descs * mr->desc_size;
                struct mlx5_ib_dev *dev = to_mdev(device);
···
        return ret;
}

+static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
+{
+       struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
+       struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
+       int err;
+
+       lockdep_assert_held(&dev->data_direct_lock);
+       mr->revoked = true;
+       err = mlx5r_umr_revoke_mr(mr);
+       if (WARN_ON(err))
+               return err;
+
+       ib_umem_dmabuf_revoke(umem_dmabuf);
+       return 0;
+}
+
+void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
+{
+       struct mlx5_ib_mr *mr, *next;
+
+       lockdep_assert_held(&dev->data_direct_lock);
+
+       list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
+               list_del(&mr->dd_node);
+               mlx5_ib_revoke_data_direct_mr(mr);
+       }
+}
+
static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
{
        struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
···
        return destroy_mkey(dev, mr);
}

-int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
        struct mlx5_ib_mr *mr = to_mmr(ibmr);
        struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
···

        kfree(mr);
        return 0;
+}
+
+static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
+                                        struct mlx5_ib_mr *mr)
+{
+       struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
+       int ret;
+
+       ret = __mlx5_ib_dereg_mr(&mr->ibmr);
+       if (ret)
+               return ret;
+
+       mutex_lock(&dev->data_direct_lock);
+       if (!dd_crossed_mr->revoked)
+               list_del(&dd_crossed_mr->dd_node);
+
+       ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
+       mutex_unlock(&dev->data_direct_lock);
+       return ret;
+}
+
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+       struct mlx5_ib_mr *mr = to_mmr(ibmr);
+       struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+
+       if (mr->data_direct)
+               return dereg_crossing_data_direct_mr(dev, mr);
+
+       return __mlx5_ib_dereg_mr(ibmr);
}

static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
+4 -1
drivers/infiniband/hw/mlx5/odp.c
···
                ib_umem_dmabuf_unmap_pages(umem_dmabuf);
                err = -EINVAL;
        } else {
-               err = mlx5r_umr_update_mr_pas(mr, xlt_flags);
+               if (mr->data_direct)
+                       err = mlx5r_umr_update_data_direct_ksm_pas(mr, xlt_flags);
+               else
+                       err = mlx5r_umr_update_mr_pas(mr, xlt_flags);
        }
        dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);

+63 -30
drivers/infiniband/hw/mlx5/umr.c
···
        wqe->data_seg.byte_count = cpu_to_be32(sg->length);
}

-/*
- * Send the DMA list to the HW for a normal MR using UMR.
- * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
- * flag may be used.
- */
-int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+static int
+_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd)
{
+       size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt);
        struct mlx5_ib_dev *dev = mr_to_mdev(mr);
        struct device *ddev = &dev->mdev->pdev->dev;
        struct mlx5r_umr_wqe wqe = {};
        struct ib_block_iter biter;
+       struct mlx5_ksm *cur_ksm;
        struct mlx5_mtt *cur_mtt;
        size_t orig_sg_length;
-       struct mlx5_mtt *mtt;
        size_t final_size;
+       void *curr_entry;
        struct ib_sge sg;
+       void *entry;
        u64 offset = 0;
        int err = 0;

-       if (WARN_ON(mr->umem->is_odp))
-               return -EINVAL;
-
-       mtt = mlx5r_umr_create_xlt(
-               dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
-               sizeof(*mtt), flags);
-       if (!mtt)
+       entry = mlx5r_umr_create_xlt(dev, &sg,
+                                    ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
+                                    ent_size, flags);
+       if (!entry)
                return -ENOMEM;

        orig_sg_length = sg.length;
-
        mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
        mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr,
                                          mr->page_shift);
+       if (dd) {
+               /* Use the data direct internal kernel PD */
+               MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn);
+               cur_ksm = entry;
+       } else {
+               cur_mtt = entry;
+       }
+
        mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);

-       cur_mtt = mtt;
+       curr_entry = entry;
        rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) {
-               if (cur_mtt == (void *)mtt + sg.length) {
+               if (curr_entry == entry + sg.length) {
                        dma_sync_single_for_device(ddev, sg.addr, sg.length,
                                                   DMA_TO_DEVICE);

···
                                                  DMA_TO_DEVICE);
                        offset += sg.length;
                        mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
-
-                       cur_mtt = mtt;
+                       if (dd)
+                               cur_ksm = entry;
+                       else
+                               cur_mtt = entry;
                }

-               cur_mtt->ptag =
-                       cpu_to_be64(rdma_block_iter_dma_address(&biter) |
-                                   MLX5_IB_MTT_PRESENT);
-
-               if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
-                       cur_mtt->ptag = 0;
-
-               cur_mtt++;
+               if (dd) {
+                       cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter));
+                       cur_ksm->key = cpu_to_be32(dev->ddr.mkey);
+                       cur_ksm++;
+                       curr_entry = cur_ksm;
+               } else {
+                       cur_mtt->ptag =
+                               cpu_to_be64(rdma_block_iter_dma_address(&biter) |
+                                           MLX5_IB_MTT_PRESENT);
+                       if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
+                               cur_mtt->ptag = 0;
+                       cur_mtt++;
+                       curr_entry = cur_mtt;
+               }
        }

-       final_size = (void *)cur_mtt - (void *)mtt;
+       final_size = curr_entry - entry;
        sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT);
-       memset(cur_mtt, 0, sg.length - final_size);
+       memset(curr_entry, 0, sg.length - final_size);
        mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);

        dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
···

err:
        sg.length = orig_sg_length;
-       mlx5r_umr_unmap_free_xlt(dev, mtt, &sg);
+       mlx5r_umr_unmap_free_xlt(dev, entry, &sg);
        return err;
+}
+
+int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+{
+       /* No invalidation flow is expected */
+       if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP))
+               return -EINVAL;
+
+       return _mlx5r_umr_update_mr_pas(mr, flags, true);
+}
+
+/*
+ * Send the DMA list to the HW for a normal MR using UMR.
+ * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
+ * flag may be used.
+ */
+int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
+{
+       if (WARN_ON(mr->umem->is_odp))
+               return -EINVAL;
+
+       return _mlx5r_umr_update_mr_pas(mr, flags, false);
}

static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
+1
drivers/infiniband/hw/mlx5/umr.h
···
int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
                              int access_flags);
int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags);
+int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags);
int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
                         int page_shift, int flags);

+4
include/uapi/rdma/mlx5_user_ioctl_cmds.h
···
        MLX5_IB_ATTR_CREATE_CQ_UAR_INDEX = UVERBS_ID_DRIVER_NS_WITH_UHW,
};

+enum mlx5_ib_reg_dmabuf_mr_attrs {
+       MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS = (1U << UVERBS_ID_NS_SHIFT),
+};
+
#define MLX5_IB_DW_MATCH_PARAM 0xA0

struct mlx5_ib_match_params {
+4
include/uapi/rdma/mlx5_user_ioctl_verbs.h
···
        MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x3,
};

+enum mlx5_ib_uapi_reg_dmabuf_flags {
+       MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT = 1 << 0,
+};
+
struct mlx5_ib_uapi_devx_async_cmd_hdr {
        __aligned_u64 wr_id;
        __u8 out_data[];