Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

RDMA/uverbs: Allow drivers to create a new HW object during rereg_mr

mlx5 has an ugly flow where it tries to allocate a new MR and replace the
existing MR in the same memory during rereg. This is very complicated and
buggy. Instead of trying to replace in-place inside the driver, provide
support from uverbs to change the entire HW object assigned to a handle
during rereg_mr.

Since destroying an MR is allowed to fail (e.g. if a MW is pointing at it)
and this can't be detected in advance, the algorithm creates a completely
new uobject to hold the new MR and swaps the IDR entries of the two
objects.

The old MR, now sitting in the temporary IDR entry, is then destroyed; if
that destruction fails, rereg_mr still succeeds and destruction of the old
MR is deferred to FD release. This complexity is why this flow cannot live
safely inside a driver.

Link: https://lore.kernel.org/r/20201130075839.278575-4-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
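
As a rough sketch of the sequence described above (not part of the patch: the
function name hypothetical_rereg_swap() is invented for illustration, while
rdma_assign_uobject(), rdma_alloc_commit_uobject() and uobj_put_destroy() are
the real helpers used by ib_uverbs_rereg_mr() in the diff below):

/* Illustrative condensation of the ib_uverbs_rereg_mr() flow; not in the patch. */
#include <rdma/uverbs_types.h>
#include <rdma/uverbs_std_types.h>

static void hypothetical_rereg_swap(struct ib_uobject *uobj,     /* old MR handle, write get held */
                                    struct ib_uobject *new_uobj, /* freshly allocated, not yet committed */
                                    struct ib_mr *new_mr,        /* new HW object returned by the driver */
                                    struct uverbs_attr_bundle *attrs)
{
        new_uobj->object = new_mr;

        /* Swap the IDR entries so new_uobj takes over the handle ID of uobj. */
        rdma_assign_uobject(uobj, new_uobj, attrs);

        /* Commit the new uobject under the old handle... */
        rdma_alloc_commit_uobject(new_uobj, attrs);

        /*
         * ...then destroy the old MR.  If the destroy fails, the old uobject
         * (now under a temporary ID) is simply leaked until FD release.
         */
        uobj_put_destroy(uobj);
}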

+160 -55
+51
drivers/infiniband/core/rdma_core.c
···
 	WARN_ON(old != NULL);
 }
 
+static void swap_idr_uobjects(struct ib_uobject *obj_old,
+			      struct ib_uobject *obj_new)
+{
+	struct ib_uverbs_file *ufile = obj_old->ufile;
+	void *old;
+
+	/*
+	 * New must be an object that has been allocated but not yet committed;
+	 * this moves the pre-committed state to obj_old, and new still must be
+	 * committed.
+	 */
+	old = xa_cmpxchg(&ufile->idr, obj_old->id, obj_old, XA_ZERO_ENTRY,
+			 GFP_KERNEL);
+	if (WARN_ON(old != obj_old))
+		return;
+
+	swap(obj_old->id, obj_new->id);
+
+	old = xa_cmpxchg(&ufile->idr, obj_old->id, NULL, obj_old, GFP_KERNEL);
+	WARN_ON(old != NULL);
+}
+
 static void alloc_commit_fd_uobject(struct ib_uobject *uobj)
 {
 	int fd = uobj->id;
···
 
 	/* Matches the down_read in rdma_alloc_begin_uobject */
 	up_read(&ufile->hw_destroy_rwsem);
+}
+
+/*
+ * new_uobj will be assigned to the handle currently used by to_uobj, and
+ * to_uobj will be destroyed.
+ *
+ * Upon return the caller must do:
+ *	rdma_alloc_commit_uobject(new_uobj)
+ *	uobj_put_destroy(to_uobj)
+ *
+ * to_uobj must have a write get but the put mode switches to destroy once
+ * this is called.
+ */
+void rdma_assign_uobject(struct ib_uobject *to_uobj, struct ib_uobject *new_uobj,
+			 struct uverbs_attr_bundle *attrs)
+{
+	assert_uverbs_usecnt(new_uobj, UVERBS_LOOKUP_WRITE);
+
+	if (WARN_ON(to_uobj->uapi_object != new_uobj->uapi_object ||
+		    !to_uobj->uapi_object->type_class->swap_uobjects))
+		return;
+
+	to_uobj->uapi_object->type_class->swap_uobjects(to_uobj, new_uobj);
+
+	/*
+	 * If this fails then the uobject is still completely valid (though with
+	 * a new ID) and we leak it until context close.
+	 */
+	uverbs_destroy_uobject(to_uobj, RDMA_REMOVE_DESTROY, attrs);
 }
···
 	.lookup_put = lookup_put_idr_uobject,
 	.destroy_hw = destroy_hw_idr_uobject,
 	.remove_handle = remove_handle_idr_uobject,
+	.swap_uobjects = swap_idr_uobjects,
 };
 EXPORT_SYMBOL(uverbs_idr_class);
 
+65 -20
drivers/infiniband/core/uverbs_cmd.c
···
 {
 	struct ib_uverbs_rereg_mr cmd;
 	struct ib_uverbs_rereg_mr_resp resp;
-	struct ib_pd *pd = NULL;
 	struct ib_mr *mr;
-	struct ib_pd *old_pd;
 	int ret;
 	struct ib_uobject *uobj;
+	struct ib_uobject *new_uobj;
+	struct ib_device *ib_dev;
+	struct ib_pd *orig_pd;
+	struct ib_pd *new_pd;
+	struct ib_mr *new_mr;
 
 	ret = uverbs_request(attrs, &cmd, sizeof(cmd));
 	if (ret)
···
 		goto put_uobjs;
 	}
 
+	orig_pd = mr->pd;
 	if (cmd.flags & IB_MR_REREG_PD) {
-		pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle,
-				       attrs);
-		if (!pd) {
+		new_pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle,
+					   attrs);
+		if (!new_pd) {
 			ret = -EINVAL;
 			goto put_uobjs;
 		}
+	} else {
+		new_pd = mr->pd;
 	}
 
-	old_pd = mr->pd;
-	ret = mr->device->ops.rereg_user_mr(mr, cmd.flags, cmd.start,
-					    cmd.length, cmd.hca_va,
-					    cmd.access_flags, pd,
-					    &attrs->driver_udata);
-	if (ret)
+	/*
+	 * The driver might create a new HW object as part of the rereg, we need
+	 * to have a uobject ready to hold it.
+	 */
+	new_uobj = uobj_alloc(UVERBS_OBJECT_MR, attrs, &ib_dev);
+	if (IS_ERR(new_uobj)) {
+		ret = PTR_ERR(new_uobj);
 		goto put_uobj_pd;
-
-	if (cmd.flags & IB_MR_REREG_PD) {
-		atomic_inc(&pd->usecnt);
-		mr->pd = pd;
-		atomic_dec(&old_pd->usecnt);
 	}
 
-	if (cmd.flags & IB_MR_REREG_TRANS)
-		mr->iova = cmd.hca_va;
+	new_mr = ib_dev->ops.rereg_user_mr(mr, cmd.flags, cmd.start, cmd.length,
+					   cmd.hca_va, cmd.access_flags, new_pd,
+					   &attrs->driver_udata);
+	if (IS_ERR(new_mr)) {
+		ret = PTR_ERR(new_mr);
+		goto put_new_uobj;
+	}
+	if (new_mr) {
+		new_mr->device = new_pd->device;
+		new_mr->pd = new_pd;
+		new_mr->type = IB_MR_TYPE_USER;
+		new_mr->dm = NULL;
+		new_mr->sig_attrs = NULL;
+		new_mr->uobject = uobj;
+		atomic_inc(&new_pd->usecnt);
+		new_mr->iova = cmd.hca_va;
+		new_uobj->object = new_mr;
+
+		rdma_restrack_new(&new_mr->res, RDMA_RESTRACK_MR);
+		rdma_restrack_set_name(&new_mr->res, NULL);
+		rdma_restrack_add(&new_mr->res);
+
+		/*
+		 * The new uobj for the new HW object is put into the same spot
+		 * in the IDR and the old uobj & HW object is deleted.
+		 */
+		rdma_assign_uobject(uobj, new_uobj, attrs);
+		rdma_alloc_commit_uobject(new_uobj, attrs);
+		uobj_put_destroy(uobj);
+		new_uobj = NULL;
+		uobj = NULL;
+		mr = new_mr;
+	} else {
+		if (cmd.flags & IB_MR_REREG_PD) {
+			atomic_dec(&orig_pd->usecnt);
+			mr->pd = new_pd;
+			atomic_inc(&new_pd->usecnt);
+		}
+		if (cmd.flags & IB_MR_REREG_TRANS)
+			mr->iova = cmd.hca_va;
+	}
 
 	memset(&resp, 0, sizeof(resp));
 	resp.lkey = mr->lkey;
···
 
 	ret = uverbs_response(attrs, &resp, sizeof(resp));
 
+put_new_uobj:
+	if (new_uobj)
+		uobj_alloc_abort(new_uobj, attrs);
 put_uobj_pd:
 	if (cmd.flags & IB_MR_REREG_PD)
-		uobj_put_obj_read(pd);
+		uobj_put_obj_read(new_pd);
 
 put_uobjs:
-	uobj_put_write(uobj);
+	if (uobj)
+		uobj_put_write(uobj);
 
 	return ret;
 }
+4 -3
drivers/infiniband/hw/hns/hns_roce_device.h
···
 struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				   u64 virt_addr, int access_flags,
 				   struct ib_udata *udata);
-int hns_roce_rereg_user_mr(struct ib_mr *mr, int flags, u64 start, u64 length,
-			   u64 virt_addr, int mr_access_flags, struct ib_pd *pd,
-			   struct ib_udata *udata);
+struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *mr, int flags, u64 start,
+				     u64 length, u64 virt_addr,
+				     int mr_access_flags, struct ib_pd *pd,
+				     struct ib_udata *udata);
 struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 				u32 max_num_sg);
 int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
+8 -7
drivers/infiniband/hw/hns/hns_roce_mr.c
···
 	return ret;
 }
 
-int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length,
-			   u64 virt_addr, int mr_access_flags, struct ib_pd *pd,
-			   struct ib_udata *udata)
+struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start,
+				     u64 length, u64 virt_addr,
+				     int mr_access_flags, struct ib_pd *pd,
+				     struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device);
 	struct ib_device *ib_dev = &hr_dev->ib_dev;
···
 	int ret;
 
 	if (!mr->enabled)
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
 
 	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
 	if (IS_ERR(mailbox))
-		return PTR_ERR(mailbox);
+		return ERR_CAST(mailbox);
 
 	mtpt_idx = key_to_hw_index(mr->key) & (hr_dev->caps.num_mtpts - 1);
 	ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, mtpt_idx, 0,
···
 
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
 
-	return 0;
+	return NULL;
 
 free_cmd_mbox:
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
 
-	return ret;
+	return ERR_PTR(ret);
 }
 
 int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+4 -4
drivers/infiniband/hw/mlx4/mlx4_ib.h
···
 void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count);
 int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
 			 int is_attach);
-int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
-			  u64 start, u64 length, u64 virt_addr,
-			  int mr_access_flags, struct ib_pd *pd,
-			  struct ib_udata *udata);
+struct ib_mr *mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, u64 start,
+				    u64 length, u64 virt_addr,
+				    int mr_access_flags, struct ib_pd *pd,
+				    struct ib_udata *udata);
 int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
 				    const struct ib_gid_attr *attr);
 
+8 -8
drivers/infiniband/hw/mlx4/mr.c
···
 	return ERR_PTR(err);
 }
 
-int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
-			  u64 start, u64 length, u64 virt_addr,
-			  int mr_access_flags, struct ib_pd *pd,
-			  struct ib_udata *udata)
+struct ib_mr *mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, u64 start,
+				    u64 length, u64 virt_addr,
+				    int mr_access_flags, struct ib_pd *pd,
+				    struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(mr->device);
 	struct mlx4_ib_mr *mmr = to_mmr(mr);
···
 	 * race exists.
 	 */
 	err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry);
-
 	if (err)
-		return err;
+		return ERR_PTR(err);
 
 	if (flags & IB_MR_REREG_PD) {
 		err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry,
···
 
 release_mpt_entry:
 	mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry);
-
-	return err;
+	if (err)
+		return ERR_PTR(err);
+	return NULL;
 }
 
 static int
+3 -3
drivers/infiniband/hw/mlx5/mlx5_ib.h
···
 			  int access_flags);
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
 void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr);
-int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
-			  u64 length, u64 virt_addr, int access_flags,
-			  struct ib_pd *pd, struct ib_udata *udata);
+struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
+				    u64 length, u64 virt_addr, int access_flags,
+				    struct ib_pd *pd, struct ib_udata *udata);
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
 			       u32 max_num_sg);
+8 -7
drivers/infiniband/hw/mlx5/mr.c
···
 		return err;
 }
 
-int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
-			  u64 length, u64 virt_addr, int new_access_flags,
-			  struct ib_pd *new_pd, struct ib_udata *udata)
+struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
+				    u64 length, u64 virt_addr,
+				    int new_access_flags, struct ib_pd *new_pd,
+				    struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
 	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
···
 		    start, virt_addr, length, access_flags);
 
 	if (!mr->umem)
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
 
 	if (is_odp_mr(mr))
-		return -EOPNOTSUPP;
+		return ERR_PTR(-EOPNOTSUPP);
 
 	if (flags & IB_MR_REREG_TRANS) {
 		addr = virt_addr;
···
 
 	set_mr_fields(dev, mr, len, access_flags);
 
-	return 0;
+	return NULL;
 
 err:
 	ib_umem_release(mr->umem);
 	mr->umem = NULL;
 
 	clean_mr(dev, mr);
-	return err;
+	return ERR_PTR(err);
 }
 
 static int
+4 -3
include/rdma/ib_verbs.h
···
 	struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length,
 				     u64 virt_addr, int mr_access_flags,
 				     struct ib_udata *udata);
-	int (*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start, u64 length,
-			     u64 virt_addr, int mr_access_flags,
-			     struct ib_pd *pd, struct ib_udata *udata);
+	struct ib_mr *(*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start,
+				       u64 length, u64 virt_addr,
+				       int mr_access_flags, struct ib_pd *pd,
+				       struct ib_udata *udata);
 	int (*dereg_mr)(struct ib_mr *mr, struct ib_udata *udata);
 	struct ib_mr *(*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type,
 				  u32 max_num_sg);
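
For orientation, a minimal sketch (not from this commit) of a driver implementing
the reworked ->rereg_user_mr() contract above: return ERR_PTR() on failure, NULL
when the existing MR was updated in place, or a new struct ib_mr when a fresh HW
object has to replace the old one; uverbs then fills in the generic ib_mr fields
and swaps the handle as in uverbs_cmd.c above. The foo_* helpers are hypothetical
placeholders for device-specific code.

#include <linux/err.h>
#include <rdma/ib_verbs.h>

/* Hypothetical device-specific helpers, stand-ins for real driver code. */
extern bool foo_can_modify_in_place(struct ib_mr *mr, int flags);
extern int foo_modify_hw_mr(struct ib_mr *mr, int flags, u64 start, u64 length,
                            u64 virt_addr, int access_flags, struct ib_pd *pd);
extern struct ib_mr *foo_create_hw_mr(struct ib_pd *pd, u64 start, u64 length,
                                      u64 virt_addr, int access_flags,
                                      struct ib_udata *udata);

static struct ib_mr *foo_rereg_user_mr(struct ib_mr *mr, int flags, u64 start,
                                       u64 length, u64 virt_addr,
                                       int mr_access_flags, struct ib_pd *pd,
                                       struct ib_udata *udata)
{
        struct ib_mr *new_mr;
        int err;

        if (!foo_can_modify_in_place(mr, flags)) {
                /*
                 * The HW object cannot be changed in place: return a brand new
                 * MR.  uverbs initializes the generic ib_mr fields, swaps it
                 * into the existing handle and destroys the old MR.
                 */
                new_mr = foo_create_hw_mr(pd, start, length, virt_addr,
                                          mr_access_flags, udata);
                return new_mr;  /* valid pointer on success, ERR_PTR() on failure */
        }

        err = foo_modify_hw_mr(mr, flags, start, length, virt_addr,
                               mr_access_flags, pd);
        if (err)
                return ERR_PTR(err);    /* failed, no new HW object was created */

        return NULL;    /* in-place update succeeded, keep the existing MR */
}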
+5
include/rdma/uverbs_types.h
···
 			   enum rdma_remove_reason why,
 			   struct uverbs_attr_bundle *attrs);
 	void (*remove_handle)(struct ib_uobject *uobj);
+	void (*swap_uobjects)(struct ib_uobject *obj_old,
+			      struct ib_uobject *obj_new);
 };
 
 struct uverbs_obj_type {
···
 			       bool hw_obj_valid);
 void rdma_alloc_commit_uobject(struct ib_uobject *uobj,
 			       struct uverbs_attr_bundle *attrs);
+void rdma_assign_uobject(struct ib_uobject *to_uobj,
+			 struct ib_uobject *new_uobj,
+			 struct uverbs_attr_bundle *attrs);
 
 /*
  * uverbs_uobject_get is called in order to increase the reference count on