Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

RDMA/hns: Support direct wqe of userspace

The current write wqe mechanism is to write the wqe to DDR first, and then
notify the hardware through the doorbell to read the data. Direct wqe is a
mechanism to fill the wqe directly into the hardware. In the case of light
load, the wqe will be filled into the PCIe BAR space of the hardware; this
removes one memory access operation and therefore reduces the latency. SIMD
instructions allow the CPU to write 512 bits at one time to device
memory, so they can be used for posting a direct wqe.

Add direct wqe enable switch and address mapping.

Link: https://lore.kernel.org/r/20211207124901.42123-2-liangwenpeng@huawei.com
Signed-off-by: Yixing Liu <liuyixing1@huawei.com>
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

authored by

Yixing Liu and committed by
Jason Gunthorpe
0045e0d3 b1a4da64

+94 -12
+4 -4
drivers/infiniband/hw/hns/hns_roce_device.h
··· 182 182 HNS_ROCE_CAP_FLAG_FRMR = BIT(8), 183 183 HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL = BIT(9), 184 184 HNS_ROCE_CAP_FLAG_ATOMIC = BIT(10), 185 + HNS_ROCE_CAP_FLAG_DIRECT_WQE = BIT(12), 185 186 HNS_ROCE_CAP_FLAG_SDI_MODE = BIT(14), 186 187 HNS_ROCE_CAP_FLAG_STASH = BIT(17), 187 188 }; ··· 229 228 enum hns_roce_mmap_type { 230 229 HNS_ROCE_MMAP_TYPE_DB = 1, 231 230 HNS_ROCE_MMAP_TYPE_TPTR, 231 + HNS_ROCE_MMAP_TYPE_DWQE, 232 232 }; 233 233 234 234 struct hns_user_mmap_entry { ··· 629 627 u32 queue_num; 630 628 }; 631 629 632 - enum { 633 - HNS_ROCE_QP_CAP_DIRECT_WQE = BIT(5), 634 - }; 635 - 636 630 struct hns_roce_qp { 637 631 struct ib_qp ibqp; 638 632 struct hns_roce_wq rq; ··· 673 675 struct list_head node; /* all qps are on a list */ 674 676 struct list_head rq_node; /* all recv qps are on a list */ 675 677 struct list_head sq_node; /* all send qps are on a list */ 678 + struct hns_user_mmap_entry *dwqe_mmap_entry; 676 679 }; 677 680 678 681 struct hns_roce_ib_iboe { ··· 1009 1010 u32 func_num; 1010 1011 u32 is_vf; 1011 1012 u32 cong_algo_tmpl_id; 1013 + u64 dwqe_page; 1012 1014 }; 1013 1015 1014 1016 static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev)
+2 -1
drivers/infiniband/hw/hns/hns_roce_hw_v2.c
··· 1989 1989 caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM; 1990 1990 1991 1991 if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { 1992 - caps->flags |= HNS_ROCE_CAP_FLAG_STASH; 1992 + caps->flags |= HNS_ROCE_CAP_FLAG_STASH | 1993 + HNS_ROCE_CAP_FLAG_DIRECT_WQE; 1993 1994 caps->max_sq_inline = HNS_ROCE_V3_MAX_SQ_INLINE; 1994 1995 } else { 1995 1996 caps->max_sq_inline = HNS_ROCE_V2_MAX_SQ_INLINE;
+30 -6
drivers/infiniband/hw/hns/hns_roce_main.c
··· 310 310 entry->address = address; 311 311 entry->mmap_type = mmap_type; 312 312 313 - ret = rdma_user_mmap_entry_insert_exact( 314 - ucontext, &entry->rdma_entry, length, 315 - mmap_type == HNS_ROCE_MMAP_TYPE_DB ? 0 : 1); 313 + switch (mmap_type) { 314 + case HNS_ROCE_MMAP_TYPE_DB: 315 + ret = rdma_user_mmap_entry_insert_exact( 316 + ucontext, &entry->rdma_entry, length, 0); 317 + break; 318 + case HNS_ROCE_MMAP_TYPE_TPTR: 319 + ret = rdma_user_mmap_entry_insert_exact( 320 + ucontext, &entry->rdma_entry, length, 1); 321 + break; 322 + case HNS_ROCE_MMAP_TYPE_DWQE: 323 + ret = rdma_user_mmap_entry_insert_range( 324 + ucontext, &entry->rdma_entry, length, 2, 325 + U32_MAX); 326 + break; 327 + default: 328 + ret = -EINVAL; 329 + break; 330 + } 331 + 316 332 if (ret) { 317 333 kfree(entry); 318 334 return NULL; ··· 455 439 456 440 entry = to_hns_mmap(rdma_entry); 457 441 pfn = entry->address >> PAGE_SHIFT; 458 - prot = vma->vm_page_prot; 459 442 460 - if (entry->mmap_type != HNS_ROCE_MMAP_TYPE_TPTR) 461 - prot = pgprot_device(prot); 443 + switch (entry->mmap_type) { 444 + case HNS_ROCE_MMAP_TYPE_DB: 445 + case HNS_ROCE_MMAP_TYPE_DWQE: 446 + prot = pgprot_device(vma->vm_page_prot); 447 + break; 448 + case HNS_ROCE_MMAP_TYPE_TPTR: 449 + prot = vma->vm_page_prot; 450 + break; 451 + default: 452 + return -EINVAL; 453 + } 462 454 463 455 ret = rdma_user_mmap_io(uctx, vma, pfn, rdma_entry->npages * PAGE_SIZE, 464 456 prot, rdma_entry);
+3
drivers/infiniband/hw/hns/hns_roce_pd.c
··· 115 115 } else { 116 116 uar->pfn = ((pci_resource_start(hr_dev->pci_dev, 2)) 117 117 >> PAGE_SHIFT); 118 + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DIRECT_WQE) 119 + hr_dev->dwqe_page = 120 + pci_resource_start(hr_dev->pci_dev, 4); 118 121 } 119 122 120 123 return 0;
+53 -1
drivers/infiniband/hw/hns/hns_roce_qp.c
··· 379 379 return ret; 380 380 } 381 381 382 + static void qp_user_mmap_entry_remove(struct hns_roce_qp *hr_qp) 383 + { 384 + rdma_user_mmap_entry_remove(&hr_qp->dwqe_mmap_entry->rdma_entry); 385 + } 386 + 382 387 void hns_roce_qp_remove(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) 383 388 { 384 389 struct xarray *xa = &hr_dev->qp_table_xa; ··· 785 780 goto err_inline; 786 781 } 787 782 783 + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_DIRECT_WQE) 784 + hr_qp->en_flags |= HNS_ROCE_QP_CAP_DIRECT_WQE; 785 + 788 786 return 0; 787 + 789 788 err_inline: 790 789 free_rq_inline_buf(hr_qp); 791 790 ··· 829 820 { 830 821 return ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_RECORD_DB) && 831 822 hns_roce_qp_has_rq(init_attr)); 823 + } 824 + 825 + static int qp_mmap_entry(struct hns_roce_qp *hr_qp, 826 + struct hns_roce_dev *hr_dev, 827 + struct ib_udata *udata, 828 + struct hns_roce_ib_create_qp_resp *resp) 829 + { 830 + struct hns_roce_ucontext *uctx = 831 + rdma_udata_to_drv_context(udata, 832 + struct hns_roce_ucontext, ibucontext); 833 + struct rdma_user_mmap_entry *rdma_entry; 834 + u64 address; 835 + 836 + address = hr_dev->dwqe_page + hr_qp->qpn * HNS_ROCE_DWQE_SIZE; 837 + 838 + hr_qp->dwqe_mmap_entry = 839 + hns_roce_user_mmap_entry_insert(&uctx->ibucontext, address, 840 + HNS_ROCE_DWQE_SIZE, 841 + HNS_ROCE_MMAP_TYPE_DWQE); 842 + 843 + if (!hr_qp->dwqe_mmap_entry) { 844 + ibdev_err(&hr_dev->ib_dev, "failed to get dwqe mmap entry.\n"); 845 + return -ENOMEM; 846 + } 847 + 848 + rdma_entry = &hr_qp->dwqe_mmap_entry->rdma_entry; 849 + resp->dwqe_mmap_key = rdma_user_mmap_get_offset(rdma_entry); 850 + 851 + return 0; 832 852 } 833 853 834 854 static int alloc_user_qp_db(struct hns_roce_dev *hr_dev, ··· 947 909 hr_qp->en_flags |= HNS_ROCE_QP_CAP_OWNER_DB; 948 910 949 911 if (udata) { 912 + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DIRECT_WQE) { 913 + ret = qp_mmap_entry(hr_qp, hr_dev, udata, resp); 914 + if (ret) 915 + return ret; 916 + } 917 + 950 918 ret = 
alloc_user_qp_db(hr_dev, hr_qp, init_attr, udata, ucmd, 951 919 resp); 952 920 if (ret) 953 - return ret; 921 + goto err_remove_qp; 954 922 } else { 955 923 ret = alloc_kernel_qp_db(hr_dev, hr_qp, init_attr); 956 924 if (ret) ··· 964 920 } 965 921 966 922 return 0; 923 + 924 + err_remove_qp: 925 + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DIRECT_WQE) 926 + qp_user_mmap_entry_remove(hr_qp); 927 + 928 + return ret; 967 929 } 968 930 969 931 static void free_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, ··· 983 933 hns_roce_db_unmap_user(uctx, &hr_qp->rdb); 984 934 if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB) 985 935 hns_roce_db_unmap_user(uctx, &hr_qp->sdb); 936 + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_DIRECT_WQE) 937 + qp_user_mmap_entry_remove(hr_qp); 986 938 } else { 987 939 if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB) 988 940 hns_roce_free_db(hr_dev, &hr_qp->rdb);
+2
include/uapi/rdma/hns-abi.h
··· 77 77 HNS_ROCE_QP_CAP_RQ_RECORD_DB = 1 << 0, 78 78 HNS_ROCE_QP_CAP_SQ_RECORD_DB = 1 << 1, 79 79 HNS_ROCE_QP_CAP_OWNER_DB = 1 << 2, 80 + HNS_ROCE_QP_CAP_DIRECT_WQE = 1 << 5, 80 81 }; 81 82 82 83 struct hns_roce_ib_create_qp_resp { 83 84 __aligned_u64 cap_flags; 85 + __aligned_u64 dwqe_mmap_key; 84 86 }; 85 87 86 88 struct hns_roce_ib_alloc_ucontext_resp {