
RDMA/umem: Combine contiguous PAGE_SIZE regions in SGEs

Combine contiguous regions of PAGE_SIZE pages into a single scatter list
entry while building the scatter table for a umem. This minimizes the
number of entries in the scatter list and reduces the DMA mapping
overhead, particularly with the IOMMU.
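
For illustration (not part of the patch): once the combined SGL is DMA
mapped, consumers walk the mapped entries with for_each_sg(), where a
single entry may now cover many pages; per-page walkers such as
for_each_sg_page() continue to work. A minimal sketch, using only the
sg_head/nmap fields of struct ib_umem; the demo_ name is hypothetical:

#include <linux/scatterlist.h>
#include <linux/printk.h>
#include <rdma/ib_umem.h>

/*
 * Illustrative helper (not from the patch): dump the DMA-mapped
 * segments of a umem. With combined SGEs, sg_dma_len() can be any
 * multiple of PAGE_SIZE up to the device's max segment size, so
 * consumers must not assume one page per entry.
 */
static void demo_dump_umem_segments(struct ib_umem *umem)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
		dma_addr_t addr = sg_dma_address(sg);

		pr_info("seg %d: dma %pad len %u\n", i, &addr,
			sg_dma_len(sg));
	}
}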

Set the default max_seg_size for IB devices to 2G in the core, and do not
combine pages if the resulting segment would exceed this limit.

Also, purge npages from struct ib_umem, since we now DMA map the umem SGL
with sg_nents and the npages computation is no longer needed. Drivers
should now be using ib_umem_num_pages(), so fix the last stragglers.
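
For reference, a minimal sketch of the kind of count ib_umem_num_pages()
provides, assuming it is derived from the umem's user address and length
(see include/rdma/ib_umem.h for the real helper); the demo_ prefix marks
this as a hypothetical stand-in:

#include <linux/mm.h>
#include <rdma/ib_umem.h>

/*
 * Hypothetical stand-in for ib_umem_num_pages(): number of PAGE_SIZE
 * pages spanned by [address, address + length), independent of how
 * many scatterlist entries those pages were combined into.
 */
static inline size_t demo_umem_num_pages(struct ib_umem *umem)
{
	unsigned long start = umem->address & PAGE_MASK;
	unsigned long end = ALIGN(umem->address + umem->length, PAGE_SIZE);

	return (end - start) >> PAGE_SHIFT;
}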

Move npages tracking to ib_umem_odp as ODP drivers still need it.

Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
Reviewed-by: Michael J. Ruhl <michael.j.ruhl@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Acked-by: Adit Ranadive <aditr@vmware.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
Tested-by: Gal Pressman <galpress@amazon.com>
Tested-by: Selvin Xavier <selvin.xavier@broadcom.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>

Authored by Shiraz Saleem, committed by Jason Gunthorpe
Commit d10bcf94 (parent c7252a65)

7 files changed, 95 insertions(+), 29 deletions(-)

drivers/infiniband/core/device.c  (+3)
···
 		WARN_ON_ONCE(!parent);
 		device->dma_device = parent;
 	}
+	/* Setup default max segment size for all IB devices */
+	dma_set_max_seg_size(device->dma_device, SZ_2G);
+
 }
 
 /*
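
The 2G default can be overridden per device through the regular DMA API.
A sketch (hypothetical device and limit, not part of this patch) of a
driver lowering the cap so that umem SGEs are never combined beyond what
its DMA engine supports:

#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/sizes.h>

/*
 * Hypothetical example: restrict combined segments to 1 MB for a
 * device that cannot handle larger DMA segments. ib_umem_get() reads
 * the limit back with dma_get_max_seg_size() and stops merging pages
 * once a segment would exceed it.
 */
static int demo_limit_seg_size(struct device *dma_dev)
{
	/* Fails with -EIO if the device has no dma_parms set up. */
	return dma_set_max_seg_size(dma_dev, SZ_1M);
}
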
drivers/infiniband/core/umem.c  (+81 -20)
···
 #include <linux/export.h>
 #include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
 
-
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
-	struct scatterlist *sg;
+	struct sg_page_iter sg_iter;
 	struct page *page;
-	int i;
 
 	if (umem->nmap > 0)
-		ib_dma_unmap_sg(dev, umem->sg_head.sgl,
-				umem->npages,
+		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
 				DMA_BIDIRECTIONAL);
 
-	for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
-
-		page = sg_page(sg);
+	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
+		page = sg_page_iter_page(&sg_iter);
 		if (!PageDirty(page) && umem->writable && dirty)
 			set_page_dirty_lock(page);
 		put_page(page);
 	}
 
 	sg_free_table(&umem->sg_head);
+}
+
+/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
+ *
+ * sg: current scatterlist entry
+ * page_list: array of npage struct page pointers
+ * npages: number of pages in page_list
+ * max_seg_sz: maximum segment size in bytes
+ * nents: [out] number of entries in the scatterlist
+ *
+ * Return new end of scatterlist
+ */
+static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
+						struct page **page_list,
+						unsigned long npages,
+						unsigned int max_seg_sz,
+						int *nents)
+{
+	unsigned long first_pfn;
+	unsigned long i = 0;
+	bool update_cur_sg = false;
+	bool first = !sg_page(sg);
+
+	/* Check if new page_list is contiguous with end of previous page_list.
+	 * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
+	 */
+	if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
+		       page_to_pfn(page_list[0])))
+		update_cur_sg = true;
+
+	while (i != npages) {
+		unsigned long len;
+		struct page *first_page = page_list[i];
+
+		first_pfn = page_to_pfn(first_page);
+
+		/* Compute the number of contiguous pages we have starting
+		 * at i
+		 */
+		for (len = 0; i != npages &&
+			      first_pfn + len == page_to_pfn(page_list[i]);
+		     len++)
+			i++;
+
+		/* Squash N contiguous pages from page_list into current sge */
+		if (update_cur_sg &&
+		    ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT))) {
+			sg_set_page(sg, sg_page(sg),
+				    sg->length + (len << PAGE_SHIFT), 0);
+			update_cur_sg = false;
+			continue;
+		}
+
+		/* Squash N contiguous pages into next sge or first sge */
+		if (!first)
+			sg = sg_next(sg);
+
+		(*nents)++;
+		sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
+		first = false;
+	}
+
+	return sg;
 }
 
 /**
···
 	int ret;
 	int i;
 	unsigned long dma_attrs = 0;
-	struct scatterlist *sg, *sg_list_start;
+	struct scatterlist *sg;
 	unsigned int gup_flags = FOLL_WRITE;
 
 	if (!udata)
···
 	if (!umem->writable)
 		gup_flags |= FOLL_FORCE;
 
-	sg_list_start = umem->sg_head.sgl;
+	sg = umem->sg_head.sgl;
 
 	while (npages) {
 		down_read(&mm->mmap_sem);
···
 			goto umem_release;
 		}
 
-		umem->npages += ret;
 		cur_base += ret * PAGE_SIZE;
 		npages -= ret;
+
+		sg = ib_umem_add_sg_table(sg, page_list, ret,
+			dma_get_max_seg_size(context->device->dma_device),
+			&umem->sg_nents);
 
 		/* Continue to hold the mmap_sem as vma_list access
 		 * needs to be protected.
 		 */
-		for_each_sg(sg_list_start, sg, ret, i) {
+		for (i = 0; i < ret && umem->hugetlb; i++) {
 			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
 				umem->hugetlb = 0;
-
-			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
 		}
-		up_read(&mm->mmap_sem);
 
-		/* preparing for next loop */
-		sg_list_start = sg;
+		up_read(&mm->mmap_sem);
 	}
+
+	sg_mark_end(sg);
 
 	umem->nmap = ib_dma_map_sg_attrs(context->device,
 				  umem->sg_head.sgl,
-				  umem->npages,
+				  umem->sg_nents,
 				  DMA_BIDIRECTIONAL,
 				  dma_attrs);
···
 		return -EINVAL;
 	}
 
-	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length,
-				 offset + ib_umem_offset(umem));
+	ret = sg_pcopy_to_buffer(umem->sg_head.sgl, ib_umem_num_pages(umem),
+				 dst, length, offset + ib_umem_offset(umem));
 
 	if (ret < 0)
 		return ret;

drivers/infiniband/core/umem_odp.c  (+2 -2)
···
 		}
 		umem_odp->dma_list[page_index] = dma_addr | access_mask;
 		umem_odp->page_list[page_index] = page;
-		umem->npages++;
+		umem_odp->npages++;
 	} else if (umem_odp->page_list[page_index] == page) {
 		umem_odp->dma_list[page_index] |= access_mask;
 	} else {
···
 			}
 			umem_odp->page_list[idx] = NULL;
 			umem_odp->dma_list[idx] = 0;
-			umem->npages--;
+			umem_odp->npages--;
 		}
 	}
 	mutex_unlock(&umem_odp->umem_mutex);

drivers/infiniband/hw/mlx5/odp.c  (+1 -1)
···
 
 	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
-	if (unlikely(!umem->npages && mr->parent &&
+	if (unlikely(!umem_odp->npages && mr->parent &&
 		     !umem_odp->dying)) {
 		WRITE_ONCE(umem_odp->dying, 1);
 		atomic_inc(&mr->parent->num_leaf_free);

drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c  (+6 -5)
···
 	union pvrdma_cmd_resp rsp;
 	struct pvrdma_cmd_create_mr *cmd = &req.create_mr;
 	struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp;
-	int ret;
+	int ret, npages;
 
 	if (length == 0 || length > dev->dsr->caps.max_mr_size) {
 		dev_warn(&dev->pdev->dev, "invalid mem region length\n");
···
 		return ERR_CAST(umem);
 	}
 
-	if (umem->npages < 0 || umem->npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
+	npages = ib_umem_num_pages(umem);
+	if (npages < 0 || npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
 		dev_warn(&dev->pdev->dev, "overflow %d pages in mem region\n",
-			 umem->npages);
+			 npages);
 		ret = -EINVAL;
 		goto err_umem;
 	}
···
 	mr->mmr.size = length;
 	mr->umem = umem;
 
-	ret = pvrdma_page_dir_init(dev, &mr->pdir, umem->npages, false);
+	ret = pvrdma_page_dir_init(dev, &mr->pdir, npages, false);
 	if (ret) {
 		dev_warn(&dev->pdev->dev,
 			 "could not allocate page directory\n");
···
 	cmd->length = length;
 	cmd->pd_handle = to_vpd(pd)->pd_handle;
 	cmd->access_flags = access_flags;
-	cmd->nchunks = umem->npages;
+	cmd->nchunks = npages;
 	cmd->pdir_dma = mr->pdir.dir_dma;
 
 	ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP);

include/rdma/ib_umem.h  (+1 -1)
···
 	struct work_struct	work;
 	struct sg_table sg_head;
 	int             nmap;
-	int             npages;
+	unsigned int    sg_nents;
 };
 
 /* Returns the offset of the umem start relative to the first page. */

include/rdma/ib_umem_odp.h  (+1)
···
 
 	int notifiers_seq;
 	int notifiers_count;
+	int npages;
 
 	/* Tree tracking */
 	struct umem_odp_node	interval_tree;