Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

RDMA/core: Convert UMEM ODP DMA mapping to caching IOVA and page linkage

Reuse the newly added DMA API to cache the IOVA and only link/unlink pages
in the fast path for the UMEM ODP flow.

Tested-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>

authored by

Leon Romanovsky and committed by
Leon Romanovsky
1efe8c06 eedd5b12

+74 -116
+22 -82
drivers/infiniband/core/umem_odp.c
··· 41 41 #include <linux/hugetlb.h> 42 42 #include <linux/interval_tree.h> 43 43 #include <linux/hmm.h> 44 + #include <linux/hmm-dma.h> 44 45 #include <linux/pagemap.h> 45 46 46 47 #include <rdma/ib_umem_odp.h> ··· 51 50 static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, 52 51 const struct mmu_interval_notifier_ops *ops) 53 52 { 53 + struct ib_device *dev = umem_odp->umem.ibdev; 54 54 int ret; 55 55 56 56 umem_odp->umem.is_odp = 1; ··· 61 59 size_t page_size = 1UL << umem_odp->page_shift; 62 60 unsigned long start; 63 61 unsigned long end; 64 - size_t ndmas, npfns; 65 62 66 63 start = ALIGN_DOWN(umem_odp->umem.address, page_size); 67 64 if (check_add_overflow(umem_odp->umem.address, ··· 71 70 if (unlikely(end < page_size)) 72 71 return -EOVERFLOW; 73 72 74 - ndmas = (end - start) >> umem_odp->page_shift; 75 - if (!ndmas) 76 - return -EINVAL; 77 - 78 - npfns = (end - start) >> PAGE_SHIFT; 79 - umem_odp->pfn_list = kvcalloc( 80 - npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL); 81 - if (!umem_odp->pfn_list) 82 - return -ENOMEM; 83 - 84 - umem_odp->dma_list = kvcalloc( 85 - ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL); 86 - if (!umem_odp->dma_list) { 87 - ret = -ENOMEM; 88 - goto out_pfn_list; 89 - } 73 + ret = hmm_dma_map_alloc(dev->dma_device, &umem_odp->map, 74 + (end - start) >> PAGE_SHIFT, 75 + 1 << umem_odp->page_shift); 76 + if (ret) 77 + return ret; 90 78 91 79 ret = mmu_interval_notifier_insert(&umem_odp->notifier, 92 80 umem_odp->umem.owning_mm, 93 81 start, end - start, ops); 94 82 if (ret) 95 - goto out_dma_list; 83 + goto out_free_map; 96 84 } 97 85 98 86 return 0; 99 87 100 - out_dma_list: 101 - kvfree(umem_odp->dma_list); 102 - out_pfn_list: 103 - kvfree(umem_odp->pfn_list); 88 + out_free_map: 89 + hmm_dma_map_free(dev->dma_device, &umem_odp->map); 104 90 return ret; 105 91 } 106 92 ··· 250 262 251 263 void ib_umem_odp_release(struct ib_umem_odp *umem_odp) 252 264 { 265 + struct ib_device *dev = umem_odp->umem.ibdev; 266 + 253 267 /* 
254 268 * Ensure that no more pages are mapped in the umem. 255 269 * ··· 264 274 ib_umem_end(umem_odp)); 265 275 mutex_unlock(&umem_odp->umem_mutex); 266 276 mmu_interval_notifier_remove(&umem_odp->notifier); 267 - kvfree(umem_odp->dma_list); 268 - kvfree(umem_odp->pfn_list); 277 + hmm_dma_map_free(dev->dma_device, &umem_odp->map); 269 278 } 270 279 put_pid(umem_odp->tgid); 271 280 kfree(umem_odp); 272 281 } 273 282 EXPORT_SYMBOL(ib_umem_odp_release); 274 283 275 - /* 276 - * Map for DMA and insert a single page into the on-demand paging page tables. 277 - * 278 - * @umem: the umem to insert the page to. 279 - * @dma_index: index in the umem to add the dma to. 280 - * @page: the page struct to map and add. 281 - * @access_mask: access permissions needed for this page. 282 - * 283 - * The function returns -EFAULT if the DMA mapping operation fails. 284 - * 285 - */ 286 - static int ib_umem_odp_map_dma_single_page( 287 - struct ib_umem_odp *umem_odp, 288 - unsigned int dma_index, 289 - struct page *page) 290 - { 291 - struct ib_device *dev = umem_odp->umem.ibdev; 292 - dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index]; 293 - 294 - *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift, 295 - DMA_BIDIRECTIONAL); 296 - if (ib_dma_mapping_error(dev, *dma_addr)) { 297 - *dma_addr = 0; 298 - return -EFAULT; 299 - } 300 - umem_odp->npages++; 301 - return 0; 302 - } 303 - 304 284 /** 305 285 * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it. 306 286 * 307 287 * Maps the range passed in the argument to DMA addresses. 308 - * The DMA addresses of the mapped pages is updated in umem_odp->dma_list. 309 288 * Upon success the ODP MR will be locked to let caller complete its device 310 289 * page table update. 
311 290 * ··· 331 372 range.default_flags |= HMM_PFN_REQ_WRITE; 332 373 } 333 374 334 - range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]); 375 + range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]); 335 376 timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); 336 377 337 378 retry: ··· 382 423 __func__, hmm_order, page_shift); 383 424 break; 384 425 } 385 - 386 - ret = ib_umem_odp_map_dma_single_page( 387 - umem_odp, dma_index, 388 - hmm_pfn_to_page(range.hmm_pfns[pfn_index])); 389 - if (ret < 0) { 390 - ibdev_dbg(umem_odp->umem.ibdev, 391 - "ib_umem_odp_map_dma_single_page failed with error %d\n", ret); 392 - break; 393 - } 394 - range.hmm_pfns[pfn_index] |= HMM_PFN_DMA_MAPPED; 395 426 } 396 427 /* upon success lock should stay on hold for the callee */ 397 428 if (!ret) ··· 401 452 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, 402 453 u64 bound) 403 454 { 404 - dma_addr_t dma; 405 - int idx; 406 - u64 addr; 407 455 struct ib_device *dev = umem_odp->umem.ibdev; 456 + u64 addr; 408 457 409 458 lockdep_assert_held(&umem_odp->umem_mutex); 410 459 411 460 virt = max_t(u64, virt, ib_umem_start(umem_odp)); 412 461 bound = min_t(u64, bound, ib_umem_end(umem_odp)); 413 462 for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { 414 - unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> 415 - PAGE_SHIFT; 416 - struct page *page = 417 - hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); 463 + u64 offset = addr - ib_umem_start(umem_odp); 464 + size_t idx = offset >> umem_odp->page_shift; 465 + unsigned long pfn = umem_odp->map.pfn_list[idx]; 418 466 419 - idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; 420 - dma = umem_odp->dma_list[idx]; 421 - 422 - if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID)) 423 - goto clear; 424 - if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_DMA_MAPPED)) 467 + if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx)) 425 468 goto clear; 426 469 427 - 
ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift), 428 - DMA_BIDIRECTIONAL); 429 - if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) { 470 + if (pfn & HMM_PFN_WRITE) { 471 + struct page *page = hmm_pfn_to_page(pfn); 430 472 struct page *head_page = compound_head(page); 431 473 /* 432 474 * set_page_dirty prefers being called with ··· 432 492 } 433 493 umem_odp->npages--; 434 494 clear: 435 - umem_odp->pfn_list[pfn_idx] &= ~HMM_PFN_FLAGS; 495 + umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS; 436 496 } 437 497 } 438 498 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
+7 -4
drivers/infiniband/hw/mlx5/mlx5_ib.h
··· 1474 1474 int __init mlx5_ib_odp_init(void); 1475 1475 void mlx5_ib_odp_cleanup(void); 1476 1476 int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev); 1477 - void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, 1478 - struct mlx5_ib_mr *mr, int flags); 1477 + int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, 1478 + struct mlx5_ib_mr *mr, int flags); 1479 1479 1480 1480 int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, 1481 1481 enum ib_uverbs_advise_mr_advice advice, ··· 1496 1496 { 1497 1497 return 0; 1498 1498 } 1499 - static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, 1500 - struct mlx5_ib_mr *mr, int flags) {} 1499 + static inline int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, 1500 + struct mlx5_ib_mr *mr, int flags) 1501 + { 1502 + return -EOPNOTSUPP; 1503 + } 1501 1504 1502 1505 static inline int 1503 1506 mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
+27 -15
drivers/infiniband/hw/mlx5/odp.c
··· 35 35 #include <linux/dma-buf.h> 36 36 #include <linux/dma-resv.h> 37 37 #include <linux/hmm.h> 38 + #include <linux/hmm-dma.h> 39 + #include <linux/pci-p2pdma.h> 38 40 39 41 #include "mlx5_ib.h" 40 42 #include "cmd.h" ··· 161 159 } 162 160 } 163 161 164 - static void populate_mtt(__be64 *pas, size_t idx, size_t nentries, 165 - struct mlx5_ib_mr *mr, int flags) 162 + static int populate_mtt(__be64 *pas, size_t start, size_t nentries, 163 + struct mlx5_ib_mr *mr, int flags) 166 164 { 167 165 struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); 168 166 bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE; 169 - unsigned long pfn; 170 - dma_addr_t pa; 167 + struct pci_p2pdma_map_state p2pdma_state = {}; 168 + struct ib_device *dev = odp->umem.ibdev; 171 169 size_t i; 172 170 173 171 if (flags & MLX5_IB_UPD_XLT_ZAP) 174 - return; 172 + return 0; 175 173 176 174 for (i = 0; i < nentries; i++) { 177 - pfn = odp->pfn_list[idx + i]; 175 + unsigned long pfn = odp->map.pfn_list[start + i]; 176 + dma_addr_t dma_addr; 177 + 178 + pfn = odp->map.pfn_list[start + i]; 178 179 if (!(pfn & HMM_PFN_VALID)) 179 180 /* ODP initialization */ 180 181 continue; 181 182 182 - pa = odp->dma_list[idx + i]; 183 - pa |= MLX5_IB_MTT_READ; 184 - if ((pfn & HMM_PFN_WRITE) && !downgrade) 185 - pa |= MLX5_IB_MTT_WRITE; 183 + dma_addr = hmm_dma_map_pfn(dev->dma_device, &odp->map, 184 + start + i, &p2pdma_state); 185 + if (ib_dma_mapping_error(dev, dma_addr)) 186 + return -EFAULT; 186 187 187 - pas[i] = cpu_to_be64(pa); 188 + dma_addr |= MLX5_IB_MTT_READ; 189 + if ((pfn & HMM_PFN_WRITE) && !downgrade) 190 + dma_addr |= MLX5_IB_MTT_WRITE; 191 + 192 + pas[i] = cpu_to_be64(dma_addr); 193 + odp->npages++; 188 194 } 195 + return 0; 189 196 } 190 197 191 - void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, 192 - struct mlx5_ib_mr *mr, int flags) 198 + int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, 199 + struct mlx5_ib_mr *mr, int flags) 193 200 { 194 201 if (flags & 
MLX5_IB_UPD_XLT_INDIRECT) { 195 202 populate_klm(xlt, idx, nentries, mr, flags); 203 + return 0; 196 204 } else { 197 - populate_mtt(xlt, idx, nentries, mr, flags); 205 + return populate_mtt(xlt, idx, nentries, mr, flags); 198 206 } 199 207 } 200 208 ··· 315 303 * estimate the cost of another UMR vs. the cost of bigger 316 304 * UMR. 317 305 */ 318 - if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) { 306 + if (umem_odp->map.pfn_list[idx] & HMM_PFN_VALID) { 319 307 if (!in_block) { 320 308 blk_start_idx = idx; 321 309 in_block = 1;
+11 -1
drivers/infiniband/hw/mlx5/umr.c
··· 840 840 size_to_map = npages * desc_size; 841 841 dma_sync_single_for_cpu(ddev, sg.addr, sg.length, 842 842 DMA_TO_DEVICE); 843 - mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); 843 + /* 844 + * npages is the maximum number of pages to map, but we 845 + * can't guarantee that all pages are actually mapped. 846 + * 847 + * For example, if page is p2p of type which is not supported 848 + * for mapping, the number of pages mapped will be less than 849 + * requested. 850 + */ 851 + err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); 852 + if (err) 853 + return err; 844 854 dma_sync_single_for_device(ddev, sg.addr, sg.length, 845 855 DMA_TO_DEVICE); 846 856 sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);
+4 -4
drivers/infiniband/sw/rxe/rxe_odp.c
··· 205 205 while (length > 0) { 206 206 u8 *src, *dest; 207 207 208 - page = hmm_pfn_to_page(umem_odp->pfn_list[idx]); 208 + page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]); 209 209 user_va = kmap_local_page(page); 210 210 if (!user_va) 211 211 return -EFAULT; ··· 289 289 290 290 idx = rxe_odp_iova_to_index(umem_odp, iova); 291 291 page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); 292 - page = hmm_pfn_to_page(umem_odp->pfn_list[idx]); 292 + page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]); 293 293 if (!page) 294 294 return RESPST_ERR_RKEY_VIOLATION; 295 295 ··· 355 355 index = rxe_odp_iova_to_index(umem_odp, iova); 356 356 page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); 357 357 358 - page = hmm_pfn_to_page(umem_odp->pfn_list[index]); 358 + page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]); 359 359 if (!page) { 360 360 mutex_unlock(&umem_odp->umem_mutex); 361 361 return -EFAULT; ··· 401 401 402 402 page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); 403 403 index = rxe_odp_iova_to_index(umem_odp, iova); 404 - page = hmm_pfn_to_page(umem_odp->pfn_list[index]); 404 + page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]); 405 405 if (!page) { 406 406 mutex_unlock(&umem_odp->umem_mutex); 407 407 return RESPST_ERR_RKEY_VIOLATION;
+3 -10
include/rdma/ib_umem_odp.h
··· 8 8 9 9 #include <rdma/ib_umem.h> 10 10 #include <rdma/ib_verbs.h> 11 - #include <linux/hmm.h> 11 + #include <linux/hmm-dma.h> 12 12 13 13 struct ib_umem_odp { 14 14 struct ib_umem umem; 15 15 struct mmu_interval_notifier notifier; 16 16 struct pid *tgid; 17 17 18 - /* An array of the pfns included in the on-demand paging umem. */ 19 - unsigned long *pfn_list; 18 + struct hmm_dma_map map; 20 19 21 20 /* 22 - * An array with DMA addresses mapped for pfns in pfn_list. 23 - * The lower two bits designate access permissions. 24 - * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT. 25 - */ 26 - dma_addr_t *dma_list; 27 - /* 28 - * The umem_mutex protects the page_list and dma_list fields of an ODP 21 + * The umem_mutex protects the page_list field of an ODP 29 22 * umem, allowing only a single thread to map/unmap pages. The mutex 30 23 * also protects access to the mmu notifier counters. 31 24 */