Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

IB/core: Enable ODP sync without faulting

Enable ODP sync without faulting; this improves performance by reducing
the number of page faults in the system.

The gain from this option is that the device page table can be aligned
with the presented pages in the CPU page table without causing page
faults.

As a result, the data-path overhead — where the hardware triggers a
fault that ends up calling into the driver to bring in the pages — is
eliminated.

Link: https://lore.kernel.org/r/20200930163828.1336747-3-leon@kernel.org
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

authored by

Yishai Hadas and committed by
Jason Gunthorpe
8bfafde0 36f30e48

+27 -12
+25 -10
drivers/infiniband/core/umem_odp.c
··· 347 347 * the return value. 348 348 * @access_mask: bit mask of the requested access permissions for the given 349 349 * range. 350 + * @fault: is faulting required for the given range 350 351 */ 351 352 int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, 352 - u64 bcnt, u64 access_mask) 353 + u64 bcnt, u64 access_mask, bool fault) 353 354 __acquires(&umem_odp->umem_mutex) 354 355 { 355 356 struct task_struct *owning_process = NULL; ··· 386 385 range.end = ALIGN(user_virt + bcnt, 1UL << page_shift); 387 386 pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT; 388 387 num_pfns = (range.end - range.start) >> PAGE_SHIFT; 389 - range.default_flags = HMM_PFN_REQ_FAULT; 388 + if (fault) { 389 + range.default_flags = HMM_PFN_REQ_FAULT; 390 390 391 - if (access_mask & ODP_WRITE_ALLOWED_BIT) 392 - range.default_flags |= HMM_PFN_REQ_WRITE; 391 + if (access_mask & ODP_WRITE_ALLOWED_BIT) 392 + range.default_flags |= HMM_PFN_REQ_WRITE; 393 + } 393 394 394 395 range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]); 395 396 timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); ··· 420 417 421 418 for (pfn_index = 0; pfn_index < num_pfns; 422 419 pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) { 423 - /* 424 - * Since we asked for hmm_range_fault() to populate pages, 425 - * it shouldn't return an error entry on success. 426 - */ 427 - WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); 428 - WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); 420 + 421 + if (fault) { 422 + /* 423 + * Since we asked for hmm_range_fault() to populate 424 + * pages it shouldn't return an error entry on success. 
425 + */ 426 + WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); 427 + WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); 428 + } else { 429 + if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) { 430 + WARN_ON(umem_odp->dma_list[dma_index]); 431 + continue; 432 + } 433 + access_mask = ODP_READ_ALLOWED_BIT; 434 + if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE) 435 + access_mask |= ODP_WRITE_ALLOWED_BIT; 436 + } 437 + 429 438 hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]); 430 439 /* If a hugepage was detected and ODP wasn't set for, the umem 431 440 * page_shift will be used, the opposite case is an error.
+1 -1
drivers/infiniband/hw/mlx5/odp.c
··· 681 681 if (odp->umem.writable && !downgrade) 682 682 access_mask |= ODP_WRITE_ALLOWED_BIT; 683 683 684 - np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask); 684 + np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, true); 685 685 if (np < 0) 686 686 return np; 687 687
+1 -1
include/rdma/ib_umem_odp.h
··· 94 94 void ib_umem_odp_release(struct ib_umem_odp *umem_odp); 95 95 96 96 int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 start_offset, 97 - u64 bcnt, u64 access_mask); 97 + u64 bcnt, u64 access_mask, bool fault); 98 98 99 99 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, 100 100 u64 bound);