
RDMA/umem: Store ODP access mask information in PFN

As preparation for removing dma_list, store the access mask in the
PFN list entry rather than in the dma_addr_t.

Tested-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
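
In other words, the read/write permission that used to be folded into the low bits of each dma_list entry is now taken from the HMM flags that hmm_range_fault() records in pfn_list. A minimal sketch of the new lookup, assuming the pfn_list/dma_list layout shown in the diffs below (the helper name and exact shape are illustrative, not part of this patch):

#include <linux/hmm.h>
#include <rdma/ib_umem_odp.h>

/* Illustrative helper, not kernel code: derive the permission for one
 * ODP entry from the HMM pfn flags instead of from bits stashed in the
 * DMA address. */
static bool odp_entry_writable(struct ib_umem_odp *umem_odp, size_t idx)
{
	unsigned long pfn = umem_odp->pfn_list[idx];

	/* Old scheme: dma_list[idx] carried ODP_READ_ALLOWED_BIT /
	 * ODP_WRITE_ALLOWED_BIT in its low bits.  New scheme:
	 * dma_list[idx] is a plain DMA address and the access mask
	 * lives in the HMM pfn entry. */
	return (pfn & HMM_PFN_VALID) && (pfn & HMM_PFN_WRITE);
}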

Authored and committed by Leon Romanovsky
eedd5b12 8cad4713

Total: +69 -98

drivers/infiniband/core/umem_odp.c (+42 -59)
···
 static int ib_umem_odp_map_dma_single_page(
 		struct ib_umem_odp *umem_odp,
 		unsigned int dma_index,
-		struct page *page,
-		u64 access_mask)
+		struct page *page)
 {
 	struct ib_device *dev = umem_odp->umem.ibdev;
 	dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
-
-	if (*dma_addr) {
-		/*
-		 * If the page is already dma mapped it means it went through
-		 * a non-invalidating trasition, like read-only to writable.
-		 * Resync the flags.
-		 */
-		*dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
-		return 0;
-	}
 
 	*dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
 				    DMA_BIDIRECTIONAL);
···
 		return -EFAULT;
 	}
 	umem_odp->npages++;
-	*dma_addr |= access_mask;
 	return 0;
 }
···
 	struct hmm_range range = {};
 	unsigned long timeout;
 
-	if (access_mask == 0)
-		return -EINVAL;
-
 	if (user_virt < ib_umem_start(umem_odp) ||
 	    user_virt + bcnt > ib_umem_end(umem_odp))
 		return -EFAULT;
···
 	if (fault) {
 		range.default_flags = HMM_PFN_REQ_FAULT;
 
-		if (access_mask & ODP_WRITE_ALLOWED_BIT)
+		if (access_mask & HMM_PFN_WRITE)
 			range.default_flags |= HMM_PFN_REQ_WRITE;
 	}
···
 	for (pfn_index = 0; pfn_index < num_pfns;
 	     pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
 
-		if (fault) {
-			/*
-			 * Since we asked for hmm_range_fault() to populate
-			 * pages it shouldn't return an error entry on success.
-			 */
-			WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
-			WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
-		} else {
-			if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
-				WARN_ON(umem_odp->dma_list[dma_index]);
-				continue;
-			}
-			access_mask = ODP_READ_ALLOWED_BIT;
-			if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
-				access_mask |= ODP_WRITE_ALLOWED_BIT;
-		}
+		/*
+		 * Since we asked for hmm_range_fault() to populate
+		 * pages it shouldn't return an error entry on success.
+		 */
+		WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
+		WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
+		if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
+			continue;
+
+		if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
+			continue;
 
 		hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
 		/* If a hugepage was detected and ODP wasn't set for, the umem
···
 		}
 
 		ret = ib_umem_odp_map_dma_single_page(
-			umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
-			access_mask);
+			umem_odp, dma_index,
+			hmm_pfn_to_page(range.hmm_pfns[pfn_index]));
 		if (ret < 0) {
 			ibdev_dbg(umem_odp->umem.ibdev,
 				  "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
 			break;
 		}
+		range.hmm_pfns[pfn_index] |= HMM_PFN_DMA_MAPPED;
 	}
 	/* upon success lock should stay on hold for the callee */
 	if (!ret)
···
 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 				 u64 bound)
 {
-	dma_addr_t dma_addr;
 	dma_addr_t dma;
 	int idx;
 	u64 addr;
···
 	virt = max_t(u64, virt, ib_umem_start(umem_odp));
 	bound = min_t(u64, bound, ib_umem_end(umem_odp));
 	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
+		unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >>
+					PAGE_SHIFT;
+		struct page *page =
+			hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
+
 		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 		dma = umem_odp->dma_list[idx];
 
-		/* The access flags guaranteed a valid DMA address in case was NULL */
-		if (dma) {
-			unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
-			struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
+		if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID))
+			goto clear;
+		if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_DMA_MAPPED))
+			goto clear;
 
-			dma_addr = dma & ODP_DMA_ADDR_MASK;
-			ib_dma_unmap_page(dev, dma_addr,
-					  BIT(umem_odp->page_shift),
-					  DMA_BIDIRECTIONAL);
-			if (dma & ODP_WRITE_ALLOWED_BIT) {
-				struct page *head_page = compound_head(page);
-				/*
-				 * set_page_dirty prefers being called with
-				 * the page lock. However, MMU notifiers are
-				 * called sometimes with and sometimes without
-				 * the lock. We rely on the umem_mutex instead
-				 * to prevent other mmu notifiers from
-				 * continuing and allowing the page mapping to
-				 * be removed.
-				 */
-				set_page_dirty(head_page);
-			}
-			umem_odp->dma_list[idx] = 0;
-			umem_odp->npages--;
+		ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift),
+				  DMA_BIDIRECTIONAL);
+		if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) {
+			struct page *head_page = compound_head(page);
+			/*
+			 * set_page_dirty prefers being called with
+			 * the page lock. However, MMU notifiers are
+			 * called sometimes with and sometimes without
+			 * the lock. We rely on the umem_mutex instead
+			 * to prevent other mmu notifiers from
+			 * continuing and allowing the page mapping to
+			 * be removed.
+			 */
+			set_page_dirty(head_page);
 		}
+		umem_odp->npages--;
+clear:
+		umem_odp->pfn_list[pfn_idx] &= ~HMM_PFN_FLAGS;
 	}
 }
 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
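
The dropped access_mask == 0 check above pairs with the caller-side hunks that follow: read access no longer needs an explicit bit, so a zero mask simply means a read-only mapping. A hedged caller sketch (names are assumptions modeled on the mlx5 and rxe hunks below, not kernel code):

static int odp_fault_range(struct ib_umem_odp *umem_odp, u64 user_va,
			   u64 bcnt, bool downgrade, bool fault)
{
	u64 access_mask = 0;

	/* Only write access is requested explicitly, via the generic
	 * HMM flag; the removed ODP_WRITE_ALLOWED_BIT is gone. */
	if (umem_odp->umem.writable && !downgrade)
		access_mask |= HMM_PFN_WRITE;

	return ib_umem_odp_map_dma_and_lock(umem_odp, user_va, bcnt,
					    access_mask, fault);
}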
drivers/infiniband/hw/mlx5/mlx5_ib.h (+1)
···
 #define MLX5_IB_UPD_XLT_PD	  BIT(4)
 #define MLX5_IB_UPD_XLT_ACCESS	  BIT(5)
 #define MLX5_IB_UPD_XLT_INDIRECT  BIT(6)
+#define MLX5_IB_UPD_XLT_DOWNGRADE BIT(7)
 
 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
  *
drivers/infiniband/hw/mlx5/odp.c (+19 -18)
···
 #include <linux/kernel.h>
 #include <linux/dma-buf.h>
 #include <linux/dma-resv.h>
+#include <linux/hmm.h>
 
 #include "mlx5_ib.h"
 #include "cmd.h"
···
 	}
 }
 
-static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
-{
-	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
-
-	if (umem_dma & ODP_READ_ALLOWED_BIT)
-		mtt_entry |= MLX5_IB_MTT_READ;
-	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
-		mtt_entry |= MLX5_IB_MTT_WRITE;
-
-	return mtt_entry;
-}
-
 static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
 			 struct mlx5_ib_mr *mr, int flags)
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+	bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
+	unsigned long pfn;
 	dma_addr_t pa;
 	size_t i;
 
···
 		return;
 
 	for (i = 0; i < nentries; i++) {
+		pfn = odp->pfn_list[idx + i];
+		if (!(pfn & HMM_PFN_VALID))
+			/* ODP initialization */
+			continue;
+
 		pa = odp->dma_list[idx + i];
-		pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
+		pa |= MLX5_IB_MTT_READ;
+		if ((pfn & HMM_PFN_WRITE) && !downgrade)
+			pa |= MLX5_IB_MTT_WRITE;
+
+		pas[i] = cpu_to_be64(pa);
 	}
 }
 
···
 	 * estimate the cost of another UMR vs. the cost of bigger
 	 * UMR.
 	 */
-	if (umem_odp->dma_list[idx] &
-	    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+	if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) {
 		if (!in_block) {
 			blk_start_idx = idx;
 			in_block = 1;
···
 {
 	int page_shift, ret, np;
 	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
-	u64 access_mask;
+	u64 access_mask = 0;
 	u64 start_idx;
 	bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
 	u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
···
 	if (flags & MLX5_PF_FLAGS_ENABLE)
 		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
 
+	if (flags & MLX5_PF_FLAGS_DOWNGRADE)
+		xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE;
+
 	page_shift = odp->page_shift;
 	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
-	access_mask = ODP_READ_ALLOWED_BIT;
 
 	if (odp->umem.writable && !downgrade)
-		access_mask |= ODP_WRITE_ALLOWED_BIT;
+		access_mask |= HMM_PFN_WRITE;
 
 	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
 	if (np < 0)
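
Taken together, the mlx5 hunks mean a downgrade request now travels as an XLT flag all the way into populate_mtt(), where write permission is withheld even if the HMM entry is writable. A condensed sketch of that derivation (the helper name is illustrative; the logic mirrors populate_mtt() above):

/* Illustrative only: how one MTT entry's permissions are derived
 * after this patch. */
static __be64 mtt_entry_from_odp(unsigned long pfn, dma_addr_t pa,
				 bool downgrade)
{
	u64 mtt = pa | MLX5_IB_MTT_READ;	/* read is always granted */

	/* Write comes from the HMM flag and is stripped when the XLT
	 * update was tagged MLX5_IB_UPD_XLT_DOWNGRADE. */
	if ((pfn & HMM_PFN_WRITE) && !downgrade)
		mtt |= MLX5_IB_MTT_WRITE;

	return cpu_to_be64(mtt);
}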
drivers/infiniband/sw/rxe/rxe_odp.c (+6 -8)
···
 	start = max_t(u64, ib_umem_start(umem_odp), range->start);
 	end = min_t(u64, ib_umem_end(umem_odp), range->end);
 
-	/* update umem_odp->dma_list */
+	/* update umem_odp->map.pfn_list */
 	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
 	mutex_unlock(&umem_odp->umem_mutex);
···
 {
 	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
 	bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT);
-	u64 access_mask;
+	u64 access_mask = 0;
 	int np;
 
-	access_mask = ODP_READ_ALLOWED_BIT;
 	if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY))
-		access_mask |= ODP_WRITE_ALLOWED_BIT;
+		access_mask |= HMM_PFN_WRITE;
 
 	/*
 	 * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success.
···
 	while (addr < iova + length) {
 		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 
-		if (!(umem_odp->dma_list[idx] & perm)) {
+		if (!(umem_odp->map.pfn_list[idx] & perm)) {
 			need_fault = true;
 			break;
 		}
···
 {
 	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
 	bool need_fault;
-	u64 perm;
+	u64 perm = 0;
 	int err;
 
 	if (unlikely(length < 1))
 		return -EINVAL;
 
-	perm = ODP_READ_ALLOWED_BIT;
 	if (!(flags & RXE_PAGEFAULT_RDONLY))
-		perm |= ODP_WRITE_ALLOWED_BIT;
+		perm |= HMM_PFN_WRITE;
 
 	mutex_lock(&umem_odp->umem_mutex);
 
include/rdma/ib_umem_odp.h (+1 -13)
···
 
 #include <rdma/ib_umem.h>
 #include <rdma/ib_verbs.h>
+#include <linux/hmm.h>
 
 struct ib_umem_odp {
 	struct ib_umem umem;
···
 	return (ib_umem_end(umem_odp) - ib_umem_start(umem_odp)) >>
 	       umem_odp->page_shift;
 }
-
-/*
- * The lower 2 bits of the DMA address signal the R/W permissions for
- * the entry. To upgrade the permissions, provide the appropriate
- * bitmask to the map_dma_pages function.
- *
- * Be aware that upgrading a mapped address might result in change of
- * the DMA address for the page.
- */
-#define ODP_READ_ALLOWED_BIT  (1<<0ULL)
-#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
-
-#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 