Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

IB/core: Improve ODP to use hmm_range_fault()

Move to use hmm_range_fault() instead of get_user_pages_remote() to improve
performance in a few aspects:

This includes:
- Dropping the need to allocate and free memory to hold its output

- No need any more to use put_page() to unpin the pages

- The logic to detect contiguous pages is done based on the returned
order, no need to run per page and evaluate.

In addition, moving to use hmm_range_fault() enables reducing page faults
in the system with its snapshot mode; this will be introduced in the next
patches from this series.

As part of this, cleanup some flows and use the required data structures
to work with hmm_range_fault().

Link: https://lore.kernel.org/r/20200930163828.1336747-2-leon@kernel.org
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

authored by

Yishai Hadas and committed by
Jason Gunthorpe
36f30e48 2ee9bf34

+130 -198
+1
drivers/infiniband/Kconfig
··· 48 48 depends on INFINIBAND_USER_MEM 49 49 select MMU_NOTIFIER 50 50 select INTERVAL_TREE 51 + select HMM_MIRROR 51 52 default y 52 53 help 53 54 On demand paging support for the InfiniBand subsystem.
+114 -168
drivers/infiniband/core/umem_odp.c
··· 40 40 #include <linux/vmalloc.h> 41 41 #include <linux/hugetlb.h> 42 42 #include <linux/interval_tree.h> 43 + #include <linux/hmm.h> 43 44 #include <linux/pagemap.h> 44 45 45 46 #include <rdma/ib_verbs.h> ··· 61 60 size_t page_size = 1UL << umem_odp->page_shift; 62 61 unsigned long start; 63 62 unsigned long end; 64 - size_t pages; 63 + size_t ndmas, npfns; 65 64 66 65 start = ALIGN_DOWN(umem_odp->umem.address, page_size); 67 66 if (check_add_overflow(umem_odp->umem.address, ··· 72 71 if (unlikely(end < page_size)) 73 72 return -EOVERFLOW; 74 73 75 - pages = (end - start) >> umem_odp->page_shift; 76 - if (!pages) 74 + ndmas = (end - start) >> umem_odp->page_shift; 75 + if (!ndmas) 77 76 return -EINVAL; 78 77 79 - umem_odp->page_list = kvcalloc( 80 - pages, sizeof(*umem_odp->page_list), GFP_KERNEL); 81 - if (!umem_odp->page_list) 78 + npfns = (end - start) >> PAGE_SHIFT; 79 + umem_odp->pfn_list = kvcalloc( 80 + npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL); 81 + if (!umem_odp->pfn_list) 82 82 return -ENOMEM; 83 83 84 84 umem_odp->dma_list = kvcalloc( 85 - pages, sizeof(*umem_odp->dma_list), GFP_KERNEL); 85 + ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL); 86 86 if (!umem_odp->dma_list) { 87 87 ret = -ENOMEM; 88 - goto out_page_list; 88 + goto out_pfn_list; 89 89 } 90 90 91 91 ret = mmu_interval_notifier_insert(&umem_odp->notifier, ··· 100 98 101 99 out_dma_list: 102 100 kvfree(umem_odp->dma_list); 103 - out_page_list: 104 - kvfree(umem_odp->page_list); 101 + out_pfn_list: 102 + kvfree(umem_odp->pfn_list); 105 103 return ret; 106 104 } 107 105 ··· 278 276 mutex_unlock(&umem_odp->umem_mutex); 279 277 mmu_interval_notifier_remove(&umem_odp->notifier); 280 278 kvfree(umem_odp->dma_list); 281 - kvfree(umem_odp->page_list); 279 + kvfree(umem_odp->pfn_list); 282 280 } 283 281 put_pid(umem_odp->tgid); 284 282 kfree(umem_odp); ··· 289 287 * Map for DMA and insert a single page into the on-demand paging page tables. 
290 288 * 291 289 * @umem: the umem to insert the page to. 292 - * @page_index: index in the umem to add the page to. 290 + * @dma_index: index in the umem to add the dma to. 293 291 * @page: the page struct to map and add. 294 292 * @access_mask: access permissions needed for this page. 295 293 * @current_seq: sequence number for synchronization with invalidations. 296 294 * the sequence number is taken from 297 295 * umem_odp->notifiers_seq. 298 296 * 299 - * The function returns -EFAULT if the DMA mapping operation fails. It returns 300 - * -EAGAIN if a concurrent invalidation prevents us from updating the page. 297 + * The function returns -EFAULT if the DMA mapping operation fails. 301 298 * 302 - * The page is released via put_page even if the operation failed. For on-demand 303 - * pinning, the page is released whenever it isn't stored in the umem. 304 299 */ 305 300 static int ib_umem_odp_map_dma_single_page( 306 301 struct ib_umem_odp *umem_odp, 307 - unsigned int page_index, 302 + unsigned int dma_index, 308 303 struct page *page, 309 - u64 access_mask, 310 - unsigned long current_seq) 304 + u64 access_mask) 311 305 { 312 306 struct ib_device *dev = umem_odp->umem.ibdev; 313 - dma_addr_t dma_addr; 314 - int ret = 0; 307 + dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index]; 315 308 316 - if (mmu_interval_check_retry(&umem_odp->notifier, current_seq)) { 317 - ret = -EAGAIN; 318 - goto out; 319 - } 320 - if (!(umem_odp->dma_list[page_index])) { 321 - dma_addr = 322 - ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift), 323 - DMA_BIDIRECTIONAL); 324 - if (ib_dma_mapping_error(dev, dma_addr)) { 325 - ret = -EFAULT; 326 - goto out; 327 - } 328 - umem_odp->dma_list[page_index] = dma_addr | access_mask; 329 - umem_odp->page_list[page_index] = page; 330 - umem_odp->npages++; 331 - } else if (umem_odp->page_list[page_index] == page) { 332 - umem_odp->dma_list[page_index] |= access_mask; 333 - } else { 309 + if (*dma_addr) { 334 310 /* 335 - * This is a race 
here where we could have done: 336 - * 337 - * CPU0 CPU1 338 - * get_user_pages() 339 - * invalidate() 340 - * page_fault() 341 - * mutex_lock(umem_mutex) 342 - * page from GUP != page in ODP 343 - * 344 - * It should be prevented by the retry test above as reading 345 - * the seq number should be reliable under the 346 - * umem_mutex. Thus something is really not working right if 347 - * things get here. 311 + * If the page is already dma mapped it means it went through 312 + * a non-invalidating trasition, like read-only to writable. 313 + * Resync the flags. 348 314 */ 349 - WARN(true, 350 - "Got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", 351 - umem_odp->page_list[page_index], page); 352 - ret = -EAGAIN; 315 + *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask; 316 + return 0; 353 317 } 354 318 355 - out: 356 - put_page(page); 357 - return ret; 319 + *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift, 320 + DMA_BIDIRECTIONAL); 321 + if (ib_dma_mapping_error(dev, *dma_addr)) { 322 + *dma_addr = 0; 323 + return -EFAULT; 324 + } 325 + umem_odp->npages++; 326 + *dma_addr |= access_mask; 327 + return 0; 358 328 } 359 329 360 330 /** 361 - * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR. 331 + * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it. 362 332 * 363 - * Pins the range of pages passed in the argument, and maps them to 364 - * DMA addresses. The DMA addresses of the mapped pages is updated in 365 - * umem_odp->dma_list. 333 + * Maps the range passed in the argument to DMA addresses. 334 + * The DMA addresses of the mapped pages is updated in umem_odp->dma_list. 335 + * Upon success the ODP MR will be locked to let caller complete its device 336 + * page table update. 366 337 * 367 338 * Returns the number of pages mapped in success, negative error code 368 339 * for failure. 
369 - * An -EAGAIN error code is returned when a concurrent mmu notifier prevents 370 - * the function from completing its task. 371 - * An -ENOENT error code indicates that userspace process is being terminated 372 - * and mm was already destroyed. 373 340 * @umem_odp: the umem to map and pin 374 341 * @user_virt: the address from which we need to map. 375 342 * @bcnt: the minimal number of bytes to pin and map. The mapping might be ··· 347 376 * the return value. 348 377 * @access_mask: bit mask of the requested access permissions for the given 349 378 * range. 350 - * @current_seq: the MMU notifiers sequance value for synchronization with 351 - * invalidations. the sequance number is read from 352 - * umem_odp->notifiers_seq before calling this function 353 379 */ 354 - int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, 355 - u64 bcnt, u64 access_mask, 356 - unsigned long current_seq) 380 + int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, 381 + u64 bcnt, u64 access_mask) 382 + __acquires(&umem_odp->umem_mutex) 357 383 { 358 384 struct task_struct *owning_process = NULL; 359 385 struct mm_struct *owning_mm = umem_odp->umem.owning_mm; 360 - struct page **local_page_list = NULL; 361 - u64 page_mask, off; 362 - int j, k, ret = 0, start_idx, npages = 0; 363 - unsigned int flags = 0, page_shift; 364 - phys_addr_t p = 0; 386 + int pfn_index, dma_index, ret = 0, start_idx; 387 + unsigned int page_shift, hmm_order, pfn_start_idx; 388 + unsigned long num_pfns, current_seq; 389 + struct hmm_range range = {}; 390 + unsigned long timeout; 365 391 366 392 if (access_mask == 0) 367 393 return -EINVAL; ··· 367 399 user_virt + bcnt > ib_umem_end(umem_odp)) 368 400 return -EFAULT; 369 401 370 - local_page_list = (struct page **)__get_free_page(GFP_KERNEL); 371 - if (!local_page_list) 372 - return -ENOMEM; 373 - 374 402 page_shift = umem_odp->page_shift; 375 - page_mask = ~(BIT(page_shift) - 1); 376 - off = user_virt & 
(~page_mask); 377 - user_virt = user_virt & page_mask; 378 - bcnt += off; /* Charge for the first page offset as well. */ 379 403 380 404 /* 381 405 * owning_process is allowed to be NULL, this means somehow the mm is ··· 380 420 goto out_put_task; 381 421 } 382 422 423 + range.notifier = &umem_odp->notifier; 424 + range.start = ALIGN_DOWN(user_virt, 1UL << page_shift); 425 + range.end = ALIGN(user_virt + bcnt, 1UL << page_shift); 426 + pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT; 427 + num_pfns = (range.end - range.start) >> PAGE_SHIFT; 428 + range.default_flags = HMM_PFN_REQ_FAULT; 429 + 383 430 if (access_mask & ODP_WRITE_ALLOWED_BIT) 384 - flags |= FOLL_WRITE; 431 + range.default_flags |= HMM_PFN_REQ_WRITE; 385 432 386 - start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift; 387 - k = start_idx; 433 + range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]); 434 + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); 388 435 389 - while (bcnt > 0) { 390 - const size_t gup_num_pages = min_t(size_t, 391 - ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE, 392 - PAGE_SIZE / sizeof(struct page *)); 436 + retry: 437 + current_seq = range.notifier_seq = 438 + mmu_interval_read_begin(&umem_odp->notifier); 393 439 394 - mmap_read_lock(owning_mm); 440 + mmap_read_lock(owning_mm); 441 + ret = hmm_range_fault(&range); 442 + mmap_read_unlock(owning_mm); 443 + if (unlikely(ret)) { 444 + if (ret == -EBUSY && !time_after(jiffies, timeout)) 445 + goto retry; 446 + goto out_put_mm; 447 + } 448 + 449 + start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift; 450 + dma_index = start_idx; 451 + 452 + mutex_lock(&umem_odp->umem_mutex); 453 + if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) { 454 + mutex_unlock(&umem_odp->umem_mutex); 455 + goto retry; 456 + } 457 + 458 + for (pfn_index = 0; pfn_index < num_pfns; 459 + pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) { 395 460 /* 396 - * Note: this might result in 
redundent page getting. We can 397 - * avoid this by checking dma_list to be 0 before calling 398 - * get_user_pages. However, this make the code much more 399 - * complex (and doesn't gain us much performance in most use 400 - * cases). 461 + * Since we asked for hmm_range_fault() to populate pages, 462 + * it shouldn't return an error entry on success. 401 463 */ 402 - npages = get_user_pages_remote(owning_mm, 403 - user_virt, gup_num_pages, 404 - flags, local_page_list, NULL, NULL); 405 - mmap_read_unlock(owning_mm); 406 - 407 - if (npages < 0) { 408 - if (npages != -EAGAIN) 409 - pr_warn("fail to get %zu user pages with error %d\n", gup_num_pages, npages); 410 - else 411 - pr_debug("fail to get %zu user pages with error %d\n", gup_num_pages, npages); 464 + WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); 465 + WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); 466 + hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]); 467 + /* If a hugepage was detected and ODP wasn't set for, the umem 468 + * page_shift will be used, the opposite case is an error. 
469 + */ 470 + if (hmm_order + PAGE_SHIFT < page_shift) { 471 + ret = -EINVAL; 472 + ibdev_dbg(umem_odp->umem.ibdev, 473 + "%s: un-expected hmm_order %d, page_shift %d\n", 474 + __func__, hmm_order, page_shift); 412 475 break; 413 476 } 414 477 415 - bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); 416 - mutex_lock(&umem_odp->umem_mutex); 417 - for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) { 418 - if (user_virt & ~page_mask) { 419 - p += PAGE_SIZE; 420 - if (page_to_phys(local_page_list[j]) != p) { 421 - ret = -EFAULT; 422 - break; 423 - } 424 - put_page(local_page_list[j]); 425 - continue; 426 - } 427 - 428 - ret = ib_umem_odp_map_dma_single_page( 429 - umem_odp, k, local_page_list[j], 430 - access_mask, current_seq); 431 - if (ret < 0) { 432 - if (ret != -EAGAIN) 433 - pr_warn("ib_umem_odp_map_dma_single_page failed with error %d\n", ret); 434 - else 435 - pr_debug("ib_umem_odp_map_dma_single_page failed with error %d\n", ret); 436 - break; 437 - } 438 - 439 - p = page_to_phys(local_page_list[j]); 440 - k++; 478 + ret = ib_umem_odp_map_dma_single_page( 479 + umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]), 480 + access_mask); 481 + if (ret < 0) { 482 + ibdev_dbg(umem_odp->umem.ibdev, 483 + "ib_umem_odp_map_dma_single_page failed with error %d\n", ret); 484 + break; 441 485 } 486 + } 487 + /* upon sucesss lock should stay on hold for the callee */ 488 + if (!ret) 489 + ret = dma_index - start_idx; 490 + else 442 491 mutex_unlock(&umem_odp->umem_mutex); 443 492 444 - if (ret < 0) { 445 - /* 446 - * Release pages, remembering that the first page 447 - * to hit an error was already released by 448 - * ib_umem_odp_map_dma_single_page(). 
449 - */ 450 - if (npages - (j + 1) > 0) 451 - release_pages(&local_page_list[j+1], 452 - npages - (j + 1)); 453 - break; 454 - } 455 - } 456 - 457 - if (ret >= 0) { 458 - if (npages < 0 && k == start_idx) 459 - ret = npages; 460 - else 461 - ret = k - start_idx; 462 - } 463 - 493 + out_put_mm: 464 494 mmput(owning_mm); 465 495 out_put_task: 466 496 if (owning_process) 467 497 put_task_struct(owning_process); 468 - free_page((unsigned long)local_page_list); 469 498 return ret; 470 499 } 471 - EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); 500 + EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock); 472 501 473 502 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, 474 503 u64 bound) 475 504 { 505 + dma_addr_t dma_addr; 506 + dma_addr_t dma; 476 507 int idx; 477 508 u64 addr; 478 509 struct ib_device *dev = umem_odp->umem.ibdev; ··· 472 521 473 522 virt = max_t(u64, virt, ib_umem_start(umem_odp)); 474 523 bound = min_t(u64, bound, ib_umem_end(umem_odp)); 475 - /* Note that during the run of this function, the 476 - * notifiers_count of the MR is > 0, preventing any racing 477 - * faults from completion. We might be racing with other 478 - * invalidations, so we must make sure we free each page only 479 - * once. 
*/ 480 524 for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { 481 525 idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; 482 - if (umem_odp->page_list[idx]) { 483 - struct page *page = umem_odp->page_list[idx]; 484 - dma_addr_t dma = umem_odp->dma_list[idx]; 485 - dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; 526 + dma = umem_odp->dma_list[idx]; 486 527 487 - WARN_ON(!dma_addr); 528 + /* The access flags guaranteed a valid DMA address in case was NULL */ 529 + if (dma) { 530 + unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT; 531 + struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); 488 532 533 + dma_addr = dma & ODP_DMA_ADDR_MASK; 489 534 ib_dma_unmap_page(dev, dma_addr, 490 535 BIT(umem_odp->page_shift), 491 536 DMA_BIDIRECTIONAL); ··· 498 551 */ 499 552 set_page_dirty(head_page); 500 553 } 501 - umem_odp->page_list[idx] = NULL; 502 554 umem_odp->dma_list[idx] = 0; 503 555 umem_odp->npages--; 504 556 }
+7 -17
drivers/infiniband/hw/mlx5/odp.c
··· 671 671 { 672 672 int page_shift, ret, np; 673 673 bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; 674 - unsigned long current_seq; 675 674 u64 access_mask; 676 675 u64 start_idx; 677 676 ··· 681 682 if (odp->umem.writable && !downgrade) 682 683 access_mask |= ODP_WRITE_ALLOWED_BIT; 683 684 684 - current_seq = mmu_interval_read_begin(&odp->notifier); 685 - 686 - np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask, 687 - current_seq); 685 + np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask); 688 686 if (np < 0) 689 687 return np; 690 688 691 - mutex_lock(&odp->umem_mutex); 692 - if (!mmu_interval_read_retry(&odp->notifier, current_seq)) { 693 - /* 694 - * No need to check whether the MTTs really belong to 695 - * this MR, since ib_umem_odp_map_dma_pages already 696 - * checks this. 697 - */ 698 - ret = mlx5_ib_update_xlt(mr, start_idx, np, 699 - page_shift, MLX5_IB_UPD_XLT_ATOMIC); 700 - } else { 701 - ret = -EAGAIN; 702 - } 689 + /* 690 + * No need to check whether the MTTs really belong to this MR, since 691 + * ib_umem_odp_map_dma_and_lock already checks this. 692 + */ 693 + ret = mlx5_ib_update_xlt(mr, start_idx, np, page_shift, 694 + MLX5_IB_UPD_XLT_ATOMIC); 703 695 mutex_unlock(&odp->umem_mutex); 704 696 705 697 if (ret < 0) {
+8 -13
include/rdma/ib_umem_odp.h
··· 14 14 struct mmu_interval_notifier notifier; 15 15 struct pid *tgid; 16 16 17 + /* An array of the pfns included in the on-demand paging umem. */ 18 + unsigned long *pfn_list; 19 + 17 20 /* 18 - * An array of the pages included in the on-demand paging umem. 19 - * Indices of pages that are currently not mapped into the device will 20 - * contain NULL. 21 - */ 22 - struct page **page_list; 23 - /* 24 - * An array of the same size as page_list, with DMA addresses mapped 25 - * for pages the pages in page_list. The lower two bits designate 26 - * access permissions. See ODP_READ_ALLOWED_BIT and 27 - * ODP_WRITE_ALLOWED_BIT. 21 + * An array with DMA addresses mapped for pfns in pfn_list. 22 + * The lower two bits designate access permissions. 23 + * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT. 28 24 */ 29 25 dma_addr_t *dma_list; 30 26 /* ··· 93 97 const struct mmu_interval_notifier_ops *ops); 94 98 void ib_umem_odp_release(struct ib_umem_odp *umem_odp); 95 99 96 - int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, 97 - u64 bcnt, u64 access_mask, 98 - unsigned long current_seq); 100 + int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 start_offset, 101 + u64 bcnt, u64 access_mask); 99 102 100 103 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, 101 104 u64 bound);