Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull hmm updates from Jason Gunthorpe:
"This is more cleanup and consolidation of the hmm APIs and the very
strongly related mmu_notifier interfaces. Many places across the tree
using these interfaces are touched in the process. Beyond that, a
cleanup of the page walker API and a few memremap-related changes
round out the series:

- General improvement of hmm_range_fault() and related APIs, more
documentation, bug fixes from testing, API simplification &
consolidation, and unused API removal

- Simplify the hmm related kconfigs to HMM_MIRROR and DEVICE_PRIVATE,
and make them internal kconfig selects

- Hoist a lot of code related to mmu notifier attachment out of
drivers by using a refcount get/put attachment idiom and remove the
convoluted mmu_notifier_unregister_no_release() and related APIs.

- General API improvement for the migrate_vma API and revision of its
only user in nouveau

- Annotate mmu_notifiers with lockdep and sleeping region debugging

Two series unrelated to HMM or mmu_notifiers came along due to
dependencies:

- Allow pagemap's memremap_pages family of APIs to work without
providing a struct device

- Make walk_page_range() and related use a constant structure for
function pointers"

* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (75 commits)
libnvdimm: Enable unit test infrastructure compile checks
mm, notifier: Catch sleeping/blocking for !blockable
kernel.h: Add non_block_start/end()
drm/radeon: guard against calling an unpaired radeon_mn_unregister()
csky: add missing brackets in a macro for tlb.h
pagewalk: use lockdep_assert_held for locking validation
pagewalk: separate function pointers from iterator data
mm: split out a new pagewalk.h header from mm.h
mm/mmu_notifiers: annotate with might_sleep()
mm/mmu_notifiers: prime lockdep
mm/mmu_notifiers: add a lockdep map for invalidate_range_start/end
mm/mmu_notifiers: remove the __mmu_notifier_invalidate_range_start/end exports
mm/hmm: hmm_range_fault() infinite loop
mm/hmm: hmm_range_fault() NULL pointer bug
mm/hmm: fix hmm_range_fault()'s handling of swapped out pages
mm/mmu_notifiers: remove unregister_no_release
RDMA/odp: remove ib_ucontext from ib_umem
RDMA/odp: use mmu_notifier_get/put for 'struct ib_ucontext_per_mm'
RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr
RDMA/mlx5: Use ib_umem_start instead of umem.address
...

+1715 -2215
+11 -62
Documentation/vm/hmm.rst
···
192 192 the driver callback returns.
193 193 
194 194 When the device driver wants to populate a range of virtual addresses, it can
195 - use either::
195 + use::
196 196 
197 -   long hmm_range_snapshot(struct hmm_range *range);
198 -   long hmm_range_fault(struct hmm_range *range, bool block);
197 +   long hmm_range_fault(struct hmm_range *range, unsigned int flags);
199 198 
200 - The first one (hmm_range_snapshot()) will only fetch present CPU page table
199 + With the HMM_RANGE_SNAPSHOT flag, it will only fetch present CPU page table
201 200 entries and will not trigger a page fault on missing or non-present entries.
202 - The second one does trigger a page fault on missing or read-only entries if
203 - write access is requested (see below). Page faults use the generic mm page
201 + Without that flag, it does trigger a page fault on missing or read-only entries
202 + if write access is requested (see below). Page faults use the generic mm page
204 203 fault code path just like a CPU page fault.
205 204 
206 205 Both functions copy CPU page table entries into their pfns array argument. Each
···
222 223       range.flags = ...;
223 224       range.values = ...;
224 225       range.pfn_shift = ...;
225 -       hmm_range_register(&range);
226 +       hmm_range_register(&range, mirror);
226 227 
227 228       /*
228 229        * Just wait for range to be valid, safe to ignore return value as we
229 -        * will use the return value of hmm_range_snapshot() below under the
230 +        * will use the return value of hmm_range_fault() below under the
230 231        * mmap_sem to ascertain the validity of the range.
231 232        */
232 233       hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
233 234 
234 235  again:
235 236       down_read(&mm->mmap_sem);
236 -      ret = hmm_range_snapshot(&range);
237 +      ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT);
237 238       if (ret) {
238 239           up_read(&mm->mmap_sem);
239 240           if (ret == -EBUSY) {
240 241             /*
241 242              * No need to check hmm_range_wait_until_valid() return value
242 -             * on retry we will get proper error with hmm_range_snapshot()
243 +             * on retry we will get proper error with hmm_range_fault()
243 244              */
244 245             hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
245 246             goto again;
···
339 340 ===================================
340 341 
341 342 Because the CPU cannot access device memory, migration must use the device DMA
342 - engine to perform copy from and to device memory. For this we need a new
343 - migration helper::
344 - 
345 -   int migrate_vma(const struct migrate_vma_ops *ops,
346 -                   struct vm_area_struct *vma,
347 -                   unsigned long mentries,
348 -                   unsigned long start,
349 -                   unsigned long end,
350 -                   unsigned long *src,
351 -                   unsigned long *dst,
352 -                   void *private);
353 - 
354 - Unlike other migration functions it works on a range of virtual address, there
355 - are two reasons for that. First, device DMA copy has a high setup overhead cost
356 - and thus batching multiple pages is needed as otherwise the migration overhead
357 - makes the whole exercise pointless. The second reason is because the
358 - migration might be for a range of addresses the device is actively accessing.
359 - 
360 - The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy())
361 - controls destination memory allocation and copy operation. Second one is there
362 - to allow the device driver to perform cleanup operations after migration::
363 - 
364 -   struct migrate_vma_ops {
365 -       void (*alloc_and_copy)(struct vm_area_struct *vma,
366 -                              const unsigned long *src,
367 -                              unsigned long *dst,
368 -                              unsigned long start,
369 -                              unsigned long end,
370 -                              void *private);
371 -       void (*finalize_and_map)(struct vm_area_struct *vma,
372 -                                const unsigned long *src,
373 -                                const unsigned long *dst,
374 -                                unsigned long start,
375 -                                unsigned long end,
376 -                                void *private);
377 -   };
378 - 
379 - It is important to stress that these migration helpers allow for holes in the
380 - virtual address range. Some pages in the range might not be migrated for all
381 - the usual reasons (page is pinned, page is locked, ...). This helper does not
382 - fail but just skips over those pages.
383 - 
384 - The alloc_and_copy() might decide to not migrate all pages in the
385 - range (for reasons under the callback control). For those, the callback just
386 - has to leave the corresponding dst entry empty.
387 - 
388 - Finally, the migration of the struct page might fail (for file backed page) for
389 - various reasons (failure to freeze reference, or update page cache, ...). If
390 - that happens, then the finalize_and_map() can catch any pages that were not
391 - migrated. Note those pages were still copied to a new page and thus we wasted
392 - bandwidth but this is considered as a rare event and a price that we are
393 - willing to pay to keep all the code simpler.
343 + engine to perform copy from and to device memory. For this we need to use
344 + migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize() helpers.
394 345 
395 346 
396 347 Memory cgroup (memcg) and rss accounting
+4 -4
arch/csky/include/asm/tlb.h
···
8 8 
9 9 #define tlb_start_vma(tlb, vma) \
10 10 	do { \
11 - 		if (!tlb->fullmm) \
12 - 			flush_cache_range(vma, vma->vm_start, vma->vm_end); \
11 + 		if (!(tlb)->fullmm) \
12 + 			flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \
13 13 	} while (0)
14 14 
15 15 #define tlb_end_vma(tlb, vma) \
16 16 	do { \
17 - 		if (!tlb->fullmm) \
18 - 			flush_tlb_range(vma, vma->vm_start, vma->vm_end); \
17 + 		if (!(tlb)->fullmm) \
18 + 			flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \
19 19 	} while (0)
20 20 
21 21 #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
+13 -10
arch/openrisc/kernel/dma.c
···
16 16  */
17 17 
18 18 #include <linux/dma-noncoherent.h>
19 + #include <linux/pagewalk.h>
19 20 
20 21 #include <asm/cpuinfo.h>
21 22 #include <asm/spr_defs.h>
···
44 43 	return 0;
45 44 }
46 45 
46 + static const struct mm_walk_ops set_nocache_walk_ops = {
47 + 	.pte_entry = page_set_nocache,
48 + };
49 + 
47 50 static int
48 51 page_clear_nocache(pte_t *pte, unsigned long addr,
49 52 		   unsigned long next, struct mm_walk *walk)
···
62 57 
63 58 	return 0;
64 59 }
60 + 
61 + static const struct mm_walk_ops clear_nocache_walk_ops = {
62 + 	.pte_entry = page_clear_nocache,
63 + };
65 64 
66 65 /*
67 66  * Alloc "coherent" memory, which for OpenRISC means simply uncached.
···
89 80 {
90 81 	unsigned long va;
91 82 	void *page;
92 - 	struct mm_walk walk = {
93 - 		.pte_entry = page_set_nocache,
94 - 		.mm = &init_mm
95 - 	};
96 83 
97 84 	page = alloc_pages_exact(size, gfp | __GFP_ZERO);
98 85 	if (!page)
···
103 98 	 * We need to iterate through the pages, clearing the dcache for
104 99 	 * them and setting the cache-inhibit bit.
105 100 	 */
106 - 	if (walk_page_range(va, va + size, &walk)) {
101 + 	if (walk_page_range(&init_mm, va, va + size, &set_nocache_walk_ops,
102 + 			NULL)) {
107 103 		free_pages_exact(page, size);
108 104 		return NULL;
109 105 	}
···
117 111 		dma_addr_t dma_handle, unsigned long attrs)
118 112 {
119 113 	unsigned long va = (unsigned long)vaddr;
120 - 	struct mm_walk walk = {
121 - 		.pte_entry = page_clear_nocache,
122 - 		.mm = &init_mm
123 - 	};
124 114 
125 115 	/* walk_page_range shouldn't be able to fail here */
126 - 	WARN_ON(walk_page_range(va, va + size, &walk));
116 + 	WARN_ON(walk_page_range(&init_mm, va, va + size,
117 + 			&clear_nocache_walk_ops, NULL));
127 118 
128 119 	free_pages_exact(vaddr, size);
129 120 }
+6 -6
arch/powerpc/mm/book3s64/subpage_prot.c
···
7 7 #include <linux/kernel.h>
8 8 #include <linux/gfp.h>
9 9 #include <linux/types.h>
10 - #include <linux/mm.h>
10 + #include <linux/pagewalk.h>
11 11 #include <linux/hugetlb.h>
12 12 #include <linux/syscalls.h>
13 13 
···
139 139 	return 0;
140 140 }
141 141 
142 + static const struct mm_walk_ops subpage_walk_ops = {
143 + 	.pmd_entry = subpage_walk_pmd_entry,
144 + };
145 + 
142 146 static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
143 147 				    unsigned long len)
144 148 {
145 149 	struct vm_area_struct *vma;
146 - 	struct mm_walk subpage_proto_walk = {
147 - 		.mm = mm,
148 - 		.pmd_entry = subpage_walk_pmd_entry,
149 - 	};
150 150 
151 151 	/*
152 152 	 * We don't try too hard, we just mark all the vma in that range
···
163 163 		if (vma->vm_start >= (addr + len))
164 164 			break;
165 165 		vma->vm_flags |= VM_NOHUGEPAGE;
166 - 		walk_page_vma(vma, &subpage_proto_walk);
166 + 		walk_page_vma(vma, &subpage_walk_ops, NULL);
167 167 		vma = vma->vm_next;
168 168 	}
169 169 }
+16 -19
arch/s390/mm/gmap.c
···
9 9  */
10 10 
11 11 #include <linux/kernel.h>
12 - #include <linux/mm.h>
12 + #include <linux/pagewalk.h>
13 13 #include <linux/swap.h>
14 14 #include <linux/smp.h>
15 15 #include <linux/spinlock.h>
···
2521 2521 	return 0;
2522 2522 }
2523 2523 
2524 - static inline void zap_zero_pages(struct mm_struct *mm)
2525 - {
2526 - 	struct mm_walk walk = { .pmd_entry = __zap_zero_pages };
2527 - 
2528 - 	walk.mm = mm;
2529 - 	walk_page_range(0, TASK_SIZE, &walk);
2530 - }
2524 + static const struct mm_walk_ops zap_zero_walk_ops = {
2525 + 	.pmd_entry = __zap_zero_pages,
2526 + };
2531 2527 
2532 2528 /*
2533 2529  * switch on pgstes for its userspace process (for kvm)
···
2542 2546 	mm->context.has_pgste = 1;
2543 2547 	/* split thp mappings and disable thp for future mappings */
2544 2548 	thp_split_mm(mm);
2545 - 	zap_zero_pages(mm);
2549 + 	walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
2546 2550 	up_write(&mm->mmap_sem);
2547 2551 	return 0;
2548 2552 }
···
2585 2589 	return 0;
2586 2590 }
2587 2591 
2592 + static const struct mm_walk_ops enable_skey_walk_ops = {
2593 + 	.hugetlb_entry = __s390_enable_skey_hugetlb,
2594 + 	.pte_entry = __s390_enable_skey_pte,
2595 + };
2596 + 
2588 2597 int s390_enable_skey(void)
2589 2598 {
2590 - 	struct mm_walk walk = {
2591 - 		.hugetlb_entry = __s390_enable_skey_hugetlb,
2592 - 		.pte_entry = __s390_enable_skey_pte,
2593 - 	};
2594 2599 	struct mm_struct *mm = current->mm;
2595 2600 	struct vm_area_struct *vma;
2596 2601 	int rc = 0;
···
2611 2614 	}
2612 2615 	mm->def_flags &= ~VM_MERGEABLE;
2613 2616 
2614 - 	walk.mm = mm;
2615 - 	walk_page_range(0, TASK_SIZE, &walk);
2617 + 	walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
2616 2618 
2617 2619 out_up:
2618 2620 	up_write(&mm->mmap_sem);
···
2629 2633 	return 0;
2630 2634 }
2631 2635 
2636 + static const struct mm_walk_ops reset_cmma_walk_ops = {
2637 + 	.pte_entry = __s390_reset_cmma,
2638 + };
2639 + 
2632 2640 void s390_reset_cmma(struct mm_struct *mm)
2633 2641 {
2634 - 	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
2635 - 
2636 2642 	down_write(&mm->mmap_sem);
2637 - 	walk.mm = mm;
2638 - 	walk_page_range(0, TASK_SIZE, &walk);
2643 + 	walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
2639 2644 	up_write(&mm->mmap_sem);
2640 2645 }
2641 2646 EXPORT_SYMBOL_GPL(s390_reset_cmma);
+3 -1
drivers/gpu/drm/amd/amdgpu/Kconfig
···
27 27 config DRM_AMDGPU_USERPTR
28 28 	bool "Always enable userptr write support"
29 29 	depends on DRM_AMDGPU
30 - 	depends on HMM_MIRROR
30 + 	depends on MMU
31 + 	select HMM_MIRROR
32 + 	select MMU_NOTIFIER
31 33 	help
32 34 	  This option selects CONFIG_HMM and CONFIG_HMM_MIRROR if it
33 35 	  isn't already selected to enabled full userptr support.
+2
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
···
35 35 #include <linux/pm_runtime.h>
36 36 #include <linux/vga_switcheroo.h>
37 37 #include <drm/drm_probe_helper.h>
38 + #include <linux/mmu_notifier.h>
38 39 
39 40 #include "amdgpu.h"
40 41 #include "amdgpu_irq.h"
···
1470 1469 	amdgpu_unregister_atpx_handler();
1471 1470 	amdgpu_sync_fini();
1472 1471 	amdgpu_fence_slab_fini();
1472 + 	mmu_notifier_synchronize();
1473 1473 }
1474 1474 
1475 1475 module_init(amdgpu_init);
+8 -7
drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
···
195 195  * Block for operations on BOs to finish and mark pages as accessed and
196 196  * potentially dirty.
197 197  */
198 - static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
199 - 			const struct hmm_update *update)
198 + static int
199 + amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
200 + 			      const struct mmu_notifier_range *update)
200 201 {
201 202 	struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
202 203 	unsigned long start = update->start;
203 204 	unsigned long end = update->end;
204 - 	bool blockable = update->blockable;
205 + 	bool blockable = mmu_notifier_range_blockable(update);
205 206 	struct interval_tree_node *it;
206 207 
207 208 	/* notification is exclusive, but interval is inclusive */
···
244 243  * necessitates evicting all user-mode queues of the process. The BOs
245 244  * are restorted in amdgpu_mn_invalidate_range_end_hsa.
246 245  */
247 - static int amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
248 - 			const struct hmm_update *update)
246 + static int
247 + amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
248 + 			      const struct mmu_notifier_range *update)
249 249 {
250 250 	struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
251 251 	unsigned long start = update->start;
252 252 	unsigned long end = update->end;
253 - 	bool blockable = update->blockable;
253 + 	bool blockable = mmu_notifier_range_blockable(update);
254 254 	struct interval_tree_node *it;
255 255 
256 256 	/* notification is exclusive, but interval is inclusive */
···
484 482 		range->flags = hmm_range_flags;
485 483 		range->values = hmm_range_values;
486 484 		range->pfn_shift = PAGE_SHIFT;
487 - 		INIT_LIST_HEAD(&range->list);
488 485 	}
489 486 }
+8 -23
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
···
794 794 	struct hmm_range *range;
795 795 	unsigned long i;
796 796 	uint64_t *pfns;
797 - 	int retry = 0;
798 797 	int r = 0;
799 798 
800 799 	if (!mm) /* Happens during process shutdown */
···
834 835 			0 : range->flags[HMM_PFN_WRITE];
835 836 	range->pfn_flags_mask = 0;
836 837 	range->pfns = pfns;
837 - 	hmm_range_register(range, mirror, start,
838 - 		start + ttm->num_pages * PAGE_SIZE, PAGE_SHIFT);
838 + 	range->start = start;
839 + 	range->end = start + ttm->num_pages * PAGE_SIZE;
839 840 
840 - retry:
841 + 	hmm_range_register(range, mirror);
842 + 
841 843 	/*
842 844 	 * Just wait for range to be valid, safe to ignore return value as we
843 845 	 * will use the return value of hmm_range_fault() below under the
···
847 847 	hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT);
848 848 
849 849 	down_read(&mm->mmap_sem);
850 - 
851 - 	r = hmm_range_fault(range, true);
852 - 	if (unlikely(r < 0)) {
853 - 		if (likely(r == -EAGAIN)) {
854 - 			/*
855 - 			 * return -EAGAIN, mmap_sem is dropped
856 - 			 */
857 - 			if (retry++ < MAX_RETRY_HMM_RANGE_FAULT)
858 - 				goto retry;
859 - 			else
860 - 				pr_err("Retry hmm fault too many times\n");
861 - 		}
862 - 
863 - 		goto out_up_read;
864 - 	}
865 - 
850 + 	r = hmm_range_fault(range, 0);
866 851 	up_read(&mm->mmap_sem);
852 + 
853 + 	if (unlikely(r < 0))
854 + 		goto out_free_pfns;
867 855 
868 856 	for (i = 0; i < ttm->num_pages; i++) {
869 857 		pages[i] = hmm_device_entry_to_page(range, pfns[i]);
···
868 880 
869 881 	return 0;
870 882 
871 - out_up_read:
872 - 	if (likely(r != -EAGAIN))
873 - 		up_read(&mm->mmap_sem);
874 883 out_free_pfns:
875 884 	hmm_range_unregister(range);
876 885 	kvfree(pfns);
-3
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
···
687 687 	/* We want to receive a notification when the mm_struct is destroyed */
688 688 	struct mmu_notifier mmu_notifier;
689 689 
690 - 	/* Use for delayed freeing of kfd_process structure */
691 - 	struct rcu_head rcu;
692 - 
693 690 	unsigned int pasid;
694 691 	unsigned int doorbell_index;
695 692 
+41 -47
drivers/gpu/drm/amd/amdkfd/kfd_process.c
···
62 62 
63 63 static struct kfd_process *find_process(const struct task_struct *thread);
64 64 static void kfd_process_ref_release(struct kref *ref);
65 - static struct kfd_process *create_process(const struct task_struct *thread,
66 - 			struct file *filep);
65 + static struct kfd_process *create_process(const struct task_struct *thread);
66 + static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
67 67 
68 68 static void evict_process_worker(struct work_struct *work);
69 69 static void restore_process_worker(struct work_struct *work);
···
289 289 	if (process) {
290 290 		pr_debug("Process already found\n");
291 291 	} else {
292 - 		process = create_process(thread, filep);
292 + 		process = create_process(thread);
293 + 		if (IS_ERR(process))
294 + 			goto out;
295 + 
296 + 		ret = kfd_process_init_cwsr_apu(process, filep);
297 + 		if (ret) {
298 + 			process = ERR_PTR(ret);
299 + 			goto out;
300 + 		}
293 301 
294 302 		if (!procfs.kobj)
295 303 			goto out;
···
486 478 	queue_work(kfd_process_wq, &p->release_work);
487 479 }
488 480 
489 - static void kfd_process_destroy_delayed(struct rcu_head *rcu)
481 + static void kfd_process_free_notifier(struct mmu_notifier *mn)
490 482 {
491 - 	struct kfd_process *p = container_of(rcu, struct kfd_process, rcu);
492 - 
493 - 	kfd_unref_process(p);
483 + 	kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier));
494 484 }
495 485 
496 486 static void kfd_process_notifier_release(struct mmu_notifier *mn,
···
540 534 
541 535 	mutex_unlock(&p->mutex);
542 536 
543 - 	mmu_notifier_unregister_no_release(&p->mmu_notifier, mm);
544 - 	mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed);
537 + 	mmu_notifier_put(&p->mmu_notifier);
545 538 }
546 539 
547 540 static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
548 541 	.release = kfd_process_notifier_release,
542 + 	.free_notifier = kfd_process_free_notifier,
549 543 };
550 544 
551 545 static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
···
615 609 	return 0;
616 610 }
617 611 
618 - static struct kfd_process *create_process(const struct task_struct *thread,
619 - 			struct file *filep)
612 + /*
613 +  * On return the kfd_process is fully operational and will be freed when the
614 +  * mm is released
615 +  */
616 + static struct kfd_process *create_process(const struct task_struct *thread)
620 617 {
621 618 	struct kfd_process *process;
622 619 	int err = -ENOMEM;
623 620 
624 621 	process = kzalloc(sizeof(*process), GFP_KERNEL);
625 - 
626 622 	if (!process)
627 623 		goto err_alloc_process;
624 + 
625 + 	kref_init(&process->ref);
626 + 	mutex_init(&process->mutex);
627 + 	process->mm = thread->mm;
628 + 	process->lead_thread = thread->group_leader;
629 + 	INIT_LIST_HEAD(&process->per_device_data);
630 + 	INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
631 + 	INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
632 + 	process->last_restore_timestamp = get_jiffies_64();
633 + 	kfd_event_init_process(process);
634 + 	process->is_32bit_user_mode = in_compat_syscall();
628 635 
629 636 	process->pasid = kfd_pasid_alloc();
630 637 	if (process->pasid == 0)
···
646 627 	if (kfd_alloc_process_doorbells(process) < 0)
647 628 		goto err_alloc_doorbells;
648 629 
649 - 	kref_init(&process->ref);
650 - 
651 - 	mutex_init(&process->mutex);
652 - 
653 - 	process->mm = thread->mm;
654 - 
655 - 	/* register notifier */
656 - 	process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
657 - 	err = mmu_notifier_register(&process->mmu_notifier, process->mm);
658 - 	if (err)
659 - 		goto err_mmu_notifier;
660 - 
661 - 	hash_add_rcu(kfd_processes_table, &process->kfd_processes,
662 - 			(uintptr_t)process->mm);
663 - 
664 - 	process->lead_thread = thread->group_leader;
665 - 	get_task_struct(process->lead_thread);
666 - 
667 - 	INIT_LIST_HEAD(&process->per_device_data);
668 - 
669 - 	kfd_event_init_process(process);
670 - 
671 630 	err = pqm_init(&process->pqm, process);
672 631 	if (err != 0)
673 632 		goto err_process_pqm_init;
674 633 
675 634 	/* init process apertures*/
676 - 	process->is_32bit_user_mode = in_compat_syscall();
677 635 	err = kfd_init_apertures(process);
678 636 	if (err != 0)
679 637 		goto err_init_apertures;
680 638 
681 - 	INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
682 - 	INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
683 - 	process->last_restore_timestamp = get_jiffies_64();
684 - 
685 - 	err = kfd_process_init_cwsr_apu(process, filep);
639 + 	/* Must be last, have to use release destruction after this */
640 + 	process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
641 + 	err = mmu_notifier_register(&process->mmu_notifier, process->mm);
686 642 	if (err)
687 - 		goto err_init_cwsr;
643 + 		goto err_register_notifier;
644 + 
645 + 	get_task_struct(process->lead_thread);
646 + 	hash_add_rcu(kfd_processes_table, &process->kfd_processes,
647 + 			(uintptr_t)process->mm);
688 648 
689 649 	return process;
690 650 
691 - err_init_cwsr:
651 + err_register_notifier:
692 652 	kfd_process_free_outstanding_kfd_bos(process);
693 653 	kfd_process_destroy_pdds(process);
694 654 err_init_apertures:
695 655 	pqm_uninit(&process->pqm);
696 656 err_process_pqm_init:
697 - 	hash_del_rcu(&process->kfd_processes);
698 - 	synchronize_rcu();
699 - 	mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm);
700 - err_mmu_notifier:
701 - 	mutex_destroy(&process->mutex);
702 657 	kfd_free_process_doorbells(process);
703 658 err_alloc_doorbells:
704 659 	kfd_pasid_free(process->pasid);
705 660 err_alloc_pasid:
661 + 	mutex_destroy(&process->mutex);
706 662 	kfree(process);
707 663 err_alloc_process:
708 664 	return ERR_PTR(err);
+3 -2
drivers/gpu/drm/nouveau/Kconfig
···
86 86 	bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support"
87 87 	depends on DEVICE_PRIVATE
88 88 	depends on DRM_NOUVEAU
89 - 	depends on HMM_MIRROR
89 + 	depends on MMU
90 90 	depends on STAGING
91 - 	select MIGRATE_VMA_HELPER
91 + 	select HMM_MIRROR
92 + 	select MMU_NOTIFIER
92 93 	default n
93 94 	help
94 95 	  Say Y here if you want to enable experimental support for
+147 -309
drivers/gpu/drm/nouveau/nouveau_dmem.c
··· 44 44 #define DMEM_CHUNK_SIZE (2UL << 20) 45 45 #define DMEM_CHUNK_NPAGES (DMEM_CHUNK_SIZE >> PAGE_SHIFT) 46 46 47 - struct nouveau_migrate; 48 - 49 47 enum nouveau_aper { 50 48 NOUVEAU_APER_VIRT, 51 49 NOUVEAU_APER_VRAM, ··· 84 86 return container_of(page->pgmap, struct nouveau_dmem, pagemap); 85 87 } 86 88 87 - struct nouveau_dmem_fault { 88 - struct nouveau_drm *drm; 89 - struct nouveau_fence *fence; 90 - dma_addr_t *dma; 91 - unsigned long npages; 92 - }; 89 + static unsigned long nouveau_dmem_page_addr(struct page *page) 90 + { 91 + struct nouveau_dmem_chunk *chunk = page->zone_device_data; 92 + unsigned long idx = page_to_pfn(page) - chunk->pfn_first; 93 93 94 - struct nouveau_migrate { 95 - struct vm_area_struct *vma; 96 - struct nouveau_drm *drm; 97 - struct nouveau_fence *fence; 98 - unsigned long npages; 99 - dma_addr_t *dma; 100 - unsigned long dma_nr; 101 - }; 94 + return (idx << PAGE_SHIFT) + chunk->bo->bo.offset; 95 + } 102 96 103 97 static void nouveau_dmem_page_free(struct page *page) 104 98 { ··· 115 125 spin_unlock(&chunk->lock); 116 126 } 117 127 118 - static void 119 - nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, 120 - const unsigned long *src_pfns, 121 - unsigned long *dst_pfns, 122 - unsigned long start, 123 - unsigned long end, 124 - void *private) 128 + static void nouveau_dmem_fence_done(struct nouveau_fence **fence) 125 129 { 126 - struct nouveau_dmem_fault *fault = private; 127 - struct nouveau_drm *drm = fault->drm; 128 - struct device *dev = drm->dev->dev; 129 - unsigned long addr, i, npages = 0; 130 - nouveau_migrate_copy_t copy; 131 - int ret; 132 - 133 - 134 - /* First allocate new memory */ 135 - for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { 136 - struct page *dpage, *spage; 137 - 138 - dst_pfns[i] = 0; 139 - spage = migrate_pfn_to_page(src_pfns[i]); 140 - if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) 141 - continue; 142 - 143 - dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr); 144 - if 
(!dpage) { 145 - dst_pfns[i] = MIGRATE_PFN_ERROR; 146 - continue; 147 - } 148 - lock_page(dpage); 149 - 150 - dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) | 151 - MIGRATE_PFN_LOCKED; 152 - npages++; 153 - } 154 - 155 - /* Allocate storage for DMA addresses, so we can unmap later. */ 156 - fault->dma = kmalloc(sizeof(*fault->dma) * npages, GFP_KERNEL); 157 - if (!fault->dma) 158 - goto error; 159 - 160 - /* Copy things over */ 161 - copy = drm->dmem->migrate.copy_func; 162 - for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { 163 - struct nouveau_dmem_chunk *chunk; 164 - struct page *spage, *dpage; 165 - u64 src_addr, dst_addr; 166 - 167 - dpage = migrate_pfn_to_page(dst_pfns[i]); 168 - if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR) 169 - continue; 170 - 171 - spage = migrate_pfn_to_page(src_pfns[i]); 172 - if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) { 173 - dst_pfns[i] = MIGRATE_PFN_ERROR; 174 - __free_page(dpage); 175 - continue; 176 - } 177 - 178 - fault->dma[fault->npages] = 179 - dma_map_page_attrs(dev, dpage, 0, PAGE_SIZE, 180 - PCI_DMA_BIDIRECTIONAL, 181 - DMA_ATTR_SKIP_CPU_SYNC); 182 - if (dma_mapping_error(dev, fault->dma[fault->npages])) { 183 - dst_pfns[i] = MIGRATE_PFN_ERROR; 184 - __free_page(dpage); 185 - continue; 186 - } 187 - 188 - dst_addr = fault->dma[fault->npages++]; 189 - 190 - chunk = spage->zone_device_data; 191 - src_addr = page_to_pfn(spage) - chunk->pfn_first; 192 - src_addr = (src_addr << PAGE_SHIFT) + chunk->bo->bo.offset; 193 - 194 - ret = copy(drm, 1, NOUVEAU_APER_HOST, dst_addr, 195 - NOUVEAU_APER_VRAM, src_addr); 196 - if (ret) { 197 - dst_pfns[i] = MIGRATE_PFN_ERROR; 198 - __free_page(dpage); 199 - continue; 200 - } 201 - } 202 - 203 - nouveau_fence_new(drm->dmem->migrate.chan, false, &fault->fence); 204 - 205 - return; 206 - 207 - error: 208 - for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, ++i) { 209 - struct page *page; 210 - 211 - if (!dst_pfns[i] || dst_pfns[i] == MIGRATE_PFN_ERROR) 212 - 
continue; 213 - 214 - page = migrate_pfn_to_page(dst_pfns[i]); 215 - dst_pfns[i] = MIGRATE_PFN_ERROR; 216 - if (page == NULL) 217 - continue; 218 - 219 - __free_page(page); 220 - } 221 - } 222 - 223 - void nouveau_dmem_fault_finalize_and_map(struct vm_area_struct *vma, 224 - const unsigned long *src_pfns, 225 - const unsigned long *dst_pfns, 226 - unsigned long start, 227 - unsigned long end, 228 - void *private) 229 - { 230 - struct nouveau_dmem_fault *fault = private; 231 - struct nouveau_drm *drm = fault->drm; 232 - 233 - if (fault->fence) { 234 - nouveau_fence_wait(fault->fence, true, false); 235 - nouveau_fence_unref(&fault->fence); 130 + if (fence) { 131 + nouveau_fence_wait(*fence, true, false); 132 + nouveau_fence_unref(fence); 236 133 } else { 237 134 /* 238 135 * FIXME wait for channel to be IDLE before calling finalizing 239 - * the hmem object below (nouveau_migrate_hmem_fini()). 136 + * the hmem object. 240 137 */ 241 138 } 242 - 243 - while (fault->npages--) { 244 - dma_unmap_page(drm->dev->dev, fault->dma[fault->npages], 245 - PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); 246 - } 247 - kfree(fault->dma); 248 139 } 249 140 250 - static const struct migrate_vma_ops nouveau_dmem_fault_migrate_ops = { 251 - .alloc_and_copy = nouveau_dmem_fault_alloc_and_copy, 252 - .finalize_and_map = nouveau_dmem_fault_finalize_and_map, 253 - }; 141 + static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm, 142 + struct vm_fault *vmf, struct migrate_vma *args, 143 + dma_addr_t *dma_addr) 144 + { 145 + struct device *dev = drm->dev->dev; 146 + struct page *dpage, *spage; 147 + 148 + spage = migrate_pfn_to_page(args->src[0]); 149 + if (!spage || !(args->src[0] & MIGRATE_PFN_MIGRATE)) 150 + return 0; 151 + 152 + dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma, vmf->address); 153 + if (!dpage) 154 + return VM_FAULT_SIGBUS; 155 + lock_page(dpage); 156 + 157 + *dma_addr = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); 158 + if (dma_mapping_error(dev, *dma_addr)) 
159 + goto error_free_page; 160 + 161 + if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_HOST, *dma_addr, 162 + NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage))) 163 + goto error_dma_unmap; 164 + 165 + args->dst[0] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; 166 + return 0; 167 + 168 + error_dma_unmap: 169 + dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); 170 + error_free_page: 171 + __free_page(dpage); 172 + return VM_FAULT_SIGBUS; 173 + } 254 174 255 175 static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) 256 176 { 257 177 struct nouveau_dmem *dmem = page_to_dmem(vmf->page); 258 - unsigned long src[1] = {0}, dst[1] = {0}; 259 - struct nouveau_dmem_fault fault = { .drm = dmem->drm }; 260 - int ret; 178 + struct nouveau_drm *drm = dmem->drm; 179 + struct nouveau_fence *fence; 180 + unsigned long src = 0, dst = 0; 181 + dma_addr_t dma_addr = 0; 182 + vm_fault_t ret; 183 + struct migrate_vma args = { 184 + .vma = vmf->vma, 185 + .start = vmf->address, 186 + .end = vmf->address + PAGE_SIZE, 187 + .src = &src, 188 + .dst = &dst, 189 + }; 261 190 262 191 /* 263 192 * FIXME what we really want is to find some heuristic to migrate more 264 193 * than just one page on CPU fault. When such fault happens it is very 265 194 * likely that more surrounding page will CPU fault too. 
266 195 */ 267 - ret = migrate_vma(&nouveau_dmem_fault_migrate_ops, vmf->vma, 268 - vmf->address, vmf->address + PAGE_SIZE, 269 - src, dst, &fault); 270 - if (ret) 196 + if (migrate_vma_setup(&args) < 0) 271 197 return VM_FAULT_SIGBUS; 198 + if (!args.cpages) 199 + return 0; 272 200 273 - if (dst[0] == MIGRATE_PFN_ERROR) 274 - return VM_FAULT_SIGBUS; 201 + ret = nouveau_dmem_fault_copy_one(drm, vmf, &args, &dma_addr); 202 + if (ret || dst == 0) 203 + goto done; 275 204 276 - return 0; 205 + nouveau_fence_new(dmem->migrate.chan, false, &fence); 206 + migrate_vma_pages(&args); 207 + nouveau_dmem_fence_done(&fence); 208 + dma_unmap_page(drm->dev->dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); 209 + done: 210 + migrate_vma_finalize(&args); 211 + return ret; 277 212 } 278 213 279 214 static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = { ··· 557 642 drm->dmem = NULL; 558 643 } 559 644 560 - static void 561 - nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma, 562 - const unsigned long *src_pfns, 563 - unsigned long *dst_pfns, 564 - unsigned long start, 565 - unsigned long end, 566 - void *private) 645 + static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, 646 + unsigned long src, dma_addr_t *dma_addr) 567 647 { 568 - struct nouveau_migrate *migrate = private; 569 - struct nouveau_drm *drm = migrate->drm; 570 648 struct device *dev = drm->dev->dev; 571 - unsigned long addr, i, npages = 0; 572 - nouveau_migrate_copy_t copy; 573 - int ret; 649 + struct page *dpage, *spage; 574 650 575 - /* First allocate new memory */ 576 - for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { 577 - struct page *dpage, *spage; 651 + spage = migrate_pfn_to_page(src); 652 + if (!spage || !(src & MIGRATE_PFN_MIGRATE)) 653 + goto out; 578 654 579 - dst_pfns[i] = 0; 580 - spage = migrate_pfn_to_page(src_pfns[i]); 581 - if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) 582 - continue; 655 + dpage = nouveau_dmem_page_alloc_locked(drm); 656 + 
if (!dpage) 657 + return 0; 583 658 584 - dpage = nouveau_dmem_page_alloc_locked(drm); 585 - if (!dpage) 586 - continue; 659 + *dma_addr = dma_map_page(dev, spage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); 660 + if (dma_mapping_error(dev, *dma_addr)) 661 + goto out_free_page; 587 662 588 - dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) | 589 - MIGRATE_PFN_LOCKED | 590 - MIGRATE_PFN_DEVICE; 591 - npages++; 592 - } 663 + if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_VRAM, 664 + nouveau_dmem_page_addr(dpage), NOUVEAU_APER_HOST, 665 + *dma_addr)) 666 + goto out_dma_unmap; 593 667 594 - if (!npages) 595 - return; 668 + return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; 596 669 597 - /* Allocate storage for DMA addresses, so we can unmap later. */ 598 - migrate->dma = kmalloc(sizeof(*migrate->dma) * npages, GFP_KERNEL); 599 - if (!migrate->dma) 600 - goto error; 601 - 602 - /* Copy things over */ 603 - copy = drm->dmem->migrate.copy_func; 604 - for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { 605 - struct nouveau_dmem_chunk *chunk; 606 - struct page *spage, *dpage; 607 - u64 src_addr, dst_addr; 608 - 609 - dpage = migrate_pfn_to_page(dst_pfns[i]); 610 - if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR) 611 - continue; 612 - 613 - chunk = dpage->zone_device_data; 614 - dst_addr = page_to_pfn(dpage) - chunk->pfn_first; 615 - dst_addr = (dst_addr << PAGE_SHIFT) + chunk->bo->bo.offset; 616 - 617 - spage = migrate_pfn_to_page(src_pfns[i]); 618 - if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) { 619 - nouveau_dmem_page_free_locked(drm, dpage); 620 - dst_pfns[i] = 0; 621 - continue; 622 - } 623 - 624 - migrate->dma[migrate->dma_nr] = 625 - dma_map_page_attrs(dev, spage, 0, PAGE_SIZE, 626 - PCI_DMA_BIDIRECTIONAL, 627 - DMA_ATTR_SKIP_CPU_SYNC); 628 - if (dma_mapping_error(dev, migrate->dma[migrate->dma_nr])) { 629 - nouveau_dmem_page_free_locked(drm, dpage); 630 - dst_pfns[i] = 0; 631 - continue; 632 - } 633 - 634 - src_addr = 
migrate->dma[migrate->dma_nr++]; 635 - 636 - ret = copy(drm, 1, NOUVEAU_APER_VRAM, dst_addr, 637 - NOUVEAU_APER_HOST, src_addr); 638 - if (ret) { 639 - nouveau_dmem_page_free_locked(drm, dpage); 640 - dst_pfns[i] = 0; 641 - continue; 642 - } 643 - } 644 - 645 - nouveau_fence_new(drm->dmem->migrate.chan, false, &migrate->fence); 646 - 647 - return; 648 - 649 - error: 650 - for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, ++i) { 651 - struct page *page; 652 - 653 - if (!dst_pfns[i] || dst_pfns[i] == MIGRATE_PFN_ERROR) 654 - continue; 655 - 656 - page = migrate_pfn_to_page(dst_pfns[i]); 657 - dst_pfns[i] = MIGRATE_PFN_ERROR; 658 - if (page == NULL) 659 - continue; 660 - 661 - __free_page(page); 662 - } 670 + out_dma_unmap: 671 + dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); 672 + out_free_page: 673 + nouveau_dmem_page_free_locked(drm, dpage); 674 + out: 675 + return 0; 663 676 } 664 677 665 - void nouveau_dmem_migrate_finalize_and_map(struct vm_area_struct *vma, 666 - const unsigned long *src_pfns, 667 - const unsigned long *dst_pfns, 668 - unsigned long start, 669 - unsigned long end, 670 - void *private) 678 + static void nouveau_dmem_migrate_chunk(struct nouveau_drm *drm, 679 + struct migrate_vma *args, dma_addr_t *dma_addrs) 671 680 { 672 - struct nouveau_migrate *migrate = private; 673 - struct nouveau_drm *drm = migrate->drm; 681 + struct nouveau_fence *fence; 682 + unsigned long addr = args->start, nr_dma = 0, i; 674 683 675 - if (migrate->fence) { 676 - nouveau_fence_wait(migrate->fence, true, false); 677 - nouveau_fence_unref(&migrate->fence); 678 - } else { 679 - /* 680 - * FIXME wait for channel to be IDLE before finalizing 681 - * the hmem object below (nouveau_migrate_hmem_fini()) ? 
682 - */ 684 + for (i = 0; addr < args->end; i++) { 685 + args->dst[i] = nouveau_dmem_migrate_copy_one(drm, args->src[i], 686 + dma_addrs + nr_dma); 687 + if (args->dst[i]) 688 + nr_dma++; 689 + addr += PAGE_SIZE; 683 690 } 684 691 685 - while (migrate->dma_nr--) { 686 - dma_unmap_page(drm->dev->dev, migrate->dma[migrate->dma_nr], 687 - PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); 688 - } 689 - kfree(migrate->dma); 692 + nouveau_fence_new(drm->dmem->migrate.chan, false, &fence); 693 + migrate_vma_pages(args); 694 + nouveau_dmem_fence_done(&fence); 690 695 696 + while (nr_dma--) { 697 + dma_unmap_page(drm->dev->dev, dma_addrs[nr_dma], PAGE_SIZE, 698 + DMA_BIDIRECTIONAL); 699 + } 691 700 /* 692 - * FIXME optimization: update GPU page table to point to newly 693 - * migrated memory. 701 + * FIXME optimization: update GPU page table to point to newly migrated 702 + * memory. 694 703 */ 704 + migrate_vma_finalize(args); 695 705 } 696 - 697 - static const struct migrate_vma_ops nouveau_dmem_migrate_ops = { 698 - .alloc_and_copy = nouveau_dmem_migrate_alloc_and_copy, 699 - .finalize_and_map = nouveau_dmem_migrate_finalize_and_map, 700 - }; 701 706 702 707 int 703 708 nouveau_dmem_migrate_vma(struct nouveau_drm *drm, ··· 625 790 unsigned long start, 626 791 unsigned long end) 627 792 { 628 - unsigned long *src_pfns, *dst_pfns, npages; 629 - struct nouveau_migrate migrate = {0}; 630 - unsigned long i, c, max; 631 - int ret = 0; 793 + unsigned long npages = (end - start) >> PAGE_SHIFT; 794 + unsigned long max = min(SG_MAX_SINGLE_ALLOC, npages); 795 + dma_addr_t *dma_addrs; 796 + struct migrate_vma args = { 797 + .vma = vma, 798 + .start = start, 799 + }; 800 + unsigned long c, i; 801 + int ret = -ENOMEM; 632 802 633 - npages = (end - start) >> PAGE_SHIFT; 634 - max = min(SG_MAX_SINGLE_ALLOC, npages); 635 - src_pfns = kzalloc(sizeof(long) * max, GFP_KERNEL); 636 - if (src_pfns == NULL) 637 - return -ENOMEM; 638 - dst_pfns = kzalloc(sizeof(long) * max, GFP_KERNEL); 639 - if (dst_pfns == 
NULL) { 640 - kfree(src_pfns); 641 - return -ENOMEM; 642 - } 803 + args.src = kcalloc(max, sizeof(args.src), GFP_KERNEL); 804 + if (!args.src) 805 + goto out; 806 + args.dst = kcalloc(max, sizeof(args.dst), GFP_KERNEL); 807 + if (!args.dst) 808 + goto out_free_src; 643 809 644 - migrate.drm = drm; 645 - migrate.vma = vma; 646 - migrate.npages = npages; 810 + dma_addrs = kmalloc_array(max, sizeof(*dma_addrs), GFP_KERNEL); 811 + if (!dma_addrs) 812 + goto out_free_dst; 813 + 647 814 for (i = 0; i < npages; i += c) { 648 - unsigned long next; 649 - 650 815 c = min(SG_MAX_SINGLE_ALLOC, npages); 651 - next = start + (c << PAGE_SHIFT); 652 - ret = migrate_vma(&nouveau_dmem_migrate_ops, vma, start, 653 - next, src_pfns, dst_pfns, &migrate); 816 + args.end = start + (c << PAGE_SHIFT); 817 + ret = migrate_vma_setup(&args); 654 818 if (ret) 655 - goto out; 656 - start = next; 819 + goto out_free_dma; 820 + 821 + if (args.cpages) 822 + nouveau_dmem_migrate_chunk(drm, &args, dma_addrs); 823 + args.start = args.end; 657 824 } 658 825 826 + ret = 0; 827 + out_free_dma: 828 + kfree(dma_addrs); 829 + out_free_dst: 830 + kfree(args.dst); 831 + out_free_src: 832 + kfree(args.src); 659 833 out: 660 - kfree(dst_pfns); 661 - kfree(src_pfns); 662 834 return ret; 663 835 } 664 836 ··· 683 841 684 842 npages = (range->end - range->start) >> PAGE_SHIFT; 685 843 for (i = 0; i < npages; ++i) { 686 - struct nouveau_dmem_chunk *chunk; 687 844 struct page *page; 688 845 uint64_t addr; 689 846 690 - page = hmm_pfn_to_page(range, range->pfns[i]); 847 + page = hmm_device_entry_to_page(range, range->pfns[i]); 691 848 if (page == NULL) 692 849 continue; 693 850 ··· 700 859 continue; 701 860 } 702 861 703 - chunk = page->zone_device_data; 704 - addr = page_to_pfn(page) - chunk->pfn_first; 705 - addr = (addr + chunk->bo->bo.mem.start) << PAGE_SHIFT; 706 - 862 + addr = nouveau_dmem_page_addr(page); 707 863 range->pfns[i] &= ((1UL << range->pfn_shift) - 1); 708 864 range->pfns[i] |= (addr >> PAGE_SHIFT) 
<< range->pfn_shift; 709 865 }
drivers/gpu/drm/nouveau/nouveau_dmem.h (-11)
···
 static inline void nouveau_dmem_fini(struct nouveau_drm *drm) {}
 static inline void nouveau_dmem_suspend(struct nouveau_drm *drm) {}
 static inline void nouveau_dmem_resume(struct nouveau_drm *drm) {}
-
-static inline int nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
-					   struct vm_area_struct *vma,
-					   unsigned long start,
-					   unsigned long end)
-{
-	return 0;
-}
-
-static inline void nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
-					    struct hmm_range *range) {}
 #endif /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */
 #endif
drivers/gpu/drm/nouveau/nouveau_drm.c (+3)
···
 #include <linux/pci.h>
 #include <linux/pm_runtime.h>
 #include <linux/vga_switcheroo.h>
+#include <linux/mmu_notifier.h>
 
 #include <drm/drm_crtc_helper.h>
 #include <drm/drm_ioctl.h>
···
 #ifdef CONFIG_NOUVEAU_PLATFORM_DRIVER
 	platform_driver_unregister(&nouveau_platform_driver);
 #endif
+	if (IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM))
+		mmu_notifier_synchronize();
 }
 
 module_init(nouveau_drm_init);
drivers/gpu/drm/nouveau/nouveau_svm.c (+10 -13)
···
 
 static int
 nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
-					const struct hmm_update *update)
+					const struct mmu_notifier_range *update)
 {
 	struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror);
 	unsigned long start = update->start;
 	unsigned long limit = update->end;
 
-	if (!update->blockable)
+	if (!mmu_notifier_range_blockable(update))
 		return -EAGAIN;
 
 	SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit);
···
 }
 
 static int
-nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range)
+nouveau_range_fault(struct nouveau_svmm *svmm, struct hmm_range *range)
 {
 	long ret;
 
 	range->default_flags = 0;
 	range->pfn_flags_mask = -1UL;
 
-	ret = hmm_range_register(range, mirror,
-				 range->start, range->end,
-				 PAGE_SHIFT);
+	ret = hmm_range_register(range, &svmm->mirror);
 	if (ret) {
-		up_read(&range->vma->vm_mm->mmap_sem);
+		up_read(&svmm->mm->mmap_sem);
 		return (int)ret;
 	}
 
 	if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
-		up_read(&range->vma->vm_mm->mmap_sem);
-		return -EAGAIN;
+		up_read(&svmm->mm->mmap_sem);
+		return -EBUSY;
 	}
 
-	ret = hmm_range_fault(range, true);
+	ret = hmm_range_fault(range, 0);
 	if (ret <= 0) {
 		if (ret == 0)
 			ret = -EBUSY;
-		up_read(&range->vma->vm_mm->mmap_sem);
+		up_read(&svmm->mm->mmap_sem);
 		hmm_range_unregister(range);
 		return ret;
 	}
···
 			args.i.p.addr + args.i.p.size, fn - fi);
 
 	/* Have HMM fault pages within the fault window to the GPU. */
-	range.vma = vma;
 	range.start = args.i.p.addr;
 	range.end = args.i.p.addr + args.i.p.size;
 	range.pfns = args.phys;
···
 	range.values = nouveau_svm_pfn_values;
 	range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
again:
-	ret = nouveau_range_fault(&svmm->mirror, &range);
+	ret = nouveau_range_fault(svmm, &range);
 	if (ret == 0) {
 		mutex_lock(&svmm->mutex);
 		if (!nouveau_range_done(&range)) {
drivers/gpu/drm/radeon/radeon.h (-3)
···
 	/* tracking pinned memory */
 	u64 vram_pin_size;
 	u64 gart_pin_size;
-
-	struct mutex mn_lock;
-	DECLARE_HASHTABLE(mn_hash, 7);
 };
 
 bool radeon_is_px(struct drm_device *dev);
drivers/gpu/drm/radeon/radeon_device.c (-2)
···
 	init_rwsem(&rdev->pm.mclk_lock);
 	init_rwsem(&rdev->exclusive_lock);
 	init_waitqueue_head(&rdev->irq.vblank_queue);
-	mutex_init(&rdev->mn_lock);
-	hash_init(rdev->mn_hash);
 	r = radeon_gem_init(rdev);
 	if (r)
 		return r;
drivers/gpu/drm/radeon/radeon_drv.c (+2)
···
 #include <linux/module.h>
 #include <linux/pm_runtime.h>
 #include <linux/vga_switcheroo.h>
+#include <linux/mmu_notifier.h>
 
 #include <drm/drm_crtc_helper.h>
 #include <drm/drm_drv.h>
···
 {
 	pci_unregister_driver(pdriver);
 	radeon_unregister_atpx_handler();
+	mmu_notifier_synchronize();
 }
 
 module_init(radeon_init);
drivers/gpu/drm/radeon/radeon_mn.c (+43 -125)
···
 #include "radeon.h"
 
 struct radeon_mn {
-	/* constant after initialisation */
-	struct radeon_device *rdev;
-	struct mm_struct *mm;
 	struct mmu_notifier mn;
-
-	/* only used on destruction */
-	struct work_struct work;
-
-	/* protected by rdev->mn_lock */
-	struct hlist_node node;
 
 	/* objects protected by lock */
 	struct mutex lock;
···
 	struct interval_tree_node it;
 	struct list_head bos;
 };
-
-/**
- * radeon_mn_destroy - destroy the rmn
- *
- * @work: previously sheduled work item
- *
- * Lazy destroys the notifier from a work item
- */
-static void radeon_mn_destroy(struct work_struct *work)
-{
-	struct radeon_mn *rmn = container_of(work, struct radeon_mn, work);
-	struct radeon_device *rdev = rmn->rdev;
-	struct radeon_mn_node *node, *next_node;
-	struct radeon_bo *bo, *next_bo;
-
-	mutex_lock(&rdev->mn_lock);
-	mutex_lock(&rmn->lock);
-	hash_del(&rmn->node);
-	rbtree_postorder_for_each_entry_safe(node, next_node,
-					     &rmn->objects.rb_root, it.rb) {
-
-		interval_tree_remove(&node->it, &rmn->objects);
-		list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) {
-			bo->mn = NULL;
-			list_del_init(&bo->mn_list);
-		}
-		kfree(node);
-	}
-	mutex_unlock(&rmn->lock);
-	mutex_unlock(&rdev->mn_lock);
-	mmu_notifier_unregister(&rmn->mn, rmn->mm);
-	kfree(rmn);
-}
-
-/**
- * radeon_mn_release - callback to notify about mm destruction
- *
- * @mn: our notifier
- * @mn: the mm this callback is about
- *
- * Shedule a work item to lazy destroy our notifier.
- */
-static void radeon_mn_release(struct mmu_notifier *mn,
-			      struct mm_struct *mm)
-{
-	struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
-	INIT_WORK(&rmn->work, radeon_mn_destroy);
-	schedule_work(&rmn->work);
-}
 
 /**
  * radeon_mn_invalidate_range_start - callback to notify about mm change
···
 	return ret;
 }
 
+static void radeon_mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	struct mmu_notifier_range range = {
+		.mm = mm,
+		.start = 0,
+		.end = ULONG_MAX,
+		.flags = 0,
+		.event = MMU_NOTIFY_UNMAP,
+	};
+
+	radeon_mn_invalidate_range_start(mn, &range);
+}
+
+static struct mmu_notifier *radeon_mn_alloc_notifier(struct mm_struct *mm)
+{
+	struct radeon_mn *rmn;
+
+	rmn = kzalloc(sizeof(*rmn), GFP_KERNEL);
+	if (!rmn)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&rmn->lock);
+	rmn->objects = RB_ROOT_CACHED;
+	return &rmn->mn;
+}
+
+static void radeon_mn_free_notifier(struct mmu_notifier *mn)
+{
+	kfree(container_of(mn, struct radeon_mn, mn));
+}
+
 static const struct mmu_notifier_ops radeon_mn_ops = {
 	.release = radeon_mn_release,
 	.invalidate_range_start = radeon_mn_invalidate_range_start,
+	.alloc_notifier = radeon_mn_alloc_notifier,
+	.free_notifier = radeon_mn_free_notifier,
 };
-
-/**
- * radeon_mn_get - create notifier context
- *
- * @rdev: radeon device pointer
- *
- * Creates a notifier context for current->mm.
- */
-static struct radeon_mn *radeon_mn_get(struct radeon_device *rdev)
-{
-	struct mm_struct *mm = current->mm;
-	struct radeon_mn *rmn;
-	int r;
-
-	if (down_write_killable(&mm->mmap_sem))
-		return ERR_PTR(-EINTR);
-
-	mutex_lock(&rdev->mn_lock);
-
-	hash_for_each_possible(rdev->mn_hash, rmn, node, (unsigned long)mm)
-		if (rmn->mm == mm)
-			goto release_locks;
-
-	rmn = kzalloc(sizeof(*rmn), GFP_KERNEL);
-	if (!rmn) {
-		rmn = ERR_PTR(-ENOMEM);
-		goto release_locks;
-	}
-
-	rmn->rdev = rdev;
-	rmn->mm = mm;
-	rmn->mn.ops = &radeon_mn_ops;
-	mutex_init(&rmn->lock);
-	rmn->objects = RB_ROOT_CACHED;
-
-	r = __mmu_notifier_register(&rmn->mn, mm);
-	if (r)
-		goto free_rmn;
-
-	hash_add(rdev->mn_hash, &rmn->node, (unsigned long)mm);
-
-release_locks:
-	mutex_unlock(&rdev->mn_lock);
-	up_write(&mm->mmap_sem);
-
-	return rmn;
-
-free_rmn:
-	mutex_unlock(&rdev->mn_lock);
-	up_write(&mm->mmap_sem);
-	kfree(rmn);
-
-	return ERR_PTR(r);
-}
 
 /**
  * radeon_mn_register - register a BO for notifier updates
···
 int radeon_mn_register(struct radeon_bo *bo, unsigned long addr)
 {
 	unsigned long end = addr + radeon_bo_size(bo) - 1;
-	struct radeon_device *rdev = bo->rdev;
+	struct mmu_notifier *mn;
 	struct radeon_mn *rmn;
 	struct radeon_mn_node *node = NULL;
 	struct list_head bos;
 	struct interval_tree_node *it;
 
-	rmn = radeon_mn_get(rdev);
-	if (IS_ERR(rmn))
-		return PTR_ERR(rmn);
+	mn = mmu_notifier_get(&radeon_mn_ops, current->mm);
+	if (IS_ERR(mn))
+		return PTR_ERR(mn);
+	rmn = container_of(mn, struct radeon_mn, mn);
 
 	INIT_LIST_HEAD(&bos);
···
  */
 void radeon_mn_unregister(struct radeon_bo *bo)
 {
-	struct radeon_device *rdev = bo->rdev;
-	struct radeon_mn *rmn;
+	struct radeon_mn *rmn = bo->mn;
 	struct list_head *head;
 
-	mutex_lock(&rdev->mn_lock);
-	rmn = bo->mn;
-	if (rmn == NULL) {
-		mutex_unlock(&rdev->mn_lock);
+	if (!rmn)
 		return;
-	}
 
 	mutex_lock(&rmn->lock);
 	/* save the next list entry for later */
 	head = bo->mn_list.next;
 
-	bo->mn = NULL;
 	list_del(&bo->mn_list);
 
 	if (list_empty(head)) {
···
 	}
 
 	mutex_unlock(&rmn->lock);
-	mutex_unlock(&rdev->mn_lock);
+
+	mmu_notifier_put(&rmn->mn);
+	bo->mn = NULL;
 }
drivers/infiniband/Kconfig (+1)
···
 	bool "InfiniBand on-demand paging support"
 	depends on INFINIBAND_USER_MEM
 	select MMU_NOTIFIER
+	select INTERVAL_TREE
 	default y
 	---help---
 	  On demand paging support for the InfiniBand subsystem.
drivers/infiniband/core/device.c (+1)
···
 	SET_DEVICE_OP(dev_ops, get_vf_config);
 	SET_DEVICE_OP(dev_ops, get_vf_stats);
 	SET_DEVICE_OP(dev_ops, init_port);
+	SET_DEVICE_OP(dev_ops, invalidate_range);
 	SET_DEVICE_OP(dev_ops, iw_accept);
 	SET_DEVICE_OP(dev_ops, iw_add_ref);
 	SET_DEVICE_OP(dev_ops, iw_connect);
drivers/infiniband/core/umem.c (+11 -43)
···
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
- * If access flags indicate ODP memory, avoid pinning. Instead, stores
- * the mm for future page fault handling in conjunction with MMU notifiers.
- *
  * @udata: userspace context to pin memory for
  * @addr: userspace virtual address to start at
  * @size: length of region to pin
···
 	if (!can_do_mlock())
 		return ERR_PTR(-EPERM);
 
-	if (access & IB_ACCESS_ON_DEMAND) {
-		umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
-		if (!umem)
-			return ERR_PTR(-ENOMEM);
-		umem->is_odp = 1;
-	} else {
-		umem = kzalloc(sizeof(*umem), GFP_KERNEL);
-		if (!umem)
-			return ERR_PTR(-ENOMEM);
-	}
+	if (access & IB_ACCESS_ON_DEMAND)
+		return ERR_PTR(-EOPNOTSUPP);
 
-	umem->context = context;
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+	umem->ibdev = context->device;
 	umem->length = size;
 	umem->address = addr;
 	umem->writable = ib_access_writable(access);
 	umem->owning_mm = mm = current->mm;
 	mmgrab(mm);
-
-	if (access & IB_ACCESS_ON_DEMAND) {
-		if (WARN_ON_ONCE(!context->invalidate_range)) {
-			ret = -EINVAL;
-			goto umem_kfree;
-		}
-
-		ret = ib_umem_odp_get(to_ib_umem_odp(umem), access);
-		if (ret)
-			goto umem_kfree;
-		return umem;
-	}
 
 	page_list = (struct page **) __get_free_page(GFP_KERNEL);
 	if (!page_list) {
···
 }
 EXPORT_SYMBOL(ib_umem_get);
 
-static void __ib_umem_release_tail(struct ib_umem *umem)
-{
-	mmdrop(umem->owning_mm);
-	if (umem->is_odp)
-		kfree(to_ib_umem_odp(umem));
-	else
-		kfree(umem);
-}
-
 /**
  * ib_umem_release - release memory pinned with ib_umem_get
  * @umem: umem struct to release
···
 {
 	if (!umem)
 		return;
+	if (umem->is_odp)
+		return ib_umem_odp_release(to_ib_umem_odp(umem));
 
-	if (umem->is_odp) {
-		ib_umem_odp_release(to_ib_umem_odp(umem));
-		__ib_umem_release_tail(umem);
-		return;
-	}
-
-	__ib_umem_release(umem->context->device, umem, 1);
+	__ib_umem_release(umem->ibdev, umem, 1);
 
 	atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
-	__ib_umem_release_tail(umem);
+	mmdrop(umem->owning_mm);
+	kfree(umem);
 }
 EXPORT_SYMBOL(ib_umem_release);
drivers/infiniband/core/umem_odp.c (+276 -280)
···
 #include <linux/export.h>
 #include <linux/vmalloc.h>
 #include <linux/hugetlb.h>
-#include <linux/interval_tree_generic.h>
+#include <linux/interval_tree.h>
 #include <linux/pagemap.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_umem_odp.h>
 
-/*
- * The ib_umem list keeps track of memory regions for which the HW
- * device request to receive notification when the related memory
- * mapping is changed.
- *
- * ib_umem_lock protects the list.
- */
-
-static u64 node_start(struct umem_odp_node *n)
-{
-	struct ib_umem_odp *umem_odp =
-		container_of(n, struct ib_umem_odp, interval_tree);
-
-	return ib_umem_start(umem_odp);
-}
-
-/* Note that the representation of the intervals in the interval tree
- * considers the ending point as contained in the interval, while the
- * function ib_umem_end returns the first address which is not contained
- * in the umem.
- */
-static u64 node_last(struct umem_odp_node *n)
-{
-	struct ib_umem_odp *umem_odp =
-		container_of(n, struct ib_umem_odp, interval_tree);
-
-	return ib_umem_end(umem_odp) - 1;
-}
-
-INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
-		     node_start, node_last, static, rbt_ib_umem)
+#include "uverbs.h"
 
 static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
 {
···
 	mutex_unlock(&umem_odp->umem_mutex);
 }
 
-static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
-					       u64 start, u64 end, void *cookie)
-{
-	/*
-	 * Increase the number of notifiers running, to
-	 * prevent any further fault handling on this MR.
-	 */
-	ib_umem_notifier_start_account(umem_odp);
-	complete_all(&umem_odp->notifier_completion);
-	umem_odp->umem.context->invalidate_range(
-		umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp));
-	return 0;
-}
-
 static void ib_umem_notifier_release(struct mmu_notifier *mn,
 				     struct mm_struct *mm)
 {
 	struct ib_ucontext_per_mm *per_mm =
 		container_of(mn, struct ib_ucontext_per_mm, mn);
+	struct rb_node *node;
 
 	down_read(&per_mm->umem_rwsem);
-	if (per_mm->active)
-		rbt_ib_umem_for_each_in_range(
-			&per_mm->umem_tree, 0, ULLONG_MAX,
-			ib_umem_notifier_release_trampoline, true, NULL);
+	if (!per_mm->mn.users)
+		goto out;
+
+	for (node = rb_first_cached(&per_mm->umem_tree); node;
+	     node = rb_next(node)) {
+		struct ib_umem_odp *umem_odp =
+			rb_entry(node, struct ib_umem_odp, interval_tree.rb);
+
+		/*
+		 * Increase the number of notifiers running, to prevent any
+		 * further fault handling on this MR.
+		 */
+		ib_umem_notifier_start_account(umem_odp);
+		complete_all(&umem_odp->notifier_completion);
+		umem_odp->umem.ibdev->ops.invalidate_range(
+			umem_odp, ib_umem_start(umem_odp),
+			ib_umem_end(umem_odp));
+	}
+
+out:
 	up_read(&per_mm->umem_rwsem);
 }
 
···
 					     u64 start, u64 end, void *cookie)
 {
 	ib_umem_notifier_start_account(item);
-	item->umem.context->invalidate_range(item, start, end);
+	item->umem.ibdev->ops.invalidate_range(item, start, end);
 	return 0;
 }
 
···
 	else if (!down_read_trylock(&per_mm->umem_rwsem))
 		return -EAGAIN;
 
-	if (!per_mm->active) {
+	if (!per_mm->mn.users) {
 		up_read(&per_mm->umem_rwsem);
 		/*
-		 * At this point active is permanently set and visible to this
+		 * At this point users is permanently zero and visible to this
 		 * CPU without a lock, that fact is relied on to skip the unlock
 		 * in range_end.
 		 */
···
 	struct ib_ucontext_per_mm *per_mm =
 		container_of(mn, struct ib_ucontext_per_mm, mn);
 
-	if (unlikely(!per_mm->active))
+	if (unlikely(!per_mm->mn.users))
 		return;
 
 	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
···
 	up_read(&per_mm->umem_rwsem);
 }
 
-static const struct mmu_notifier_ops ib_umem_notifiers = {
-	.release = ib_umem_notifier_release,
-	.invalidate_range_start = ib_umem_notifier_invalidate_range_start,
-	.invalidate_range_end = ib_umem_notifier_invalidate_range_end,
-};
-
-static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
-{
-	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
-
-	down_write(&per_mm->umem_rwsem);
-	if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp)))
-		rbt_ib_umem_insert(&umem_odp->interval_tree,
-				   &per_mm->umem_tree);
-	up_write(&per_mm->umem_rwsem);
-}
-
-static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
-{
-	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
-
-	down_write(&per_mm->umem_rwsem);
-	if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp)))
-		rbt_ib_umem_remove(&umem_odp->interval_tree,
-				   &per_mm->umem_tree);
-	complete_all(&umem_odp->notifier_completion);
-
-	up_write(&per_mm->umem_rwsem);
-}
-
-static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,
-					       struct mm_struct *mm)
+static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm)
 {
 	struct ib_ucontext_per_mm *per_mm;
-	int ret;
 
 	per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL);
 	if (!per_mm)
 		return ERR_PTR(-ENOMEM);
 
-	per_mm->context = ctx;
-	per_mm->mm = mm;
 	per_mm->umem_tree = RB_ROOT_CACHED;
 	init_rwsem(&per_mm->umem_rwsem);
-	per_mm->active = true;
 
+	WARN_ON(mm != current->mm);
 	rcu_read_lock();
 	per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
 	rcu_read_unlock();
-
-	WARN_ON(mm != current->mm);
-
-	per_mm->mn.ops = &ib_umem_notifiers;
-	ret = mmu_notifier_register(&per_mm->mn, per_mm->mm);
-	if (ret) {
-		dev_err(&ctx->device->dev,
-			"Failed to register mmu_notifier %d\n", ret);
-		goto out_pid;
-	}
-
-	list_add(&per_mm->ucontext_list, &ctx->per_mm_list);
-	return per_mm;
-
-out_pid:
-	put_pid(per_mm->tgid);
-	kfree(per_mm);
-	return ERR_PTR(ret);
+	return &per_mm->mn;
 }
 
-static int get_per_mm(struct ib_umem_odp *umem_odp)
+static void ib_umem_free_notifier(struct mmu_notifier *mn)
 {
-	struct ib_ucontext *ctx = umem_odp->umem.context;
-	struct ib_ucontext_per_mm *per_mm;
-
-	/*
-	 * Generally speaking we expect only one or two per_mm in this list,
-	 * so no reason to optimize this search today.
-	 */
-	mutex_lock(&ctx->per_mm_list_lock);
-	list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) {
-		if (per_mm->mm == umem_odp->umem.owning_mm)
-			goto found;
-	}
-
-	per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm);
-	if (IS_ERR(per_mm)) {
-		mutex_unlock(&ctx->per_mm_list_lock);
-		return PTR_ERR(per_mm);
-	}
-
-found:
-	umem_odp->per_mm = per_mm;
-	per_mm->odp_mrs_count++;
-	mutex_unlock(&ctx->per_mm_list_lock);
-
-	return 0;
-}
-
-static void free_per_mm(struct rcu_head *rcu)
-{
-	kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu));
-}
-
-static void put_per_mm(struct ib_umem_odp *umem_odp)
-{
-	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
-	struct ib_ucontext *ctx = umem_odp->umem.context;
-	bool need_free;
-
-	mutex_lock(&ctx->per_mm_list_lock);
-	umem_odp->per_mm = NULL;
-	per_mm->odp_mrs_count--;
-	need_free = per_mm->odp_mrs_count == 0;
-	if (need_free)
-		list_del(&per_mm->ucontext_list);
-	mutex_unlock(&ctx->per_mm_list_lock);
-
-	if (!need_free)
-		return;
-
-	/*
-	 * NOTE! mmu_notifier_unregister() can happen between a start/end
-	 * callback, resulting in an start/end, and thus an unbalanced
-	 * lock. This doesn't really matter to us since we are about to kfree
-	 * the memory that holds the lock, however LOCKDEP doesn't like this.
-	 */
-	down_write(&per_mm->umem_rwsem);
-	per_mm->active = false;
-	up_write(&per_mm->umem_rwsem);
+	struct ib_ucontext_per_mm *per_mm =
+		container_of(mn, struct ib_ucontext_per_mm, mn);
 
 	WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root));
-	mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm);
+
 	put_pid(per_mm->tgid);
-	mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm);
+	kfree(per_mm);
 }
 
-struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root,
-				      unsigned long addr, size_t size)
+static const struct mmu_notifier_ops ib_umem_notifiers = {
+	.release = ib_umem_notifier_release,
+	.invalidate_range_start = ib_umem_notifier_invalidate_range_start,
+	.invalidate_range_end = ib_umem_notifier_invalidate_range_end,
+	.alloc_notifier = ib_umem_alloc_notifier,
+	.free_notifier = ib_umem_free_notifier,
+};
+
+static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp)
 {
-	struct ib_ucontext_per_mm *per_mm = root->per_mm;
-	struct ib_ucontext *ctx = per_mm->context;
+	struct ib_ucontext_per_mm *per_mm;
+	struct mmu_notifier *mn;
+	int ret;
+
+	umem_odp->umem.is_odp = 1;
+	if (!umem_odp->is_implicit_odp) {
+		size_t page_size = 1UL << umem_odp->page_shift;
+		size_t pages;
+
+		umem_odp->interval_tree.start =
+			ALIGN_DOWN(umem_odp->umem.address, page_size);
+		if (check_add_overflow(umem_odp->umem.address,
+				       umem_odp->umem.length,
+				       &umem_odp->interval_tree.last))
+			return -EOVERFLOW;
+		umem_odp->interval_tree.last =
+			ALIGN(umem_odp->interval_tree.last, page_size);
+		if (unlikely(umem_odp->interval_tree.last < page_size))
+			return -EOVERFLOW;
+
+		pages = (umem_odp->interval_tree.last -
+			 umem_odp->interval_tree.start) >>
+			umem_odp->page_shift;
+		if (!pages)
+			return -EINVAL;
+
+		/*
+		 * Note that the representation of the intervals in the
+		 * interval tree considers the ending point as contained in
+		 * the interval.
+		 */
+		umem_odp->interval_tree.last--;
+
+		umem_odp->page_list = kvcalloc(
+			pages, sizeof(*umem_odp->page_list), GFP_KERNEL);
+		if (!umem_odp->page_list)
+			return -ENOMEM;
+
+		umem_odp->dma_list = kvcalloc(
+			pages, sizeof(*umem_odp->dma_list), GFP_KERNEL);
+		if (!umem_odp->dma_list) {
+			ret = -ENOMEM;
+			goto out_page_list;
+		}
+	}
+
+	mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm);
+	if (IS_ERR(mn)) {
+		ret = PTR_ERR(mn);
+		goto out_dma_list;
+	}
+	umem_odp->per_mm = per_mm =
+		container_of(mn, struct ib_ucontext_per_mm, mn);
+
+	mutex_init(&umem_odp->umem_mutex);
+	init_completion(&umem_odp->notifier_completion);
+
+	if (!umem_odp->is_implicit_odp) {
+		down_write(&per_mm->umem_rwsem);
+		interval_tree_insert(&umem_odp->interval_tree,
+				     &per_mm->umem_tree);
+		up_write(&per_mm->umem_rwsem);
+	}
+	mmgrab(umem_odp->umem.owning_mm);
+
+	return 0;
+
+out_dma_list:
+	kvfree(umem_odp->dma_list);
+out_page_list:
+	kvfree(umem_odp->page_list);
+	return ret;
+}
+
+/**
+ * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
+ *
+ * Implicit ODP umems do not have a VA range and do not have any page lists.
+ * They exist only to hold the per_mm reference to help the driver create
+ * children umems.
+ *
+ * @udata: udata from the syscall being used to create the umem
+ * @access: ib_reg_mr access flags
+ */
+struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
+					       int access)
+{
+	struct ib_ucontext *context =
+		container_of(udata, struct uverbs_attr_bundle, driver_udata)
+			->context;
+	struct ib_umem *umem;
+	struct ib_umem_odp *umem_odp;
+	int ret;
+
+	if (access & IB_ACCESS_HUGETLB)
+		return ERR_PTR(-EINVAL);
+
+	if (!context)
+		return ERR_PTR(-EIO);
+	if (WARN_ON_ONCE(!context->device->ops.invalidate_range))
+		return ERR_PTR(-EINVAL);
+
+	umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
+	if (!umem_odp)
+		return ERR_PTR(-ENOMEM);
+	umem = &umem_odp->umem;
+	umem->ibdev = context->device;
+	umem->writable = ib_access_writable(access);
+	umem->owning_mm = current->mm;
+	umem_odp->is_implicit_odp = 1;
+	umem_odp->page_shift = PAGE_SHIFT;
+
+	ret = ib_init_umem_odp(umem_odp);
+	if (ret) {
+		kfree(umem_odp);
+		return ERR_PTR(ret);
+	}
+	return umem_odp;
+}
+EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
+
+/**
+ * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
+ *                           parent ODP umem
+ *
+ * @root: The parent umem enclosing the child. This must be allocated using
+ *        ib_alloc_implicit_odp_umem()
+ * @addr: The starting userspace VA
+ * @size: The length of the userspace VA
+ */
+struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root,
+					    unsigned long addr, size_t size)
+{
+	/*
+	 * Caller must ensure that root cannot be freed during the call to
+	 * ib_alloc_odp_umem.
+	 */
 	struct ib_umem_odp *odp_data;
 	struct ib_umem *umem;
-	int pages = size >> PAGE_SHIFT;
 	int ret;
+
+	if (WARN_ON(!root->is_implicit_odp))
+		return ERR_PTR(-EINVAL);
 
 	odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
 	if (!odp_data)
 		return ERR_PTR(-ENOMEM);
 	umem = &odp_data->umem;
-	umem->context = ctx;
+	umem->ibdev = root->umem.ibdev;
 	umem->length = size;
 	umem->address = addr;
-	odp_data->page_shift = PAGE_SHIFT;
 	umem->writable = root->umem.writable;
-	umem->is_odp = 1;
-	odp_data->per_mm = per_mm;
-	umem->owning_mm = per_mm->mm;
-	mmgrab(umem->owning_mm);
+	umem->owning_mm = root->umem.owning_mm;
+	odp_data->page_shift = PAGE_SHIFT;
 
-	mutex_init(&odp_data->umem_mutex);
-	init_completion(&odp_data->notifier_completion);
-
-	odp_data->page_list =
-		vzalloc(array_size(pages, sizeof(*odp_data->page_list)));
-	if (!odp_data->page_list) {
-		ret = -ENOMEM;
-		goto out_odp_data;
+	ret = ib_init_umem_odp(odp_data);
+	if (ret) {
+		kfree(odp_data);
+		return ERR_PTR(ret);
 	}
-
-	odp_data->dma_list =
-		vzalloc(array_size(pages, sizeof(*odp_data->dma_list)));
-	if (!odp_data->dma_list) {
-		ret = -ENOMEM;
-		goto out_page_list;
-	}
-
-	/*
-	 * Caller must ensure that the umem_odp that the per_mm came from
-	 * cannot be freed during the call to ib_alloc_odp_umem.
350 - */ 351 - mutex_lock(&ctx->per_mm_list_lock); 352 - per_mm->odp_mrs_count++; 353 - mutex_unlock(&ctx->per_mm_list_lock); 354 - add_umem_to_per_mm(odp_data); 355 - 356 396 return odp_data; 357 - 358 - out_page_list: 359 - vfree(odp_data->page_list); 360 - out_odp_data: 361 - mmdrop(umem->owning_mm); 362 - kfree(odp_data); 363 - return ERR_PTR(ret); 364 397 } 365 - EXPORT_SYMBOL(ib_alloc_odp_umem); 398 + EXPORT_SYMBOL(ib_umem_odp_alloc_child); 366 399 367 - int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) 400 + /** 401 + * ib_umem_odp_get - Create a umem_odp for a userspace va 402 + * 403 + * @udata: userspace context to pin memory for 404 + * @addr: userspace virtual address to start at 405 + * @size: length of region to pin 406 + * @access: IB_ACCESS_xxx flags for memory being pinned 407 + * 408 + * The driver should use when the access flags indicate ODP memory. It avoids 409 + * pinning, instead, stores the mm for future page fault handling in 410 + * conjunction with MMU notifiers. 
411 + */ 412 + struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, 413 + size_t size, int access) 368 414 { 369 - struct ib_umem *umem = &umem_odp->umem; 370 - /* 371 - * NOTE: This must called in a process context where umem->owning_mm 372 - * == current->mm 373 - */ 374 - struct mm_struct *mm = umem->owning_mm; 375 - int ret_val; 415 + struct ib_umem_odp *umem_odp; 416 + struct ib_ucontext *context; 417 + struct mm_struct *mm; 418 + int ret; 419 + 420 + if (!udata) 421 + return ERR_PTR(-EIO); 422 + 423 + context = container_of(udata, struct uverbs_attr_bundle, driver_udata) 424 + ->context; 425 + if (!context) 426 + return ERR_PTR(-EIO); 427 + 428 + if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) || 429 + WARN_ON_ONCE(!context->device->ops.invalidate_range)) 430 + return ERR_PTR(-EINVAL); 431 + 432 + umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); 433 + if (!umem_odp) 434 + return ERR_PTR(-ENOMEM); 435 + 436 + umem_odp->umem.ibdev = context->device; 437 + umem_odp->umem.length = size; 438 + umem_odp->umem.address = addr; 439 + umem_odp->umem.writable = ib_access_writable(access); 440 + umem_odp->umem.owning_mm = mm = current->mm; 376 441 377 442 umem_odp->page_shift = PAGE_SHIFT; 378 443 if (access & IB_ACCESS_HUGETLB) { ··· 421 410 vma = find_vma(mm, ib_umem_start(umem_odp)); 422 411 if (!vma || !is_vm_hugetlb_page(vma)) { 423 412 up_read(&mm->mmap_sem); 424 - return -EINVAL; 413 + ret = -EINVAL; 414 + goto err_free; 425 415 } 426 416 h = hstate_vma(vma); 427 417 umem_odp->page_shift = huge_page_shift(h); 428 418 up_read(&mm->mmap_sem); 429 419 } 430 420 431 - mutex_init(&umem_odp->umem_mutex); 421 + ret = ib_init_umem_odp(umem_odp); 422 + if (ret) 423 + goto err_free; 424 + return umem_odp; 432 425 433 - init_completion(&umem_odp->notifier_completion); 434 - 435 - if (ib_umem_odp_num_pages(umem_odp)) { 436 - umem_odp->page_list = 437 - vzalloc(array_size(sizeof(*umem_odp->page_list), 438 - 
ib_umem_odp_num_pages(umem_odp))); 439 - if (!umem_odp->page_list) 440 - return -ENOMEM; 441 - 442 - umem_odp->dma_list = 443 - vzalloc(array_size(sizeof(*umem_odp->dma_list), 444 - ib_umem_odp_num_pages(umem_odp))); 445 - if (!umem_odp->dma_list) { 446 - ret_val = -ENOMEM; 447 - goto out_page_list; 448 - } 449 - } 450 - 451 - ret_val = get_per_mm(umem_odp); 452 - if (ret_val) 453 - goto out_dma_list; 454 - add_umem_to_per_mm(umem_odp); 455 - 456 - return 0; 457 - 458 - out_dma_list: 459 - vfree(umem_odp->dma_list); 460 - out_page_list: 461 - vfree(umem_odp->page_list); 462 - return ret_val; 426 + err_free: 427 + kfree(umem_odp); 428 + return ERR_PTR(ret); 463 429 } 430 + EXPORT_SYMBOL(ib_umem_odp_get); 464 431 465 432 void ib_umem_odp_release(struct ib_umem_odp *umem_odp) 466 433 { 434 + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; 435 + 467 436 /* 468 437 * Ensure that no more pages are mapped in the umem. 469 438 * 470 439 * It is the driver's responsibility to ensure, before calling us, 471 440 * that the hardware will not attempt to access the MR any more. 472 441 */ 473 - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), 474 - ib_umem_end(umem_odp)); 442 + if (!umem_odp->is_implicit_odp) { 443 + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), 444 + ib_umem_end(umem_odp)); 445 + kvfree(umem_odp->dma_list); 446 + kvfree(umem_odp->page_list); 447 + } 475 448 476 - remove_umem_from_per_mm(umem_odp); 477 - put_per_mm(umem_odp); 478 - vfree(umem_odp->dma_list); 479 - vfree(umem_odp->page_list); 449 + down_write(&per_mm->umem_rwsem); 450 + if (!umem_odp->is_implicit_odp) { 451 + interval_tree_remove(&umem_odp->interval_tree, 452 + &per_mm->umem_tree); 453 + complete_all(&umem_odp->notifier_completion); 454 + } 455 + /* 456 + * NOTE! mmu_notifier_unregister() can happen between a start/end 457 + * callback, resulting in a missing end, and thus an unbalanced 458 + * lock. 
This doesn't really matter to us since we are about to kfree 459 + * the memory that holds the lock, however LOCKDEP doesn't like this. 460 + * Thus we call the mmu_notifier_put under the rwsem and test the 461 + * internal users count to reliably see if we are past this point. 462 + */ 463 + mmu_notifier_put(&per_mm->mn); 464 + up_write(&per_mm->umem_rwsem); 465 + 466 + mmdrop(umem_odp->umem.owning_mm); 467 + kfree(umem_odp); 480 468 } 469 + EXPORT_SYMBOL(ib_umem_odp_release); 481 470 482 471 /* 483 472 * Map for DMA and insert a single page into the on-demand paging page tables. ··· 504 493 u64 access_mask, 505 494 unsigned long current_seq) 506 495 { 507 - struct ib_ucontext *context = umem_odp->umem.context; 508 - struct ib_device *dev = context->device; 496 + struct ib_device *dev = umem_odp->umem.ibdev; 509 497 dma_addr_t dma_addr; 510 498 int remove_existing_mapping = 0; 511 499 int ret = 0; ··· 544 534 545 535 if (remove_existing_mapping) { 546 536 ib_umem_notifier_start_account(umem_odp); 547 - context->invalidate_range( 537 + dev->ops.invalidate_range( 548 538 umem_odp, 549 539 ib_umem_start(umem_odp) + 550 540 (page_index << umem_odp->page_shift), ··· 717 707 { 718 708 int idx; 719 709 u64 addr; 720 - struct ib_device *dev = umem_odp->umem.context->device; 710 + struct ib_device *dev = umem_odp->umem.ibdev; 721 711 722 712 virt = max_t(u64, virt, ib_umem_start(umem_odp)); 723 713 bound = min_t(u64, bound, ib_umem_end(umem_odp)); ··· 771 761 void *cookie) 772 762 { 773 763 int ret_val = 0; 774 - struct umem_odp_node *node, *next; 764 + struct interval_tree_node *node, *next; 775 765 struct ib_umem_odp *umem; 776 766 777 767 if (unlikely(start == last)) 778 768 return ret_val; 779 769 780 - for (node = rbt_ib_umem_iter_first(root, start, last - 1); 770 + for (node = interval_tree_iter_first(root, start, last - 1); 781 771 node; node = next) { 782 772 /* TODO move the blockable decision up to the callback */ 783 773 if (!blockable) 784 774 return -EAGAIN; 
785 - next = rbt_ib_umem_iter_next(node, start, last - 1); 775 + next = interval_tree_iter_next(node, start, last - 1); 786 776 umem = container_of(node, struct ib_umem_odp, interval_tree); 787 777 ret_val = cb(umem, start, last, cookie) || ret_val; 788 778 } 789 779 790 780 return ret_val; 791 781 } 792 - EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); 793 - 794 - struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, 795 - u64 addr, u64 length) 796 - { 797 - struct umem_odp_node *node; 798 - 799 - node = rbt_ib_umem_iter_first(root, addr, addr + length - 1); 800 - if (node) 801 - return container_of(node, struct ib_umem_odp, interval_tree); 802 - return NULL; 803 - 804 - } 805 - EXPORT_SYMBOL(rbt_ib_umem_lookup);
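The new `ib_init_umem_odp()` above stores the umem's VA range in an interval tree whose intervals are end-inclusive (hence the final `interval_tree.last--`), after rounding the range out to the page size and rejecting arithmetic overflow with `check_add_overflow()`. A minimal user-space sketch of that arithmetic (function name hypothetical; GCC/Clang `__builtin_add_overflow` stands in for the kernel helper):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define ALIGN_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))
#define ALIGN_UP(x, a)   (((x) + ((uint64_t)(a) - 1)) & ~((uint64_t)(a) - 1))

/* Compute the closed interval [start, last] covering addr..addr+length,
 * rounded out to page_size, as ib_init_umem_odp() does. Returns false on
 * arithmetic overflow. */
static bool va_range_to_interval(uint64_t addr, uint64_t length,
				 uint64_t page_size,
				 uint64_t *start, uint64_t *last)
{
	uint64_t end;

	*start = ALIGN_DOWN(addr, page_size);
	if (__builtin_add_overflow(addr, length, &end))
		return false;
	end = ALIGN_UP(end, page_size);
	if (end < page_size)	/* ALIGN_UP wrapped past zero */
		return false;
	*last = end - 1;	/* the interval tree treats 'last' as inclusive */
	return true;
}
```

The inclusive representation is why later lookups in this file query `interval_tree_iter_first(root, start, last - 1)` for an end-exclusive range.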
drivers/infiniband/core/uverbs_cmd.c (-5)
··· 252 252 ucontext->closing = false; 253 253 ucontext->cleanup_retryable = false; 254 254 255 - mutex_init(&ucontext->per_mm_list_lock); 256 - INIT_LIST_HEAD(&ucontext->per_mm_list); 257 - 258 255 ret = get_unused_fd_flags(O_CLOEXEC); 259 256 if (ret < 0) 260 257 goto err_free; ··· 272 275 ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata); 273 276 if (ret) 274 277 goto err_file; 275 - if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) 276 - ucontext->invalidate_range = NULL; 277 278 278 279 rdma_restrack_uadd(&ucontext->res); 279 280
drivers/infiniband/core/uverbs_main.c (+1)
··· 1487 1487 IB_UVERBS_NUM_FIXED_MINOR); 1488 1488 unregister_chrdev_region(dynamic_uverbs_dev, 1489 1489 IB_UVERBS_NUM_DYNAMIC_MINOR); 1490 + mmu_notifier_synchronize(); 1490 1491 } 1491 1492 1492 1493 module_init(ib_uverbs_init);
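The single added line in module exit matters because `mmu_notifier_put()` defers the `free_notifier` callback (via SRCU in the kernel); a module must call `mmu_notifier_synchronize()` before its text is unloaded so no deferred callback runs afterwards. A user-space toy model of that deferred-free contract (all names invented; the kernel's mechanism is SRCU, not a list):

```c
#include <assert.h>
#include <stdlib.h>

/* Toy model: put() only queues the object, and synchronize() runs the
 * queued free callbacks, mirroring mmu_notifier_put() followed by
 * mmu_notifier_synchronize() in module_exit. */
struct notifier {
	void (*free_cb)(struct notifier *);
	struct notifier *next_pending;
};

static struct notifier *pending;
static int frees_run;

static void put_notifier(struct notifier *n)
{
	n->next_pending = pending;	/* defer: callback runs later */
	pending = n;
}

static void synchronize(void)
{
	while (pending) {
		struct notifier *n = pending;

		pending = n->next_pending;
		n->free_cb(n);	/* callback may live in module text */
		frees_run++;
	}
}

static void do_free(struct notifier *n) { free(n); }

static struct notifier *make_notifier(void)
{
	struct notifier *n = malloc(sizeof(*n));

	n->free_cb = do_free;
	return n;
}
```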
drivers/infiniband/hw/mlx5/main.c (-9)
··· 1867 1867 if (err) 1868 1868 goto out_sys_pages; 1869 1869 1870 - if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING) 1871 - context->ibucontext.invalidate_range = 1872 - &mlx5_ib_invalidate_range; 1873 - 1874 1870 if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { 1875 1871 err = mlx5_ib_devx_create(dev, true); 1876 1872 if (err < 0) ··· 1994 1998 struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); 1995 1999 struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); 1996 2000 struct mlx5_bfreg_info *bfregi; 1997 - 1998 - /* All umem's must be destroyed before destroying the ucontext. */ 1999 - mutex_lock(&ibcontext->per_mm_list_lock); 2000 - WARN_ON(!list_empty(&ibcontext->per_mm_list)); 2001 - mutex_unlock(&ibcontext->per_mm_list_lock); 2002 2001 2003 2002 bfregi = &context->bfregi; 2004 2003 mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
drivers/infiniband/hw/mlx5/mem.c (-13)
··· 56 56 struct scatterlist *sg; 57 57 int entry; 58 58 59 - if (umem->is_odp) { 60 - struct ib_umem_odp *odp = to_ib_umem_odp(umem); 61 - unsigned int page_shift = odp->page_shift; 62 - 63 - *ncont = ib_umem_odp_num_pages(odp); 64 - *count = *ncont << (page_shift - PAGE_SHIFT); 65 - *shift = page_shift; 66 - if (order) 67 - *order = ilog2(roundup_pow_of_two(*ncont)); 68 - 69 - return; 70 - } 71 - 72 59 addr = addr >> PAGE_SHIFT; 73 60 tmp = (unsigned long)addr; 74 61 m = find_first_bit(&tmp, BITS_PER_LONG);
drivers/infiniband/hw/mlx5/mr.c (+28 -10)
··· 784 784 int *ncont, int *order) 785 785 { 786 786 struct ib_umem *u; 787 - int err; 788 787 789 788 *umem = NULL; 790 789 791 - u = ib_umem_get(udata, start, length, access_flags, 0); 792 - err = PTR_ERR_OR_ZERO(u); 793 - if (err) { 794 - mlx5_ib_dbg(dev, "umem get failed (%d)\n", err); 795 - return err; 790 + if (access_flags & IB_ACCESS_ON_DEMAND) { 791 + struct ib_umem_odp *odp; 792 + 793 + odp = ib_umem_odp_get(udata, start, length, access_flags); 794 + if (IS_ERR(odp)) { 795 + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", 796 + PTR_ERR(odp)); 797 + return PTR_ERR(odp); 798 + } 799 + 800 + u = &odp->umem; 801 + 802 + *page_shift = odp->page_shift; 803 + *ncont = ib_umem_odp_num_pages(odp); 804 + *npages = *ncont << (*page_shift - PAGE_SHIFT); 805 + if (order) 806 + *order = ilog2(roundup_pow_of_two(*ncont)); 807 + } else { 808 + u = ib_umem_get(udata, start, length, access_flags, 0); 809 + if (IS_ERR(u)) { 810 + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u)); 811 + return PTR_ERR(u); 812 + } 813 + 814 + mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages, 815 + page_shift, ncont, order); 796 816 } 797 817 798 - mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages, 799 - page_shift, ncont, order); 800 818 if (!*npages) { 801 819 mlx5_ib_warn(dev, "avoid zero region\n"); 802 820 ib_umem_release(u); ··· 1617 1599 /* Wait for all running page-fault handlers to finish. */ 1618 1600 synchronize_srcu(&dev->mr_srcu); 1619 1601 /* Destroy all page mappings */ 1620 - if (umem_odp->page_list) 1602 + if (!umem_odp->is_implicit_odp) 1621 1603 mlx5_ib_invalidate_range(umem_odp, 1622 1604 ib_umem_start(umem_odp), 1623 1605 ib_umem_end(umem_odp)); ··· 1628 1610 * so that there will not be any invalidations in 1629 1611 * flight, looking at the *mr struct. 1630 1612 */ 1631 - ib_umem_release(umem); 1613 + ib_umem_odp_release(umem_odp); 1632 1614 atomic_sub(npages, &dev->mdev->priv.reg_pages); 1633 1615 1634 1616 /* Avoid double-freeing the umem. 
*/
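The ODP branch that `mr_umem_get()` gains above derives the MTT accounting values directly from the ODP page count and page shift, instead of routing through `mlx5_ib_cont_pages()`. A small user-space sketch of just that arithmetic (function names approximate; `PAGE_SHIFT` assumed 12):

```c
#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12

/* Round v up to the next power of two (v > 0). */
static uint64_t roundup_pow_of_two(uint64_t v)
{
	return v <= 1 ? 1 : 1ULL << (64 - __builtin_clzll(v - 1));
}

static unsigned int ilog2_u64(uint64_t v)
{
	return 63 - __builtin_clzll(v);
}

/* Mirror of the ODP branch in mr_umem_get(): npages counts PAGE_SIZE
 * units, ncont counts units of 1 << page_shift, and order is the
 * power-of-two allocation order for ncont entries. */
static void odp_page_accounting(uint64_t ncont, unsigned int page_shift,
				uint64_t *npages, unsigned int *order)
{
	*npages = ncont << (page_shift - PAGE_SHIFT);
	*order = ilog2_u64(roundup_pow_of_two(ncont));
}
```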
drivers/infiniband/hw/mlx5/odp.c (+43 -47)
··· 184 184 for (i = 0; i < nentries; i++, pklm++) { 185 185 pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); 186 186 va = (offset + i) * MLX5_IMR_MTT_SIZE; 187 - if (odp && odp->umem.address == va) { 187 + if (odp && ib_umem_start(odp) == va) { 188 188 struct mlx5_ib_mr *mtt = odp->private; 189 189 190 190 pklm->key = cpu_to_be32(mtt->ibmr.lkey); ··· 206 206 mr->parent = NULL; 207 207 synchronize_srcu(&mr->dev->mr_srcu); 208 208 209 - ib_umem_release(&odp->umem); 209 + ib_umem_odp_release(odp); 210 210 if (imr->live) 211 211 mlx5_ib_update_xlt(imr, idx, 1, 0, 212 212 MLX5_IB_UPD_XLT_INDIRECT | ··· 386 386 } 387 387 388 388 static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, 389 - struct ib_umem *umem, 389 + struct ib_umem_odp *umem_odp, 390 390 bool ksm, int access_flags) 391 391 { 392 392 struct mlx5_ib_dev *dev = to_mdev(pd->device); ··· 404 404 mr->dev = dev; 405 405 mr->access_flags = access_flags; 406 406 mr->mmkey.iova = 0; 407 - mr->umem = umem; 407 + mr->umem = &umem_odp->umem; 408 408 409 409 if (ksm) { 410 410 err = mlx5_ib_update_xlt(mr, 0, ··· 464 464 if (nentries) 465 465 nentries++; 466 466 } else { 467 - odp = ib_alloc_odp_umem(odp_mr, addr, 468 - MLX5_IMR_MTT_SIZE); 467 + odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE); 469 468 if (IS_ERR(odp)) { 470 469 mutex_unlock(&odp_mr->umem_mutex); 471 470 return ERR_CAST(odp); 472 471 } 473 472 474 - mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0, 473 + mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0, 475 474 mr->access_flags); 476 475 if (IS_ERR(mtt)) { 477 476 mutex_unlock(&odp_mr->umem_mutex); 478 - ib_umem_release(&odp->umem); 477 + ib_umem_odp_release(odp); 479 478 return ERR_CAST(mtt); 480 479 } 481 480 ··· 496 497 addr += MLX5_IMR_MTT_SIZE; 497 498 if (unlikely(addr < io_virt + bcnt)) { 498 499 odp = odp_next(odp); 499 - if (odp && odp->umem.address != addr) 500 + if (odp && ib_umem_start(odp) != addr) 500 501 odp = NULL; 501 502 goto next_mr; 502 503 } ··· 520 521 int 
access_flags) 521 522 { 522 523 struct mlx5_ib_mr *imr; 523 - struct ib_umem *umem; 524 + struct ib_umem_odp *umem_odp; 524 525 525 - umem = ib_umem_get(udata, 0, 0, access_flags, 0); 526 - if (IS_ERR(umem)) 527 - return ERR_CAST(umem); 526 + umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags); 527 + if (IS_ERR(umem_odp)) 528 + return ERR_CAST(umem_odp); 528 529 529 - imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags); 530 + imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags); 530 531 if (IS_ERR(imr)) { 531 - ib_umem_release(umem); 532 + ib_umem_odp_release(umem_odp); 532 533 return ERR_CAST(imr); 533 534 } 534 535 535 - imr->umem = umem; 536 + imr->umem = &umem_odp->umem; 536 537 init_waitqueue_head(&imr->q_leaf_free); 537 538 atomic_set(&imr->num_leaf_free, 0); 538 539 atomic_set(&imr->num_pending_prefetch, 0); ··· 540 541 return imr; 541 542 } 542 543 543 - static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, 544 - void *cookie) 545 - { 546 - struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; 547 - 548 - if (mr->parent != imr) 549 - return 0; 550 - 551 - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), 552 - ib_umem_end(umem_odp)); 553 - 554 - if (umem_odp->dying) 555 - return 0; 556 - 557 - WRITE_ONCE(umem_odp->dying, 1); 558 - atomic_inc(&imr->num_leaf_free); 559 - schedule_work(&umem_odp->work); 560 - 561 - return 0; 562 - } 563 - 564 544 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) 565 545 { 566 546 struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); 547 + struct rb_node *node; 567 548 568 549 down_read(&per_mm->umem_rwsem); 569 - rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, 570 - mr_leaf_free, true, imr); 550 + for (node = rb_first_cached(&per_mm->umem_tree); node; 551 + node = rb_next(node)) { 552 + struct ib_umem_odp *umem_odp = 553 + rb_entry(node, struct ib_umem_odp, interval_tree.rb); 554 + struct mlx5_ib_mr *mr = umem_odp->private; 555 + 556 + if 
(mr->parent != imr) 557 + continue; 558 + 559 + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), 560 + ib_umem_end(umem_odp)); 561 + 562 + if (umem_odp->dying) 563 + continue; 564 + 565 + WRITE_ONCE(umem_odp->dying, 1); 566 + atomic_inc(&imr->num_leaf_free); 567 + schedule_work(&umem_odp->work); 568 + } 571 569 up_read(&per_mm->umem_rwsem); 572 570 573 571 wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); ··· 585 589 struct ib_umem_odp *odp; 586 590 size_t size; 587 591 588 - if (!odp_mr->page_list) { 592 + if (odp_mr->is_implicit_odp) { 589 593 odp = implicit_mr_get_data(mr, io_virt, bcnt); 590 594 591 595 if (IS_ERR(odp)) ··· 603 607 start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; 604 608 access_mask = ODP_READ_ALLOWED_BIT; 605 609 606 - if (prefetch && !downgrade && !mr->umem->writable) { 610 + if (prefetch && !downgrade && !odp->umem.writable) { 607 611 /* prefetch with write-access must 608 612 * be supported by the MR 609 613 */ ··· 611 615 goto out; 612 616 } 613 617 614 - if (mr->umem->writable && !downgrade) 618 + if (odp->umem.writable && !downgrade) 615 619 access_mask |= ODP_WRITE_ALLOWED_BIT; 616 620 617 621 current_seq = READ_ONCE(odp->notifiers_seq); ··· 621 625 */ 622 626 smp_rmb(); 623 627 624 - ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size, 625 - access_mask, current_seq); 628 + ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask, 629 + current_seq); 626 630 627 631 if (ret < 0) 628 632 goto out; ··· 630 634 np = ret; 631 635 632 636 mutex_lock(&odp->umem_mutex); 633 - if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem), 634 - current_seq)) { 637 + if (!ib_umem_mmu_notifier_retry(odp, current_seq)) { 635 638 /* 636 639 * No need to check whether the MTTs really belong to 637 640 * this MR, since ib_umem_odp_map_dma_pages already ··· 663 668 664 669 io_virt += size; 665 670 next = odp_next(odp); 666 - if (unlikely(!next || next->umem.address != io_virt)) { 
671 + if (unlikely(!next || ib_umem_start(next) != io_virt)) { 667 672 mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", 668 673 io_virt, next); 669 674 return -EAGAIN; ··· 1613 1618 1614 1619 static const struct ib_device_ops mlx5_ib_dev_odp_ops = { 1615 1620 .advise_mr = mlx5_ib_advise_mr, 1621 + .invalidate_range = mlx5_ib_invalidate_range, 1616 1622 }; 1617 1623 1618 1624 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
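`mlx5_ib_free_implicit_mr()` above drops the `rbt_ib_umem_for_each_in_range()` callback plumbing and open-codes the walk, filtering leaves by parent and marking each `dying` at most once. A reduced, self-contained sketch of that loop shape (a plain linked list stands in for the rb-tree, and the struct is trimmed to what the loop touches):

```c
#include <assert.h>
#include <stddef.h>

struct leaf {
	struct leaf *next;
	void *parent;
	int dying;
};

/* Walk all leaves, skip those owned by another implicit MR or already
 * queued, and mark the rest dying; returns how many were newly queued
 * (the kernel additionally schedules a work item per leaf). */
static int queue_free_leaves(struct leaf *head, void *imr)
{
	int queued = 0;

	for (struct leaf *l = head; l; l = l->next) {
		if (l->parent != imr)
			continue;	/* belongs to another implicit MR */
		if (l->dying)
			continue;	/* already queued once */
		l->dying = 1;		/* WRITE_ONCE() in the kernel */
		queued++;
	}
	return queued;
}
```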
drivers/misc/sgi-gru/grufile.c (+1)
··· 573 573 gru_free_tables(); 574 574 misc_deregister(&gru_miscdev); 575 575 gru_proc_exit(); 576 + mmu_notifier_synchronize(); 576 577 } 577 578 578 579 static const struct file_operations gru_fops = {
drivers/misc/sgi-gru/grutables.h (-2)
··· 307 307 308 308 struct gru_mm_struct { 309 309 struct mmu_notifier ms_notifier; 310 - atomic_t ms_refcnt; 311 310 spinlock_t ms_asid_lock; /* protects ASID assignment */ 312 311 atomic_t ms_range_active;/* num range_invals active */ 313 - char ms_released; 314 312 wait_queue_head_t ms_wait_queue; 315 313 DECLARE_BITMAP(ms_asidmap, GRU_MAX_GRUS); 316 314 struct gru_mm_tracker ms_asids[GRU_MAX_GRUS];
drivers/misc/sgi-gru/grutlbpurge.c (+23 -59)
··· 235 235 gms, range->start, range->end); 236 236 } 237 237 238 - static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) 238 + static struct mmu_notifier *gru_alloc_notifier(struct mm_struct *mm) 239 239 { 240 - struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct, 241 - ms_notifier); 240 + struct gru_mm_struct *gms; 242 241 243 - gms->ms_released = 1; 244 - gru_dbg(grudev, "gms %p\n", gms); 242 + gms = kzalloc(sizeof(*gms), GFP_KERNEL); 243 + if (!gms) 244 + return ERR_PTR(-ENOMEM); 245 + STAT(gms_alloc); 246 + spin_lock_init(&gms->ms_asid_lock); 247 + init_waitqueue_head(&gms->ms_wait_queue); 248 + 249 + return &gms->ms_notifier; 245 250 } 246 251 252 + static void gru_free_notifier(struct mmu_notifier *mn) 253 + { 254 + kfree(container_of(mn, struct gru_mm_struct, ms_notifier)); 255 + STAT(gms_free); 256 + } 247 257 248 258 static const struct mmu_notifier_ops gru_mmuops = { 249 259 .invalidate_range_start = gru_invalidate_range_start, 250 260 .invalidate_range_end = gru_invalidate_range_end, 251 - .release = gru_release, 261 + .alloc_notifier = gru_alloc_notifier, 262 + .free_notifier = gru_free_notifier, 252 263 }; 253 - 254 - /* Move this to the basic mmu_notifier file. But for now... 
*/ 255 - static struct mmu_notifier *mmu_find_ops(struct mm_struct *mm, 256 - const struct mmu_notifier_ops *ops) 257 - { 258 - struct mmu_notifier *mn, *gru_mn = NULL; 259 - 260 - if (mm->mmu_notifier_mm) { 261 - rcu_read_lock(); 262 - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, 263 - hlist) 264 - if (mn->ops == ops) { 265 - gru_mn = mn; 266 - break; 267 - } 268 - rcu_read_unlock(); 269 - } 270 - return gru_mn; 271 - } 272 264 273 265 struct gru_mm_struct *gru_register_mmu_notifier(void) 274 266 { 275 - struct gru_mm_struct *gms; 276 267 struct mmu_notifier *mn; 277 - int err; 278 268 279 - mn = mmu_find_ops(current->mm, &gru_mmuops); 280 - if (mn) { 281 - gms = container_of(mn, struct gru_mm_struct, ms_notifier); 282 - atomic_inc(&gms->ms_refcnt); 283 - } else { 284 - gms = kzalloc(sizeof(*gms), GFP_KERNEL); 285 - if (!gms) 286 - return ERR_PTR(-ENOMEM); 287 - STAT(gms_alloc); 288 - spin_lock_init(&gms->ms_asid_lock); 289 - gms->ms_notifier.ops = &gru_mmuops; 290 - atomic_set(&gms->ms_refcnt, 1); 291 - init_waitqueue_head(&gms->ms_wait_queue); 292 - err = __mmu_notifier_register(&gms->ms_notifier, current->mm); 293 - if (err) 294 - goto error; 295 - } 296 - if (gms) 297 - gru_dbg(grudev, "gms %p, refcnt %d\n", gms, 298 - atomic_read(&gms->ms_refcnt)); 299 - return gms; 300 - error: 301 - kfree(gms); 302 - return ERR_PTR(err); 269 + mn = mmu_notifier_get_locked(&gru_mmuops, current->mm); 270 + if (IS_ERR(mn)) 271 + return ERR_CAST(mn); 272 + 273 + return container_of(mn, struct gru_mm_struct, ms_notifier); 303 274 } 304 275 305 276 void gru_drop_mmu_notifier(struct gru_mm_struct *gms) 306 277 { 307 - gru_dbg(grudev, "gms %p, refcnt %d, released %d\n", gms, 308 - atomic_read(&gms->ms_refcnt), gms->ms_released); 309 - if (atomic_dec_return(&gms->ms_refcnt) == 0) { 310 - if (!gms->ms_released) 311 - mmu_notifier_unregister(&gms->ms_notifier, current->mm); 312 - kfree(gms); 313 - STAT(gms_free); 314 - } 278 + mmu_notifier_put(&gms->ms_notifier); 315 279 } 
316 280 317 281 /*
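The GRU conversion above is the clearest example of the series' get/put attachment idiom: the driver's hand-rolled `mmu_find_ops()` search, refcount, and `ms_released` flag all collapse into `mmu_notifier_get()`/`mmu_notifier_put()` plus `alloc_notifier`/`free_notifier` ops. A user-space toy model of the idiom (all names invented; the kernel supports multiple ops per mm via a list, which this sketch simplifies to one):

```c
#include <assert.h>
#include <stdlib.h>

struct ops;

struct notifier {
	const struct ops *ops;
	int refcount;
};

struct ops {
	struct notifier *(*alloc_notifier)(void);
	void (*free_notifier)(struct notifier *);
};

struct mm {				/* stands in for struct mm_struct */
	struct notifier *registered;	/* at most one per ops here */
};

/* Return the existing notifier for (ops, mm) with an extra reference,
 * or allocate one through ops->alloc_notifier on first use. */
static struct notifier *notifier_get(const struct ops *ops, struct mm *mm)
{
	struct notifier *n;

	if (mm->registered && mm->registered->ops == ops) {
		mm->registered->refcount++;
		return mm->registered;
	}
	n = ops->alloc_notifier();
	if (!n)
		return NULL;
	n->ops = ops;
	n->refcount = 1;
	mm->registered = n;
	return n;
}

/* Drop a reference; the last put detaches and frees via the ops. */
static void notifier_put(struct notifier *n, struct mm *mm)
{
	if (--n->refcount == 0) {
		mm->registered = NULL;
		n->ops->free_notifier(n);
	}
}

static int allocs, frees;

static struct notifier *my_alloc(void)
{
	allocs++;
	return malloc(sizeof(struct notifier));
}

static void my_free(struct notifier *n) { frees++; free(n); }

static const struct ops my_ops = { my_alloc, my_free };
```

A second `notifier_get()` against the same mm reuses the first allocation, which is exactly what let the driver delete its duplicate-detection code.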
drivers/nvdimm/Kconfig (+12)
··· 118 118 depends on ENCRYPTED_KEYS 119 119 depends on (LIBNVDIMM=ENCRYPTED_KEYS) || LIBNVDIMM=m 120 120 121 + config NVDIMM_TEST_BUILD 122 + tristate "Build the unit test core" 123 + depends on m 124 + depends on COMPILE_TEST && X86_64 125 + default m if COMPILE_TEST 126 + help 127 + Build the core of the unit test infrastructure. The result of 128 + this build is non-functional for unit test execution, but it 129 + otherwise helps catch build errors induced by changes to the 130 + core devm_memremap_pages() implementation and other 131 + infrastructure. 132 + 121 133 endif
drivers/nvdimm/Makefile (+4)
··· 29 29 libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o 30 30 libnvdimm-$(CONFIG_NVDIMM_DAX) += dax_devs.o 31 31 libnvdimm-$(CONFIG_NVDIMM_KEYS) += security.o 32 + 33 + TOOLS := ../../tools 34 + TEST_SRC := $(TOOLS)/testing/nvdimm/test 35 + obj-$(CONFIG_NVDIMM_TEST_BUILD) += $(TEST_SRC)/iomap.o
fs/proc/task_mmu.c (+42 -38)
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 - #include <linux/mm.h> 2 + #include <linux/pagewalk.h> 3 3 #include <linux/vmacache.h> 4 4 #include <linux/hugetlb.h> 5 5 #include <linux/huge_mm.h> ··· 513 513 514 514 return 0; 515 515 } 516 - #endif 516 + #else 517 + #define smaps_pte_hole NULL 518 + #endif /* CONFIG_SHMEM */ 517 519 518 520 static void smaps_pte_entry(pte_t *pte, unsigned long addr, 519 521 struct mm_walk *walk) ··· 731 729 } 732 730 return 0; 733 731 } 732 + #else 733 + #define smaps_hugetlb_range NULL 734 734 #endif /* HUGETLB_PAGE */ 735 + 736 + static const struct mm_walk_ops smaps_walk_ops = { 737 + .pmd_entry = smaps_pte_range, 738 + .hugetlb_entry = smaps_hugetlb_range, 739 + }; 740 + 741 + static const struct mm_walk_ops smaps_shmem_walk_ops = { 742 + .pmd_entry = smaps_pte_range, 743 + .hugetlb_entry = smaps_hugetlb_range, 744 + .pte_hole = smaps_pte_hole, 745 + }; 735 746 736 747 static void smap_gather_stats(struct vm_area_struct *vma, 737 748 struct mem_size_stats *mss) 738 749 { 739 - struct mm_walk smaps_walk = { 740 - .pmd_entry = smaps_pte_range, 741 - #ifdef CONFIG_HUGETLB_PAGE 742 - .hugetlb_entry = smaps_hugetlb_range, 743 - #endif 744 - .mm = vma->vm_mm, 745 - }; 746 - 747 - smaps_walk.private = mss; 748 - 749 750 #ifdef CONFIG_SHMEM 750 751 /* In case of smaps_rollup, reset the value from previous vma */ 751 752 mss->check_shmem_swap = false; ··· 770 765 mss->swap += shmem_swapped; 771 766 } else { 772 767 mss->check_shmem_swap = true; 773 - smaps_walk.pte_hole = smaps_pte_hole; 768 + walk_page_vma(vma, &smaps_shmem_walk_ops, mss); 769 + return; 774 770 } 775 771 } 776 772 #endif 777 773 /* mmap_sem is held in m_start */ 778 - walk_page_vma(vma, &smaps_walk); 774 + walk_page_vma(vma, &smaps_walk_ops, mss); 779 775 } 780 776 781 777 #define SEQ_PUT_DEC(str, val) \ ··· 1124 1118 return 0; 1125 1119 } 1126 1120 1121 + static const struct mm_walk_ops clear_refs_walk_ops = { 1122 + .pmd_entry = clear_refs_pte_range, 1123 + 
.test_walk = clear_refs_test_walk, 1124 + }; 1125 + 1127 1126 static ssize_t clear_refs_write(struct file *file, const char __user *buf, 1128 1127 size_t count, loff_t *ppos) 1129 1128 { ··· 1161 1150 struct mmu_notifier_range range; 1162 1151 struct clear_refs_private cp = { 1163 1152 .type = type, 1164 - }; 1165 - struct mm_walk clear_refs_walk = { 1166 - .pmd_entry = clear_refs_pte_range, 1167 - .test_walk = clear_refs_test_walk, 1168 - .mm = mm, 1169 - .private = &cp, 1170 1153 }; 1171 1154 1172 1155 if (type == CLEAR_REFS_MM_HIWATER_RSS) { ··· 1222 1217 0, NULL, mm, 0, -1UL); 1223 1218 mmu_notifier_invalidate_range_start(&range); 1224 1219 } 1225 - walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); 1220 + walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops, 1221 + &cp); 1226 1222 if (type == CLEAR_REFS_SOFT_DIRTY) 1227 1223 mmu_notifier_invalidate_range_end(&range); 1228 1224 tlb_finish_mmu(&tlb, 0, -1); ··· 1495 1489 1496 1490 return err; 1497 1491 } 1492 + #else 1493 + #define pagemap_hugetlb_range NULL 1498 1494 #endif /* HUGETLB_PAGE */ 1495 + 1496 + static const struct mm_walk_ops pagemap_ops = { 1497 + .pmd_entry = pagemap_pmd_range, 1498 + .pte_hole = pagemap_pte_hole, 1499 + .hugetlb_entry = pagemap_hugetlb_range, 1500 + }; 1499 1501 1500 1502 /* 1501 1503 * /proc/pid/pagemap - an array mapping virtual pages to pfns ··· 1536 1522 { 1537 1523 struct mm_struct *mm = file->private_data; 1538 1524 struct pagemapread pm; 1539 - struct mm_walk pagemap_walk = {}; 1540 1525 unsigned long src; 1541 1526 unsigned long svpfn; 1542 1527 unsigned long start_vaddr; ··· 1562 1549 ret = -ENOMEM; 1563 1550 if (!pm.buffer) 1564 1551 goto out_mm; 1565 - 1566 - pagemap_walk.pmd_entry = pagemap_pmd_range; 1567 - pagemap_walk.pte_hole = pagemap_pte_hole; 1568 - #ifdef CONFIG_HUGETLB_PAGE 1569 - pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 1570 - #endif 1571 - pagemap_walk.mm = mm; 1572 - pagemap_walk.private = &pm; 1573 1552 1574 1553 src = *ppos; 
1575 1554 svpfn = src / PM_ENTRY_BYTES; ··· 1591 1586 ret = down_read_killable(&mm->mmap_sem); 1592 1587 if (ret) 1593 1588 goto out_free; 1594 - ret = walk_page_range(start_vaddr, end, &pagemap_walk); 1589 + ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); 1595 1590 up_read(&mm->mmap_sem); 1596 1591 start_vaddr = end; 1597 1592 ··· 1803 1798 } 1804 1799 #endif 1805 1800 1801 + static const struct mm_walk_ops show_numa_ops = { 1802 + .hugetlb_entry = gather_hugetlb_stats, 1803 + .pmd_entry = gather_pte_stats, 1804 + }; 1805 + 1806 1806 /* 1807 1807 * Display pages allocated per node and memory policy via /proc. 1808 1808 */ ··· 1819 1809 struct numa_maps *md = &numa_priv->md; 1820 1810 struct file *file = vma->vm_file; 1821 1811 struct mm_struct *mm = vma->vm_mm; 1822 - struct mm_walk walk = { 1823 - .hugetlb_entry = gather_hugetlb_stats, 1824 - .pmd_entry = gather_pte_stats, 1825 - .private = md, 1826 - .mm = mm, 1827 - }; 1828 1812 struct mempolicy *pol; 1829 1813 char buffer[64]; 1830 1814 int nid; ··· 1852 1848 seq_puts(m, " huge"); 1853 1849 1854 1850 /* mmap_sem is held by m_start */ 1855 - walk_page_vma(vma, &walk); 1851 + walk_page_vma(vma, &show_numa_ops, md); 1856 1852 1857 1853 if (!md->pages) 1858 1854 goto out;
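The task_mmu.c conversion shows the pagewalk API change in miniature: the function-pointer table becomes a shared `static const struct mm_walk_ops`, while per-walk state travels separately as a private pointer, instead of both being packed into an on-stack `struct mm_walk`. A self-contained sketch of that split (names and the trivial walker are invented for illustration):

```c
#include <assert.h>
#include <stddef.h>

struct walk_ops {
	int (*entry)(unsigned long addr, void *private);
};

/* Visit each step-aligned address in [start, end); callbacks come from
 * the shared const ops table, state from the caller's private pointer. */
static int walk_range(unsigned long start, unsigned long end,
		      unsigned long step, const struct walk_ops *ops,
		      void *private)
{
	for (unsigned long addr = start; addr < end; addr += step) {
		if (ops->entry) {
			int ret = ops->entry(addr, private);

			if (ret)
				return ret;
		}
	}
	return 0;
}

struct counter {
	int entries;
	unsigned long last;
};

static int count_entry(unsigned long addr, void *private)
{
	struct counter *c = private;

	c->entries++;
	c->last = addr;
	return 0;
}

/* Immutable, shareable across all walks; no per-walk copy needed. */
static const struct walk_ops count_ops = { .entry = count_entry };
```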
+20 -105
include/linux/hmm.h
··· 84 84 * @notifiers: count of active mmu notifiers 85 85 */ 86 86 struct hmm { 87 - struct mm_struct *mm; 88 - struct kref kref; 87 + struct mmu_notifier mmu_notifier; 89 88 spinlock_t ranges_lock; 90 89 struct list_head ranges; 91 90 struct list_head mirrors; 92 - struct mmu_notifier mmu_notifier; 93 91 struct rw_semaphore mirrors_sem; 94 92 wait_queue_head_t wq; 95 - struct rcu_head rcu; 96 93 long notifiers; 97 94 }; 98 95 ··· 155 158 * @values: pfn value for some special case (none, special, error, ...) 156 159 * @default_flags: default flags for the range (write, read, ... see hmm doc) 157 160 * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter 158 - * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT) 159 161 * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) 160 162 * @valid: pfns array did not change since it has been fill by an HMM function 161 163 */ 162 164 struct hmm_range { 163 165 struct hmm *hmm; 164 - struct vm_area_struct *vma; 165 166 struct list_head list; 166 167 unsigned long start; 167 168 unsigned long end; ··· 168 173 const uint64_t *values; 169 174 uint64_t default_flags; 170 175 uint64_t pfn_flags_mask; 171 - uint8_t page_shift; 172 176 uint8_t pfn_shift; 173 177 bool valid; 174 178 }; 175 - 176 - /* 177 - * hmm_range_page_shift() - return the page shift for the range 178 - * @range: range being queried 179 - * Return: page shift (page size = 1 << page shift) for the range 180 - */ 181 - static inline unsigned hmm_range_page_shift(const struct hmm_range *range) 182 - { 183 - return range->page_shift; 184 - } 185 - 186 - /* 187 - * hmm_range_page_size() - return the page size for the range 188 - * @range: range being queried 189 - * Return: page size for the range in bytes 190 - */ 191 - static inline unsigned long hmm_range_page_size(const struct hmm_range *range) 192 - { 193 - return 1UL << hmm_range_page_shift(range); 194 - } 195 179 196 180 /* 197 181 * 
hmm_range_wait_until_valid() - wait for range to be valid ··· 265 291 } 266 292 267 293 /* 268 - * Old API: 269 - * hmm_pfn_to_page() 270 - * hmm_pfn_to_pfn() 271 - * hmm_pfn_from_page() 272 - * hmm_pfn_from_pfn() 273 - * 274 - * This are the OLD API please use new API, it is here to avoid cross-tree 275 - * merge painfullness ie we convert things to new API in stages. 276 - */ 277 - static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, 278 - uint64_t pfn) 279 - { 280 - return hmm_device_entry_to_page(range, pfn); 281 - } 282 - 283 - static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, 284 - uint64_t pfn) 285 - { 286 - return hmm_device_entry_to_pfn(range, pfn); 287 - } 288 - 289 - static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, 290 - struct page *page) 291 - { 292 - return hmm_device_entry_from_page(range, page); 293 - } 294 - 295 - static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, 296 - unsigned long pfn) 297 - { 298 - return hmm_device_entry_from_pfn(range, pfn); 299 - } 300 - 301 - /* 302 294 * Mirroring: how to synchronize device page table with CPU page table. 303 295 * 304 296 * A device driver that is participating in HMM mirroring must always ··· 315 375 struct hmm_mirror; 316 376 317 377 /* 318 - * enum hmm_update_event - type of update 319 - * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why) 320 - */ 321 - enum hmm_update_event { 322 - HMM_UPDATE_INVALIDATE, 323 - }; 324 - 325 - /* 326 - * struct hmm_update - HMM update information for callback 327 - * 328 - * @start: virtual start address of the range to update 329 - * @end: virtual end address of the range to update 330 - * @event: event triggering the update (what is happening) 331 - * @blockable: can the callback block/sleep ? 
332 - */ 333 - struct hmm_update { 334 - unsigned long start; 335 - unsigned long end; 336 - enum hmm_update_event event; 337 - bool blockable; 338 - }; 339 - 340 - /* 341 378 * struct hmm_mirror_ops - HMM mirror device operations callback 342 379 * 343 380 * @update: callback to update range on a device ··· 334 417 /* sync_cpu_device_pagetables() - synchronize page tables 335 418 * 336 419 * @mirror: pointer to struct hmm_mirror 337 - * @update: update information (see struct hmm_update) 338 - * Return: -EAGAIN if update.blockable false and callback need to 339 - * block, 0 otherwise. 420 + * @update: update information (see struct mmu_notifier_range) 421 + * Return: -EAGAIN if mmu_notifier_range_blockable(update) is false 422 + * and callback needs to block, 0 otherwise. 340 423 * 341 424 * This callback ultimately originates from mmu_notifiers when the CPU 342 425 * page table is updated. The device driver must update its page table ··· 347 430 * page tables are completely updated (TLBs flushed, etc); this is a 348 431 * synchronous call. 349 432 */ 350 - int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, 351 - const struct hmm_update *update); 433 + int (*sync_cpu_device_pagetables)( 434 + struct hmm_mirror *mirror, 435 + const struct mmu_notifier_range *update); 352 436 }; 353 437 354 438 /* ··· 375 457 /* 376 458 * Please see Documentation/vm/hmm.rst for how to use the range API. 377 459 */ 378 - int hmm_range_register(struct hmm_range *range, 379 - struct hmm_mirror *mirror, 380 - unsigned long start, 381 - unsigned long end, 382 - unsigned page_shift); 460 + int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror); 383 461 void hmm_range_unregister(struct hmm_range *range); 384 - long hmm_range_snapshot(struct hmm_range *range); 385 - long hmm_range_fault(struct hmm_range *range, bool block); 462 + 463 + /* 464 + * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case. 
465 + */ 466 + #define HMM_FAULT_ALLOW_RETRY (1 << 0) 467 + 468 + /* Don't fault in missing PTEs, just snapshot the current state. */ 469 + #define HMM_FAULT_SNAPSHOT (1 << 1) 470 + 471 + long hmm_range_fault(struct hmm_range *range, unsigned int flags); 472 + 386 473 long hmm_range_dma_map(struct hmm_range *range, 387 474 struct device *device, 388 475 dma_addr_t *daddrs, 389 - bool block); 476 + unsigned int flags); 390 477 long hmm_range_dma_unmap(struct hmm_range *range, 391 - struct vm_area_struct *vma, 392 478 struct device *device, 393 479 dma_addr_t *daddrs, 394 480 bool dirty); ··· 406 484 */ 407 485 #define HMM_RANGE_DEFAULT_TIMEOUT 1000 408 486 409 - /* Below are for HMM internal use only! Not to be used by device driver! */ 410 - static inline void hmm_mm_init(struct mm_struct *mm) 411 - { 412 - mm->hmm = NULL; 413 - } 414 - #else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ 415 - static inline void hmm_mm_init(struct mm_struct *mm) {} 416 487 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ 417 488 418 489 #endif /* LINUX_HMM_H */
+2
include/linux/ioport.h
··· 297 297 298 298 struct resource *devm_request_free_mem_region(struct device *dev, 299 299 struct resource *base, unsigned long size); 300 + struct resource *request_free_mem_region(struct resource *base, 301 + unsigned long size, const char *name); 300 302 301 303 #endif /* __ASSEMBLY__ */ 302 304 #endif /* _LINUX_IOPORT_H */
+22 -1
include/linux/kernel.h
··· 217 217 * might_sleep - annotation for functions that can sleep 218 218 * 219 219 * this macro will print a stack trace if it is executed in an atomic 220 - * context (spinlock, irq-handler, ...). 220 + * context (spinlock, irq-handler, ...). Additional sections where blocking is 221 + * not allowed can be annotated with non_block_start() and non_block_end() 222 + * pairs. 221 223 * 222 224 * This is a useful debugging help to be able to catch problems early and not 223 225 * be bitten later when the calling function happens to sleep when it is not ··· 235 233 # define cant_sleep() \ 236 234 do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) 237 235 # define sched_annotate_sleep() (current->task_state_change = 0) 236 + /** 237 + * non_block_start - annotate the start of section where sleeping is prohibited 238 + * 239 + * This is on behalf of the oom reaper, specifically when it is calling the mmu 240 + * notifiers. The problem is that if the notifier were to block on, for example, 241 + * mutex_lock() and if the process which holds that mutex were to perform a 242 + * sleeping memory allocation, the oom reaper is now blocked on completion of 243 + * that memory allocation. Other blocking calls like wait_event() pose similar 244 + * issues. 245 + */ 246 + # define non_block_start() (current->non_block_count++) 247 + /** 248 + * non_block_end - annotate the end of section where sleeping is prohibited 249 + * 250 + * Closes a section opened by non_block_start(). 
251 + */ 252 + # define non_block_end() WARN_ON(current->non_block_count-- == 0) 238 253 #else 239 254 static inline void ___might_sleep(const char *file, int line, 240 255 int preempt_offset) { } ··· 260 241 # define might_sleep() do { might_resched(); } while (0) 261 242 # define cant_sleep() do { } while (0) 262 243 # define sched_annotate_sleep() do { } while (0) 244 + # define non_block_start() do { } while (0) 245 + # define non_block_end() do { } while (0) 263 246 #endif 264 247 265 248 #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
+2 -1
include/linux/memremap.h
··· 109 109 struct percpu_ref *ref; 110 110 struct percpu_ref internal_ref; 111 111 struct completion done; 112 - struct device *dev; 113 112 enum memory_type type; 114 113 unsigned int flags; 115 114 u64 pci_p2pdma_bus_offset; ··· 123 124 } 124 125 125 126 #ifdef CONFIG_ZONE_DEVICE 127 + void *memremap_pages(struct dev_pagemap *pgmap, int nid); 128 + void memunmap_pages(struct dev_pagemap *pgmap); 126 129 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); 127 130 void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); 128 131 struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
+19 -101
include/linux/migrate.h
··· 166 166 #define MIGRATE_PFN_MIGRATE (1UL << 1) 167 167 #define MIGRATE_PFN_LOCKED (1UL << 2) 168 168 #define MIGRATE_PFN_WRITE (1UL << 3) 169 - #define MIGRATE_PFN_DEVICE (1UL << 4) 170 - #define MIGRATE_PFN_ERROR (1UL << 5) 171 169 #define MIGRATE_PFN_SHIFT 6 172 170 173 171 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) ··· 180 182 return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; 181 183 } 182 184 183 - /* 184 - * struct migrate_vma_ops - migrate operation callback 185 - * 186 - * @alloc_and_copy: alloc destination memory and copy source memory to it 187 - * @finalize_and_map: allow caller to map the successfully migrated pages 188 - * 189 - * 190 - * The alloc_and_copy() callback happens once all source pages have been locked, 191 - * unmapped and checked (checked whether pinned or not). All pages that can be 192 - * migrated will have an entry in the src array set with the pfn value of the 193 - * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set (other 194 - * flags might be set but should be ignored by the callback). 195 - * 196 - * The alloc_and_copy() callback can then allocate destination memory and copy 197 - * source memory to it for all those entries (ie with MIGRATE_PFN_VALID and 198 - * MIGRATE_PFN_MIGRATE flag set). Once these are allocated and copied, the 199 - * callback must update each corresponding entry in the dst array with the pfn 200 - * value of the destination page and with the MIGRATE_PFN_VALID and 201 - * MIGRATE_PFN_LOCKED flags set (destination pages must have their struct pages 202 - * locked, via lock_page()). 203 - * 204 - * At this point the alloc_and_copy() callback is done and returns. 205 - * 206 - * Note that the callback does not have to migrate all the pages that are 207 - * marked with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration 208 - * from device memory to system memory (ie the MIGRATE_PFN_DEVICE flag is also 209 - * set in the src array entry). 
If the device driver cannot migrate a device 210 - * page back to system memory, then it must set the corresponding dst array 211 - * entry to MIGRATE_PFN_ERROR. This will trigger a SIGBUS if CPU tries to 212 - * access any of the virtual addresses originally backed by this page. Because 213 - * a SIGBUS is such a severe result for the userspace process, the device 214 - * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an 215 - * unrecoverable state. 216 - * 217 - * For empty entry inside CPU page table (pte_none() or pmd_none() is true) we 218 - * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus 219 - * allowing device driver to allocate device memory for those unback virtual 220 - * address. For this the device driver simply have to allocate device memory 221 - * and properly set the destination entry like for regular migration. Note that 222 - * this can still fails and thus inside the device driver must check if the 223 - * migration was successful for those entry inside the finalize_and_map() 224 - * callback just like for regular migration. 225 - * 226 - * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES 227 - * OR BAD THINGS WILL HAPPEN ! 228 - * 229 - * 230 - * The finalize_and_map() callback happens after struct page migration from 231 - * source to destination (destination struct pages are the struct pages for the 232 - * memory allocated by the alloc_and_copy() callback). Migration can fail, and 233 - * thus the finalize_and_map() allows the driver to inspect which pages were 234 - * successfully migrated, and which were not. Successfully migrated pages will 235 - * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. 
236 - * 237 - * It is safe to update device page table from within the finalize_and_map() 238 - * callback because both destination and source page are still locked, and the 239 - * mmap_sem is held in read mode (hence no one can unmap the range being 240 - * migrated). 241 - * 242 - * Once callback is done cleaning up things and updating its page table (if it 243 - * chose to do so, this is not an obligation) then it returns. At this point, 244 - * the HMM core will finish up the final steps, and the migration is complete. 245 - * 246 - * THE finalize_and_map() CALLBACK MUST NOT CHANGE ANY OF THE SRC OR DST ARRAY 247 - * ENTRIES OR BAD THINGS WILL HAPPEN ! 248 - */ 249 - struct migrate_vma_ops { 250 - void (*alloc_and_copy)(struct vm_area_struct *vma, 251 - const unsigned long *src, 252 - unsigned long *dst, 253 - unsigned long start, 254 - unsigned long end, 255 - void *private); 256 - void (*finalize_and_map)(struct vm_area_struct *vma, 257 - const unsigned long *src, 258 - const unsigned long *dst, 259 - unsigned long start, 260 - unsigned long end, 261 - void *private); 185 + struct migrate_vma { 186 + struct vm_area_struct *vma; 187 + /* 188 + * Both src and dst array must be big enough for 189 + * (end - start) >> PAGE_SHIFT entries. 190 + * 191 + * The src array must not be modified by the caller after 192 + * migrate_vma_setup(), and must not change the dst array after 193 + * migrate_vma_pages() returns. 
194 + */ 195 + unsigned long *dst; 196 + unsigned long *src; 197 + unsigned long cpages; 198 + unsigned long npages; 199 + unsigned long start; 200 + unsigned long end; 262 201 }; 263 202 264 - #if defined(CONFIG_MIGRATE_VMA_HELPER) 265 - int migrate_vma(const struct migrate_vma_ops *ops, 266 - struct vm_area_struct *vma, 267 - unsigned long start, 268 - unsigned long end, 269 - unsigned long *src, 270 - unsigned long *dst, 271 - void *private); 272 - #else 273 - static inline int migrate_vma(const struct migrate_vma_ops *ops, 274 - struct vm_area_struct *vma, 275 - unsigned long start, 276 - unsigned long end, 277 - unsigned long *src, 278 - unsigned long *dst, 279 - void *private) 280 - { 281 - return -EINVAL; 282 - } 283 - #endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */ 203 + int migrate_vma_setup(struct migrate_vma *args); 204 + void migrate_vma_pages(struct migrate_vma *migrate); 205 + void migrate_vma_finalize(struct migrate_vma *migrate); 284 206 285 207 #endif /* CONFIG_MIGRATION */ 286 208
-46
include/linux/mm.h
··· 1430 1430 void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 1431 1431 unsigned long start, unsigned long end); 1432 1432 1433 - /** 1434 - * mm_walk - callbacks for walk_page_range 1435 - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry 1436 - * this handler should only handle pud_trans_huge() puds. 1437 - * the pmd_entry or pte_entry callbacks will be used for 1438 - * regular PUDs. 1439 - * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry 1440 - * this handler is required to be able to handle 1441 - * pmd_trans_huge() pmds. They may simply choose to 1442 - * split_huge_page() instead of handling it explicitly. 1443 - * @pte_entry: if set, called for each non-empty PTE (4th-level) entry 1444 - * @pte_hole: if set, called for each hole at all levels 1445 - * @hugetlb_entry: if set, called for each hugetlb entry 1446 - * @test_walk: caller specific callback function to determine whether 1447 - * we walk over the current vma or not. Returning 0 1448 - * value means "do page table walk over the current vma," 1449 - * and a negative one means "abort current page table walk 1450 - * right now." 1 means "skip the current vma." 
1451 - * @mm: mm_struct representing the target process of page table walk 1452 - * @vma: vma currently walked (NULL if walking outside vmas) 1453 - * @private: private data for callbacks' usage 1454 - * 1455 - * (see the comment on walk_page_range() for more details) 1456 - */ 1457 - struct mm_walk { 1458 - int (*pud_entry)(pud_t *pud, unsigned long addr, 1459 - unsigned long next, struct mm_walk *walk); 1460 - int (*pmd_entry)(pmd_t *pmd, unsigned long addr, 1461 - unsigned long next, struct mm_walk *walk); 1462 - int (*pte_entry)(pte_t *pte, unsigned long addr, 1463 - unsigned long next, struct mm_walk *walk); 1464 - int (*pte_hole)(unsigned long addr, unsigned long next, 1465 - struct mm_walk *walk); 1466 - int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, 1467 - unsigned long addr, unsigned long next, 1468 - struct mm_walk *walk); 1469 - int (*test_walk)(unsigned long addr, unsigned long next, 1470 - struct mm_walk *walk); 1471 - struct mm_struct *mm; 1472 - struct vm_area_struct *vma; 1473 - void *private; 1474 - }; 1475 - 1476 1433 struct mmu_notifier_range; 1477 1434 1478 - int walk_page_range(unsigned long addr, unsigned long end, 1479 - struct mm_walk *walk); 1480 - int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); 1481 1435 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, 1482 1436 unsigned long end, unsigned long floor, unsigned long ceiling); 1483 1437 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
-6
include/linux/mm_types.h
··· 25 25 26 26 struct address_space; 27 27 struct mem_cgroup; 28 - struct hmm; 29 28 30 29 /* 31 30 * Each physical page in the system has a struct page associated with ··· 510 511 atomic_long_t hugetlb_usage; 511 512 #endif 512 513 struct work_struct async_put_work; 513 - 514 - #ifdef CONFIG_HMM_MIRROR 515 - /* HMM needs to track a few things per mm */ 516 - struct hmm *hmm; 517 - #endif 518 514 } __randomize_layout; 519 515 520 516 /*
+52 -7
include/linux/mmu_notifier.h
···
42 42
43 43 #ifdef CONFIG_MMU_NOTIFIER
44 44
45 + #ifdef CONFIG_LOCKDEP
46 + extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
47 + #endif
48 +
45 49 /*
46 50 * The mmu notifier_mm structure is allocated and installed in
47 51 * mm->mmu_notifier_mm inside the mm_take_all_locks() protected
···
215 211 */
216 212 void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
217 213 unsigned long start, unsigned long end);
214 +
215 + /*
216 + * These callbacks are used with the get/put interface to manage the
217 + * lifetime of the mmu_notifier memory. alloc_notifier() returns a new
218 + * notifier for use with the mm.
219 + *
220 + * free_notifier() is only called after the mmu_notifier has been
221 + * fully put, calls to any ops callback are prevented and no ops
222 + * callbacks are currently running. It is called from a SRCU callback
223 + * and cannot sleep.
224 + */
225 + struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
226 + void (*free_notifier)(struct mmu_notifier *mn);
218 227 };
219 228
220 229 /*
···
244 227 struct mmu_notifier {
245 228 struct hlist_node hlist;
246 229 const struct mmu_notifier_ops *ops;
230 + struct mm_struct *mm;
231 + struct rcu_head rcu;
232 + unsigned int users;
247 233 };
248 234
249 235 static inline int mm_has_notifiers(struct mm_struct *mm)
···
254 234 return unlikely(mm->mmu_notifier_mm);
255 235 }
256 236
237 + struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
238 + struct mm_struct *mm);
239 + static inline struct mmu_notifier *
240 + mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
241 + {
242 + struct mmu_notifier *ret;
243 +
244 + down_write(&mm->mmap_sem);
245 + ret = mmu_notifier_get_locked(ops, mm);
246 + up_write(&mm->mmap_sem);
247 + return ret;
248 + }
249 + void mmu_notifier_put(struct mmu_notifier *mn);
250 + void mmu_notifier_synchronize(void);
251 +
257 252 extern int mmu_notifier_register(struct mmu_notifier *mn,
258 253 struct mm_struct *mm);
259 254 extern int __mmu_notifier_register(struct mmu_notifier *mn,
260 255 struct mm_struct *mm);
261 256 extern void mmu_notifier_unregister(struct mmu_notifier *mn,
262 257 struct mm_struct *mm);
263 - extern void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
264 - struct mm_struct *mm);
265 258 extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
266 259 extern void __mmu_notifier_release(struct mm_struct *mm);
267 260 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
···
343 310 static inline void
344 311 mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
345 312 {
313 + might_sleep();
314 +
315 + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
346 316 if (mm_has_notifiers(range->mm)) {
347 317 range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
348 318 __mmu_notifier_invalidate_range_start(range);
349 319 }
320 + lock_map_release(&__mmu_notifier_invalidate_range_start_map);
350 321 }
351 322
352 323 static inline int
353 324 mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
354 325 {
326 + int ret = 0;
327 +
328 + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
355 329 if (mm_has_notifiers(range->mm)) {
356 330 range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
357 - return __mmu_notifier_invalidate_range_start(range);
331 + ret = __mmu_notifier_invalidate_range_start(range);
358 332 }
359 - return 0;
333 + lock_map_release(&__mmu_notifier_invalidate_range_start_map);
334 + return ret;
360 335 }
361 336
362 337 static inline void
363 338 mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
364 339 {
340 + if (mmu_notifier_range_blockable(range))
341 + might_sleep();
342 +
365 343 if (mm_has_notifiers(range->mm))
366 344 __mmu_notifier_invalidate_range_end(range, false);
367 345 }
···
526 482 set_pte_at(___mm, ___address, __ptep, ___pte); \
527 483 })
528 484
529 - extern void mmu_notifier_call_srcu(struct rcu_head *rcu,
530 - void (*func)(struct rcu_head *rcu));
531 -
532 485 #else /* CONFIG_MMU_NOTIFIER */
533 486
534 487 struct mmu_notifier_range {
···
621 580 #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
622 581 #define pudp_huge_clear_flush_notify pudp_huge_clear_flush
623 582 #define set_pte_at_notify set_pte_at
583 +
584 + static inline void mmu_notifier_synchronize(void)
585 + {
586 + }
624 587
625 588 #endif /* CONFIG_MMU_NOTIFIER */
626 589
+66
include/linux/pagewalk.h
···
1 + /* SPDX-License-Identifier: GPL-2.0 */
2 + #ifndef _LINUX_PAGEWALK_H
3 + #define _LINUX_PAGEWALK_H
4 +
5 + #include <linux/mm.h>
6 +
7 + struct mm_walk;
8 +
9 + /**
10 + * mm_walk_ops - callbacks for walk_page_range
11 + * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
12 + * this handler should only handle pud_trans_huge() puds.
13 + * the pmd_entry or pte_entry callbacks will be used for
14 + * regular PUDs.
15 + * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
16 + * this handler is required to be able to handle
17 + * pmd_trans_huge() pmds. They may simply choose to
18 + * split_huge_page() instead of handling it explicitly.
19 + * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
20 + * @pte_hole: if set, called for each hole at all levels
21 + * @hugetlb_entry: if set, called for each hugetlb entry
22 + * @test_walk: caller specific callback function to determine whether
23 + * we walk over the current vma or not. Returning 0 means
24 + * "do page table walk over the current vma", returning
25 + * a negative value means "abort current page table walk
26 + * right now" and returning 1 means "skip the current vma"
27 + */
28 + struct mm_walk_ops {
29 + int (*pud_entry)(pud_t *pud, unsigned long addr,
30 + unsigned long next, struct mm_walk *walk);
31 + int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
32 + unsigned long next, struct mm_walk *walk);
33 + int (*pte_entry)(pte_t *pte, unsigned long addr,
34 + unsigned long next, struct mm_walk *walk);
35 + int (*pte_hole)(unsigned long addr, unsigned long next,
36 + struct mm_walk *walk);
37 + int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
38 + unsigned long addr, unsigned long next,
39 + struct mm_walk *walk);
40 + int (*test_walk)(unsigned long addr, unsigned long next,
41 + struct mm_walk *walk);
42 + };
43 +
44 + /**
45 + * mm_walk - walk_page_range data
46 + * @ops: operation to call during the walk
47 + * @mm: mm_struct representing the target process of page table walk
48 + * @vma: vma currently walked (NULL if walking outside vmas)
49 + * @private: private data for callbacks' usage
50 + *
51 + * (see the comment on walk_page_range() for more details)
52 + */
53 + struct mm_walk {
54 + const struct mm_walk_ops *ops;
55 + struct mm_struct *mm;
56 + struct vm_area_struct *vma;
57 + void *private;
58 + };
59 +
60 + int walk_page_range(struct mm_struct *mm, unsigned long start,
61 + unsigned long end, const struct mm_walk_ops *ops,
62 + void *private);
63 + int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
64 + void *private);
65 +
66 + #endif /* _LINUX_PAGEWALK_H */
+4
include/linux/sched.h
··· 958 958 struct mutex_waiter *blocked_on; 959 959 #endif 960 960 961 + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 962 + int non_block_count; 963 + #endif 964 + 961 965 #ifdef CONFIG_TRACE_IRQFLAGS 962 966 unsigned int irq_events; 963 967 unsigned long hardirq_enable_ip;
+1 -1
include/rdma/ib_umem.h
··· 42 42 struct ib_umem_odp; 43 43 44 44 struct ib_umem { 45 - struct ib_ucontext *context; 45 + struct ib_device *ibdev; 46 46 struct mm_struct *owning_mm; 47 47 size_t length; 48 48 unsigned long address;
+33 -25
include/rdma/ib_umem_odp.h
··· 37 37 #include <rdma/ib_verbs.h> 38 38 #include <linux/interval_tree.h> 39 39 40 - struct umem_odp_node { 41 - u64 __subtree_last; 42 - struct rb_node rb; 43 - }; 44 - 45 40 struct ib_umem_odp { 46 41 struct ib_umem umem; 47 42 struct ib_ucontext_per_mm *per_mm; ··· 67 72 int npages; 68 73 69 74 /* Tree tracking */ 70 - struct umem_odp_node interval_tree; 75 + struct interval_tree_node interval_tree; 76 + 77 + /* 78 + * An implicit odp umem cannot be DMA mapped, has 0 length, and serves 79 + * only as an anchor for the driver to hold onto the per_mm. FIXME: 80 + * This should be removed and drivers should work with the per_mm 81 + * directly. 82 + */ 83 + bool is_implicit_odp; 71 84 72 85 struct completion notifier_completion; 73 86 int dying; ··· 91 88 /* Returns the first page of an ODP umem. */ 92 89 static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp) 93 90 { 94 - return ALIGN_DOWN(umem_odp->umem.address, 1UL << umem_odp->page_shift); 91 + return umem_odp->interval_tree.start; 95 92 } 96 93 97 94 /* Returns the address of the page after the last one of an ODP umem. 
*/ 98 95 static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp) 99 96 { 100 - return ALIGN(umem_odp->umem.address + umem_odp->umem.length, 101 - 1UL << umem_odp->page_shift); 97 + return umem_odp->interval_tree.last + 1; 102 98 } 103 99 104 100 static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) ··· 122 120 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING 123 121 124 122 struct ib_ucontext_per_mm { 125 - struct ib_ucontext *context; 126 - struct mm_struct *mm; 123 + struct mmu_notifier mn; 127 124 struct pid *tgid; 128 - bool active; 129 125 130 126 struct rb_root_cached umem_tree; 131 127 /* Protects umem_tree */ 132 128 struct rw_semaphore umem_rwsem; 133 - 134 - struct mmu_notifier mn; 135 - unsigned int odp_mrs_count; 136 - 137 - struct list_head ucontext_list; 138 - struct rcu_head rcu; 139 129 }; 140 130 141 - int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); 142 - struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root_umem, 143 - unsigned long addr, size_t size); 131 + struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, 132 + size_t size, int access); 133 + struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, 134 + int access); 135 + struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, 136 + unsigned long addr, size_t size); 144 137 void ib_umem_odp_release(struct ib_umem_odp *umem_odp); 145 138 146 139 int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, ··· 160 163 * Find first region intersecting with address range. 
161 164 * Return NULL if not found 162 165 */ 163 - struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, 164 - u64 addr, u64 length); 166 + static inline struct ib_umem_odp * 167 + rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length) 168 + { 169 + struct interval_tree_node *node; 170 + 171 + node = interval_tree_iter_first(root, addr, addr + length - 1); 172 + if (!node) 173 + return NULL; 174 + return container_of(node, struct ib_umem_odp, interval_tree); 175 + 176 + } 165 177 166 178 static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, 167 179 unsigned long mmu_seq) ··· 191 185 192 186 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ 193 187 194 - static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) 188 + static inline struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, 189 + unsigned long addr, 190 + size_t size, int access) 195 191 { 196 - return -EINVAL; 192 + return ERR_PTR(-EINVAL); 197 193 } 198 194 199 195 static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {}
+2 -5
include/rdma/ib_verbs.h
··· 1417 1417 1418 1418 bool cleanup_retryable; 1419 1419 1420 - void (*invalidate_range)(struct ib_umem_odp *umem_odp, 1421 - unsigned long start, unsigned long end); 1422 - struct mutex per_mm_list_lock; 1423 - struct list_head per_mm_list; 1424 - 1425 1420 struct ib_rdmacg_object cg_obj; 1426 1421 /* 1427 1422 * Implementation details of the RDMA core, don't use in drivers: ··· 2373 2378 u64 iova); 2374 2379 int (*unmap_fmr)(struct list_head *fmr_list); 2375 2380 int (*dealloc_fmr)(struct ib_fmr *fmr); 2381 + void (*invalidate_range)(struct ib_umem_odp *umem_odp, 2382 + unsigned long start, unsigned long end); 2376 2383 int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); 2377 2384 int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); 2378 2385 struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device,
-1
kernel/fork.c
··· 1009 1009 mm_init_owner(mm, p); 1010 1010 RCU_INIT_POINTER(mm->exe_file, NULL); 1011 1011 mmu_notifier_mm_init(mm); 1012 - hmm_mm_init(mm); 1013 1012 init_tlb_flush_pending(mm); 1014 1013 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 1015 1014 mm->pmd_huge_pte = NULL;
kernel/resource.c (+37 -20)
··· 1644 1644 EXPORT_SYMBOL(resource_list_free); 1645 1645 1646 1646 #ifdef CONFIG_DEVICE_PRIVATE 1647 + static struct resource *__request_free_mem_region(struct device *dev, 1648 + struct resource *base, unsigned long size, const char *name) 1649 + { 1650 + resource_size_t end, addr; 1651 + struct resource *res; 1652 + 1653 + size = ALIGN(size, 1UL << PA_SECTION_SHIFT); 1654 + end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); 1655 + addr = end - size + 1UL; 1656 + 1657 + for (; addr > size && addr >= base->start; addr -= size) { 1658 + if (region_intersects(addr, size, 0, IORES_DESC_NONE) != 1659 + REGION_DISJOINT) 1660 + continue; 1661 + 1662 + if (dev) 1663 + res = devm_request_mem_region(dev, addr, size, name); 1664 + else 1665 + res = request_mem_region(addr, size, name); 1666 + if (!res) 1667 + return ERR_PTR(-ENOMEM); 1668 + res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; 1669 + return res; 1670 + } 1671 + 1672 + return ERR_PTR(-ERANGE); 1673 + } 1674 + 1647 1675 /** 1648 1676 * devm_request_free_mem_region - find free region for device private memory 1649 1677 * ··· 1686 1658 struct resource *devm_request_free_mem_region(struct device *dev, 1687 1659 struct resource *base, unsigned long size) 1688 1660 { 1689 - resource_size_t end, addr; 1690 - struct resource *res; 1691 - 1692 - size = ALIGN(size, 1UL << PA_SECTION_SHIFT); 1693 - end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); 1694 - addr = end - size + 1UL; 1695 - 1696 - for (; addr > size && addr >= base->start; addr -= size) { 1697 - if (region_intersects(addr, size, 0, IORES_DESC_NONE) != 1698 - REGION_DISJOINT) 1699 - continue; 1700 - 1701 - res = devm_request_mem_region(dev, addr, size, dev_name(dev)); 1702 - if (!res) 1703 - return ERR_PTR(-ENOMEM); 1704 - res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; 1705 - return res; 1706 - } 1707 - 1708 - return ERR_PTR(-ERANGE); 1661 + return __request_free_mem_region(dev, base, size, dev_name(dev)); 1709 1662 } 1710 1663 
EXPORT_SYMBOL_GPL(devm_request_free_mem_region); 1664 + 1665 + struct resource *request_free_mem_region(struct resource *base, 1666 + unsigned long size, const char *name) 1667 + { 1668 + return __request_free_mem_region(NULL, base, size, name); 1669 + } 1670 + EXPORT_SYMBOL_GPL(request_free_mem_region); 1671 + 1711 1672 #endif /* CONFIG_DEVICE_PRIVATE */ 1712 1673 1713 1674 static int __init strict_iomem(char *str)
kernel/sched/core.c (+14 -5)
··· 3871 3871 /* 3872 3872 * Various schedule()-time debugging checks and statistics: 3873 3873 */ 3874 - static inline void schedule_debug(struct task_struct *prev) 3874 + static inline void schedule_debug(struct task_struct *prev, bool preempt) 3875 3875 { 3876 3876 #ifdef CONFIG_SCHED_STACK_END_CHECK 3877 3877 if (task_stack_end_corrupted(prev)) 3878 3878 panic("corrupted stack end detected inside scheduler\n"); 3879 + #endif 3880 + 3881 + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 3882 + if (!preempt && prev->state && prev->non_block_count) { 3883 + printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", 3884 + prev->comm, prev->pid, prev->non_block_count); 3885 + dump_stack(); 3886 + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 3887 + } 3879 3888 #endif 3880 3889 3881 3890 if (unlikely(in_atomic_preempt_off())) { ··· 3998 3989 rq = cpu_rq(cpu); 3999 3990 prev = rq->curr; 4000 3991 4001 - schedule_debug(prev); 3992 + schedule_debug(prev, preempt); 4002 3993 4003 3994 if (sched_feat(HRTICK)) 4004 3995 hrtick_clear(rq); ··· 6772 6763 rcu_sleep_check(); 6773 6764 6774 6765 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 6775 - !is_idle_task(current)) || 6766 + !is_idle_task(current) && !current->non_block_count) || 6776 6767 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || 6777 6768 oops_in_progress) 6778 6769 return; ··· 6788 6779 "BUG: sleeping function called from invalid context at %s:%d\n", 6789 6780 file, line); 6790 6781 printk(KERN_ERR 6791 - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 6792 - in_atomic(), irqs_disabled(), 6782 + "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", 6783 + in_atomic(), irqs_disabled(), current->non_block_count, 6793 6784 current->pid, current->comm); 6794 6785 6795 6786 if (task_stack_end_corrupted(current))
mm/Kconfig (+7 -13)
··· 669 669 670 670 If FS_DAX is enabled, then say Y. 671 671 672 - config MIGRATE_VMA_HELPER 673 - bool 674 - 675 672 config DEV_PAGEMAP_OPS 676 673 bool 677 674 675 + # 676 + # Helpers to mirror range of the CPU page tables of a process into device page 677 + # tables. 678 + # 678 679 config HMM_MIRROR 679 - bool "HMM mirror CPU page table into a device page table" 680 - depends on (X86_64 || PPC64) 681 - depends on MMU && 64BIT 682 - select MMU_NOTIFIER 683 - help 684 - Select HMM_MIRROR if you want to mirror range of the CPU page table of a 685 - process into a device page table. Here, mirror means "keep synchronized". 686 - Prerequisites: the device must provide the ability to write-protect its 687 - page tables (at PAGE_SIZE granularity), and must be able to recover from 688 - the resulting potential page faults. 680 + bool 681 + depends on MMU 682 + depends on MMU_NOTIFIER 689 683 690 684 config DEVICE_PRIVATE 691 685 bool "Unaddressable device memory (GPU memory, ...)"
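With `HMM_MIRROR` turned into a hidden symbol, users no longer enable it directly; a driver that mirrors CPU page tables selects it from its own Kconfig entry. A hypothetical driver entry (the `DRM_EXAMPLE_SVM` symbol is invented for illustration) might read:

```
config DRM_EXAMPLE_SVM
	bool "Enable shared virtual memory in the example driver"
	depends on DRM_EXAMPLE && MMU
	select HMM_MIRROR
	select MMU_NOTIFIER
	help
	  Illustrative only: with HMM_MIRROR no longer user-visible, a
	  driver that needs it selects the symbol here instead of asking
	  the user to enable it.
```

`MMU_NOTIFIER` is selected explicitly because `select` does not pull in a symbol's own `depends on` prerequisites.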
mm/hmm.c (+152 -338)
··· 8 8 * Refer to include/linux/hmm.h for information about heterogeneous memory 9 9 * management or HMM for short. 10 10 */ 11 - #include <linux/mm.h> 11 + #include <linux/pagewalk.h> 12 12 #include <linux/hmm.h> 13 13 #include <linux/init.h> 14 14 #include <linux/rmap.h> ··· 26 26 #include <linux/mmu_notifier.h> 27 27 #include <linux/memory_hotplug.h> 28 28 29 - static const struct mmu_notifier_ops hmm_mmu_notifier_ops; 30 - 31 - /** 32 - * hmm_get_or_create - register HMM against an mm (HMM internal) 33 - * 34 - * @mm: mm struct to attach to 35 - * Returns: returns an HMM object, either by referencing the existing 36 - * (per-process) object, or by creating a new one. 37 - * 38 - * This is not intended to be used directly by device drivers. If mm already 39 - * has an HMM struct then it get a reference on it and returns it. Otherwise 40 - * it allocates an HMM struct, initializes it, associate it with the mm and 41 - * returns it. 42 - */ 43 - static struct hmm *hmm_get_or_create(struct mm_struct *mm) 29 + static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm) 44 30 { 45 31 struct hmm *hmm; 46 32 47 - lockdep_assert_held_write(&mm->mmap_sem); 48 - 49 - /* Abuse the page_table_lock to also protect mm->hmm. 
*/ 50 - spin_lock(&mm->page_table_lock); 51 - hmm = mm->hmm; 52 - if (mm->hmm && kref_get_unless_zero(&mm->hmm->kref)) 53 - goto out_unlock; 54 - spin_unlock(&mm->page_table_lock); 55 - 56 - hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); 33 + hmm = kzalloc(sizeof(*hmm), GFP_KERNEL); 57 34 if (!hmm) 58 - return NULL; 35 + return ERR_PTR(-ENOMEM); 36 + 59 37 init_waitqueue_head(&hmm->wq); 60 38 INIT_LIST_HEAD(&hmm->mirrors); 61 39 init_rwsem(&hmm->mirrors_sem); 62 - hmm->mmu_notifier.ops = NULL; 63 40 INIT_LIST_HEAD(&hmm->ranges); 64 41 spin_lock_init(&hmm->ranges_lock); 65 - kref_init(&hmm->kref); 66 42 hmm->notifiers = 0; 67 - hmm->mm = mm; 68 - 69 - hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; 70 - if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { 71 - kfree(hmm); 72 - return NULL; 73 - } 74 - 75 - mmgrab(hmm->mm); 76 - 77 - /* 78 - * We hold the exclusive mmap_sem here so we know that mm->hmm is 79 - * still NULL or 0 kref, and is safe to update. 80 - */ 81 - spin_lock(&mm->page_table_lock); 82 - mm->hmm = hmm; 83 - 84 - out_unlock: 85 - spin_unlock(&mm->page_table_lock); 86 - return hmm; 43 + return &hmm->mmu_notifier; 87 44 } 88 45 89 - static void hmm_free_rcu(struct rcu_head *rcu) 46 + static void hmm_free_notifier(struct mmu_notifier *mn) 90 47 { 91 - struct hmm *hmm = container_of(rcu, struct hmm, rcu); 48 + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); 92 49 93 - mmdrop(hmm->mm); 50 + WARN_ON(!list_empty(&hmm->ranges)); 51 + WARN_ON(!list_empty(&hmm->mirrors)); 94 52 kfree(hmm); 95 - } 96 - 97 - static void hmm_free(struct kref *kref) 98 - { 99 - struct hmm *hmm = container_of(kref, struct hmm, kref); 100 - 101 - spin_lock(&hmm->mm->page_table_lock); 102 - if (hmm->mm->hmm == hmm) 103 - hmm->mm->hmm = NULL; 104 - spin_unlock(&hmm->mm->page_table_lock); 105 - 106 - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm); 107 - mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu); 108 - } 109 - 110 - static inline void hmm_put(struct hmm 
*hmm) 111 - { 112 - kref_put(&hmm->kref, hmm_free); 113 53 } 114 54 115 55 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) 116 56 { 117 57 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); 118 58 struct hmm_mirror *mirror; 119 - 120 - /* Bail out if hmm is in the process of being freed */ 121 - if (!kref_get_unless_zero(&hmm->kref)) 122 - return; 123 59 124 60 /* 125 61 * Since hmm_range_register() holds the mmget() lock hmm_release() is ··· 73 137 mirror->ops->release(mirror); 74 138 } 75 139 up_read(&hmm->mirrors_sem); 76 - 77 - hmm_put(hmm); 78 140 } 79 141 80 142 static void notifiers_decrement(struct hmm *hmm) ··· 99 165 { 100 166 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); 101 167 struct hmm_mirror *mirror; 102 - struct hmm_update update; 103 168 struct hmm_range *range; 104 169 unsigned long flags; 105 170 int ret = 0; 106 171 107 - if (!kref_get_unless_zero(&hmm->kref)) 108 - return 0; 109 - 110 - update.start = nrange->start; 111 - update.end = nrange->end; 112 - update.event = HMM_UPDATE_INVALIDATE; 113 - update.blockable = mmu_notifier_range_blockable(nrange); 114 - 115 172 spin_lock_irqsave(&hmm->ranges_lock, flags); 116 173 hmm->notifiers++; 117 174 list_for_each_entry(range, &hmm->ranges, list) { 118 - if (update.end < range->start || update.start >= range->end) 175 + if (nrange->end < range->start || nrange->start >= range->end) 119 176 continue; 120 177 121 178 range->valid = false; ··· 123 198 list_for_each_entry(mirror, &hmm->mirrors, list) { 124 199 int rc; 125 200 126 - rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update); 201 + rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange); 127 202 if (rc) { 128 - if (WARN_ON(update.blockable || rc != -EAGAIN)) 203 + if (WARN_ON(mmu_notifier_range_blockable(nrange) || 204 + rc != -EAGAIN)) 129 205 continue; 130 206 ret = -EAGAIN; 131 207 break; ··· 137 211 out: 138 212 if (ret) 139 213 notifiers_decrement(hmm); 140 - hmm_put(hmm); 141 
214 return ret; 142 215 } 143 216 ··· 145 220 { 146 221 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); 147 222 148 - if (!kref_get_unless_zero(&hmm->kref)) 149 - return; 150 - 151 223 notifiers_decrement(hmm); 152 - hmm_put(hmm); 153 224 } 154 225 155 226 static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { 156 227 .release = hmm_release, 157 228 .invalidate_range_start = hmm_invalidate_range_start, 158 229 .invalidate_range_end = hmm_invalidate_range_end, 230 + .alloc_notifier = hmm_alloc_notifier, 231 + .free_notifier = hmm_free_notifier, 159 232 }; 160 233 161 234 /* ··· 165 242 * 166 243 * To start mirroring a process address space, the device driver must register 167 244 * an HMM mirror struct. 245 + * 246 + * The caller cannot unregister the hmm_mirror while any ranges are 247 + * registered. 248 + * 249 + * Callers using this function must put a call to mmu_notifier_synchronize() 250 + * in their module exit functions. 168 251 */ 169 252 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) 170 253 { 254 + struct mmu_notifier *mn; 255 + 171 256 lockdep_assert_held_write(&mm->mmap_sem); 172 257 173 258 /* Sanity check */ 174 259 if (!mm || !mirror || !mirror->ops) 175 260 return -EINVAL; 176 261 177 - mirror->hmm = hmm_get_or_create(mm); 178 - if (!mirror->hmm) 179 - return -ENOMEM; 262 + mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm); 263 + if (IS_ERR(mn)) 264 + return PTR_ERR(mn); 265 + mirror->hmm = container_of(mn, struct hmm, mmu_notifier); 180 266 181 267 down_write(&mirror->hmm->mirrors_sem); 182 268 list_add(&mirror->list, &mirror->hmm->mirrors); ··· 209 277 down_write(&hmm->mirrors_sem); 210 278 list_del(&mirror->list); 211 279 up_write(&hmm->mirrors_sem); 212 - hmm_put(hmm); 280 + mmu_notifier_put(&hmm->mmu_notifier); 213 281 } 214 282 EXPORT_SYMBOL(hmm_mirror_unregister); 215 283 ··· 217 285 struct hmm_range *range; 218 286 struct dev_pagemap *pgmap; 219 287 unsigned long last; 220 - bool fault; 
221 - bool block; 288 + unsigned int flags; 222 289 }; 223 290 224 291 static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, ··· 229 298 struct vm_area_struct *vma = walk->vma; 230 299 vm_fault_t ret; 231 300 232 - flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; 233 - flags |= write_fault ? FAULT_FLAG_WRITE : 0; 301 + if (!vma) 302 + goto err; 303 + 304 + if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY) 305 + flags |= FAULT_FLAG_ALLOW_RETRY; 306 + if (write_fault) 307 + flags |= FAULT_FLAG_WRITE; 308 + 234 309 ret = handle_mm_fault(vma, addr, flags); 235 - if (ret & VM_FAULT_RETRY) 310 + if (ret & VM_FAULT_RETRY) { 311 + /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */ 236 312 return -EAGAIN; 237 - if (ret & VM_FAULT_ERROR) { 238 - *pfn = range->values[HMM_PFN_ERROR]; 239 - return -EFAULT; 240 313 } 314 + if (ret & VM_FAULT_ERROR) 315 + goto err; 241 316 242 317 return -EBUSY; 318 + 319 + err: 320 + *pfn = range->values[HMM_PFN_ERROR]; 321 + return -EFAULT; 243 322 } 244 323 245 324 static int hmm_pfns_bad(unsigned long addr, ··· 269 328 } 270 329 271 330 /* 272 - * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) 273 - * @start: range virtual start address (inclusive) 331 + * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s) 332 + * @addr: range virtual start address (inclusive) 274 333 * @end: range virtual end address (exclusive) 275 334 * @fault: should we fault or not ? 276 335 * @write_fault: write fault ? 
··· 287 346 struct hmm_vma_walk *hmm_vma_walk = walk->private; 288 347 struct hmm_range *range = hmm_vma_walk->range; 289 348 uint64_t *pfns = range->pfns; 290 - unsigned long i, page_size; 349 + unsigned long i; 291 350 292 351 hmm_vma_walk->last = addr; 293 - page_size = hmm_range_page_size(range); 294 - i = (addr - range->start) >> range->page_shift; 352 + i = (addr - range->start) >> PAGE_SHIFT; 295 353 296 - for (; addr < end; addr += page_size, i++) { 354 + if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE)) 355 + return -EPERM; 356 + 357 + for (; addr < end; addr += PAGE_SIZE, i++) { 297 358 pfns[i] = range->values[HMM_PFN_NONE]; 298 359 if (fault || write_fault) { 299 360 int ret; ··· 316 373 { 317 374 struct hmm_range *range = hmm_vma_walk->range; 318 375 319 - if (!hmm_vma_walk->fault) 376 + if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) 320 377 return; 321 378 322 379 /* 323 380 * So we not only consider the individual per page request we also 324 381 * consider the default flags requested for the range. The API can 325 - * be use in 2 fashions. The first one where the HMM user coalesce 326 - * multiple page fault into one request and set flags per pfns for 327 - * of those faults. The second one where the HMM user want to pre- 382 + * be used 2 ways. The first one where the HMM user coalesces 383 + * multiple page faults into one request and sets flags per pfn for 384 + * those faults. The second one where the HMM user wants to pre- 328 385 * fault a range with specific flags. For the latter one it is a 329 386 * waste to have the user pre-fill the pfn arrays with a default 330 387 * flags value. ··· 334 391 /* We aren't ask to do anything ... 
*/ 335 392 if (!(pfns & range->flags[HMM_PFN_VALID])) 336 393 return; 337 - /* If this is device memory than only fault if explicitly requested */ 394 + /* If this is device memory then only fault if explicitly requested */ 338 395 if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { 339 396 /* Do we fault on device memory ? */ 340 397 if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { ··· 361 418 { 362 419 unsigned long i; 363 420 364 - if (!hmm_vma_walk->fault) { 421 + if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) { 365 422 *fault = *write_fault = false; 366 423 return; 367 424 } ··· 401 458 range->flags[HMM_PFN_VALID]; 402 459 } 403 460 404 - static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) 405 - { 406 - if (!pud_present(pud)) 407 - return 0; 408 - return pud_write(pud) ? range->flags[HMM_PFN_VALID] | 409 - range->flags[HMM_PFN_WRITE] : 410 - range->flags[HMM_PFN_VALID]; 411 - } 412 - 413 - static int hmm_vma_handle_pmd(struct mm_walk *walk, 414 - unsigned long addr, 415 - unsigned long end, 416 - uint64_t *pfns, 417 - pmd_t pmd) 418 - { 419 461 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 462 + static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, 463 + unsigned long end, uint64_t *pfns, pmd_t pmd) 464 + { 420 465 struct hmm_vma_walk *hmm_vma_walk = walk->private; 421 466 struct hmm_range *range = hmm_vma_walk->range; 422 467 unsigned long pfn, npages, i; ··· 419 488 if (pmd_protnone(pmd) || fault || write_fault) 420 489 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 421 490 422 - pfn = pmd_pfn(pmd) + pte_index(addr); 491 + pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 423 492 for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { 424 493 if (pmd_devmap(pmd)) { 425 494 hmm_vma_walk->pgmap = get_dev_pagemap(pfn, ··· 435 504 } 436 505 hmm_vma_walk->last = end; 437 506 return 0; 438 - #else 439 - /* If THP is not enabled then we should never reach that code ! 
*/ 440 - return -EINVAL; 441 - #endif 442 507 } 508 + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ 509 + /* stub to allow the code below to compile */ 510 + int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, 511 + unsigned long end, uint64_t *pfns, pmd_t pmd); 512 + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 443 513 444 514 static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) 445 515 { ··· 457 525 { 458 526 struct hmm_vma_walk *hmm_vma_walk = walk->private; 459 527 struct hmm_range *range = hmm_vma_walk->range; 460 - struct vm_area_struct *vma = walk->vma; 461 528 bool fault, write_fault; 462 529 uint64_t cpu_flags; 463 530 pte_t pte = *ptep; ··· 477 546 swp_entry_t entry = pte_to_swp_entry(pte); 478 547 479 548 if (!non_swap_entry(entry)) { 549 + cpu_flags = pte_to_hmm_pfn_flags(range, pte); 550 + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, 551 + &fault, &write_fault); 480 552 if (fault || write_fault) 481 553 goto fault; 482 554 return 0; ··· 508 574 if (fault || write_fault) { 509 575 pte_unmap(ptep); 510 576 hmm_vma_walk->last = addr; 511 - migration_entry_wait(vma->vm_mm, 512 - pmdp, addr); 577 + migration_entry_wait(walk->mm, pmdp, addr); 513 578 return -EBUSY; 514 579 } 515 580 return 0; ··· 556 623 { 557 624 struct hmm_vma_walk *hmm_vma_walk = walk->private; 558 625 struct hmm_range *range = hmm_vma_walk->range; 559 - struct vm_area_struct *vma = walk->vma; 560 626 uint64_t *pfns = range->pfns; 561 627 unsigned long addr = start, i; 562 628 pte_t *ptep; 563 629 pmd_t pmd; 564 630 565 - 566 631 again: 567 632 pmd = READ_ONCE(*pmdp); 568 633 if (pmd_none(pmd)) 569 634 return hmm_vma_walk_hole(start, end, walk); 570 - 571 - if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB)) 572 - return hmm_pfns_bad(start, end, walk); 573 635 574 636 if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { 575 637 bool fault, write_fault; ··· 579 651 0, &fault, &write_fault); 580 652 if (fault || write_fault) { 581 653 
hmm_vma_walk->last = addr; 582 - pmd_migration_entry_wait(vma->vm_mm, pmdp); 654 + pmd_migration_entry_wait(walk->mm, pmdp); 583 655 return -EBUSY; 584 656 } 585 657 return 0; ··· 588 660 589 661 if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { 590 662 /* 591 - * No need to take pmd_lock here, even if some other threads 663 + * No need to take pmd_lock here, even if some other thread 592 664 * is splitting the huge pmd we will get that event through 593 665 * mmu_notifier callback. 594 666 * 595 - * So just read pmd value and check again its a transparent 667 + * So just read pmd value and check again it's a transparent 596 668 * huge or device mapping one and compute corresponding pfn 597 669 * values. 598 670 */ ··· 606 678 } 607 679 608 680 /* 609 - * We have handled all the valid case above ie either none, migration, 681 + * We have handled all the valid cases above ie either none, migration, 610 682 * huge or transparent huge. At this point either it is a valid pmd 611 683 * entry pointing to pte directory or it is a bad pmd that will not 612 684 * recover. ··· 642 714 return 0; 643 715 } 644 716 645 - static int hmm_vma_walk_pud(pud_t *pudp, 646 - unsigned long start, 647 - unsigned long end, 648 - struct mm_walk *walk) 717 + #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ 718 + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) 719 + static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) 720 + { 721 + if (!pud_present(pud)) 722 + return 0; 723 + return pud_write(pud) ? 
range->flags[HMM_PFN_VALID] | 724 + range->flags[HMM_PFN_WRITE] : 725 + range->flags[HMM_PFN_VALID]; 726 + } 727 + 728 + static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, 729 + struct mm_walk *walk) 649 730 { 650 731 struct hmm_vma_walk *hmm_vma_walk = walk->private; 651 732 struct hmm_range *range = hmm_vma_walk->range; ··· 718 781 719 782 return 0; 720 783 } 784 + #else 785 + #define hmm_vma_walk_pud NULL 786 + #endif 721 787 788 + #ifdef CONFIG_HUGETLB_PAGE 722 789 static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, 723 790 unsigned long start, unsigned long end, 724 791 struct mm_walk *walk) 725 792 { 726 - #ifdef CONFIG_HUGETLB_PAGE 727 - unsigned long addr = start, i, pfn, mask, size, pfn_inc; 793 + unsigned long addr = start, i, pfn; 728 794 struct hmm_vma_walk *hmm_vma_walk = walk->private; 729 795 struct hmm_range *range = hmm_vma_walk->range; 730 796 struct vm_area_struct *vma = walk->vma; 731 - struct hstate *h = hstate_vma(vma); 732 797 uint64_t orig_pfn, cpu_flags; 733 798 bool fault, write_fault; 734 799 spinlock_t *ptl; 735 800 pte_t entry; 736 801 int ret = 0; 737 802 738 - size = 1UL << huge_page_shift(h); 739 - mask = size - 1; 740 - if (range->page_shift != PAGE_SHIFT) { 741 - /* Make sure we are looking at full page. 
*/ 742 - if (start & mask) 743 - return -EINVAL; 744 - if (end < (start + size)) 745 - return -EINVAL; 746 - pfn_inc = size >> PAGE_SHIFT; 747 - } else { 748 - pfn_inc = 1; 749 - size = PAGE_SIZE; 750 - } 751 - 752 - 753 - ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); 803 + ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); 754 804 entry = huge_ptep_get(pte); 755 805 756 - i = (start - range->start) >> range->page_shift; 806 + i = (start - range->start) >> PAGE_SHIFT; 757 807 orig_pfn = range->pfns[i]; 758 808 range->pfns[i] = range->values[HMM_PFN_NONE]; 759 809 cpu_flags = pte_to_hmm_pfn_flags(range, entry); ··· 752 828 goto unlock; 753 829 } 754 830 755 - pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift); 756 - for (; addr < end; addr += size, i++, pfn += pfn_inc) 831 + pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); 832 + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) 757 833 range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | 758 834 cpu_flags; 759 835 hmm_vma_walk->last = end; ··· 765 841 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 766 842 767 843 return ret; 768 - #else /* CONFIG_HUGETLB_PAGE */ 769 - return -EINVAL; 770 - #endif 771 844 } 845 + #else 846 + #define hmm_vma_walk_hugetlb_entry NULL 847 + #endif /* CONFIG_HUGETLB_PAGE */ 772 848 773 849 static void hmm_pfns_clear(struct hmm_range *range, 774 850 uint64_t *pfns, ··· 783 859 * hmm_range_register() - start tracking change to CPU page table over a range 784 860 * @range: range 785 861 * @mm: the mm struct for the range of virtual address 786 - * @start: start virtual address (inclusive) 787 - * @end: end virtual address (exclusive) 788 - * @page_shift: expect page shift for the range 789 - * Returns 0 on success, -EFAULT if the address space is no longer valid 862 + * 863 + * Return: 0 on success, -EFAULT if the address space is no longer valid 790 864 * 791 865 * Track updates to the CPU page table see include/linux/hmm.h 792 866 */ 
793 - int hmm_range_register(struct hmm_range *range, 794 - struct hmm_mirror *mirror, 795 - unsigned long start, 796 - unsigned long end, 797 - unsigned page_shift) 867 + int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror) 798 868 { 799 - unsigned long mask = ((1UL << page_shift) - 1UL); 800 869 struct hmm *hmm = mirror->hmm; 801 870 unsigned long flags; 802 871 803 872 range->valid = false; 804 873 range->hmm = NULL; 805 874 806 - if ((start & mask) || (end & mask)) 875 + if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1))) 807 876 return -EINVAL; 808 - if (start >= end) 877 + if (range->start >= range->end) 809 878 return -EINVAL; 810 - 811 - range->page_shift = page_shift; 812 - range->start = start; 813 - range->end = end; 814 879 815 880 /* Prevent hmm_release() from running while the range is valid */ 816 - if (!mmget_not_zero(hmm->mm)) 881 + if (!mmget_not_zero(hmm->mmu_notifier.mm)) 817 882 return -EFAULT; 818 883 819 884 /* Initialize range to track CPU page table updates. 
*/ 820 885 spin_lock_irqsave(&hmm->ranges_lock, flags); 821 886 822 887 range->hmm = hmm; 823 - kref_get(&hmm->kref); 824 888 list_add(&range->list, &hmm->ranges); 825 889 826 890 /* ··· 840 928 spin_unlock_irqrestore(&hmm->ranges_lock, flags); 841 929 842 930 /* Drop reference taken by hmm_range_register() */ 843 - mmput(hmm->mm); 844 - hmm_put(hmm); 931 + mmput(hmm->mmu_notifier.mm); 845 932 846 933 /* 847 934 * The range is now invalid and the ref on the hmm is dropped, so ··· 852 941 } 853 942 EXPORT_SYMBOL(hmm_range_unregister); 854 943 855 - /* 856 - * hmm_range_snapshot() - snapshot CPU page table for a range 857 - * @range: range 858 - * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid 859 - * permission (for instance asking for write and range is read only), 860 - * -EBUSY if you need to retry, -EFAULT invalid (ie either no valid 861 - * vma or it is illegal to access that range), number of valid pages 862 - * in range->pfns[] (from range start address). 944 + static const struct mm_walk_ops hmm_walk_ops = { 945 + .pud_entry = hmm_vma_walk_pud, 946 + .pmd_entry = hmm_vma_walk_pmd, 947 + .pte_hole = hmm_vma_walk_hole, 948 + .hugetlb_entry = hmm_vma_walk_hugetlb_entry, 949 + }; 950 + 951 + /** 952 + * hmm_range_fault - try to fault some address in a virtual address range 953 + * @range: range being faulted 954 + * @flags: HMM_FAULT_* flags 863 955 * 864 - * This snapshots the CPU page table for a range of virtual addresses. Snapshot 865 - * validity is tracked by range struct. See in include/linux/hmm.h for example 866 - * on how to use. 
867 - */ 868 - long hmm_range_snapshot(struct hmm_range *range) 869 - { 870 - const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; 871 - unsigned long start = range->start, end; 872 - struct hmm_vma_walk hmm_vma_walk; 873 - struct hmm *hmm = range->hmm; 874 - struct vm_area_struct *vma; 875 - struct mm_walk mm_walk; 876 - 877 - lockdep_assert_held(&hmm->mm->mmap_sem); 878 - do { 879 - /* If range is no longer valid force retry. */ 880 - if (!range->valid) 881 - return -EBUSY; 882 - 883 - vma = find_vma(hmm->mm, start); 884 - if (vma == NULL || (vma->vm_flags & device_vma)) 885 - return -EFAULT; 886 - 887 - if (is_vm_hugetlb_page(vma)) { 888 - if (huge_page_shift(hstate_vma(vma)) != 889 - range->page_shift && 890 - range->page_shift != PAGE_SHIFT) 891 - return -EINVAL; 892 - } else { 893 - if (range->page_shift != PAGE_SHIFT) 894 - return -EINVAL; 895 - } 896 - 897 - if (!(vma->vm_flags & VM_READ)) { 898 - /* 899 - * If vma do not allow read access, then assume that it 900 - * does not allow write access, either. HMM does not 901 - * support architecture that allow write without read. 
-			 */
-			hmm_pfns_clear(range, range->pfns,
-				range->start, range->end);
-			return -EPERM;
-		}
-
-		range->vma = vma;
-		hmm_vma_walk.pgmap = NULL;
-		hmm_vma_walk.last = start;
-		hmm_vma_walk.fault = false;
-		hmm_vma_walk.range = range;
-		mm_walk.private = &hmm_vma_walk;
-		end = min(range->end, vma->vm_end);
-
-		mm_walk.vma = vma;
-		mm_walk.mm = vma->vm_mm;
-		mm_walk.pte_entry = NULL;
-		mm_walk.test_walk = NULL;
-		mm_walk.hugetlb_entry = NULL;
-		mm_walk.pud_entry = hmm_vma_walk_pud;
-		mm_walk.pmd_entry = hmm_vma_walk_pmd;
-		mm_walk.pte_hole = hmm_vma_walk_hole;
-		mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
-
-		walk_page_range(start, end, &mm_walk);
-		start = end;
-	} while (start < range->end);
-
-	return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
-}
-EXPORT_SYMBOL(hmm_range_snapshot);
-
-/*
- * hmm_range_fault() - try to fault some address in a virtual address range
- * @range: range being faulted
- * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Return: number of valid pages in range->pfns[] (from range start
- *         address). This may be zero. If the return value is negative,
- *         then one of the following values may be returned:
+ * Return: the number of valid pages in range->pfns[] (from range start
+ *         address), which may be zero. On error one of the following status
+ *         codes can be returned:
  *
- * -EINVAL  invalid arguments or mm or virtual address are in an
- *          invalid vma (for instance device file vma).
- * -ENOMEM: Out of memory.
- * -EPERM:  Invalid permission (for instance asking for write and
- *          range is read only).
- * -EAGAIN: If you need to retry and mmap_sem was drop. This can only
- *          happens if block argument is false.
- * -EBUSY:  If the the range is being invalidated and you should wait
- *          for invalidation to finish.
- * -EFAULT: Invalid (ie either no valid vma or it is illegal to access
- *          that range), number of valid pages in range->pfns[] (from
- *          range start address).
+ * -EINVAL:	Invalid arguments or mm or virtual address is in an invalid vma
+ *		(e.g., device file vma).
+ * -ENOMEM:	Out of memory.
+ * -EPERM:	Invalid permission (e.g., asking for write and range is read
+ *		only).
+ * -EAGAIN:	A page fault needs to be retried and mmap_sem was dropped.
+ * -EBUSY:	The range has been invalidated and the caller needs to wait for
+ *		the invalidation to finish.
+ * -EFAULT:	Invalid (i.e., either no valid vma or it is illegal to access
+ *		that range) number of valid pages in range->pfns[] (from
+ *		range start address).
  *
  * This is similar to a regular CPU page fault except that it will not trigger
  * any memory migration if the memory being faulted is not accessible by CPUs
···
  * On error, for one virtual address in the range, the function will mark the
  * corresponding HMM pfn entry with an error flag.
  */
-long hmm_range_fault(struct hmm_range *range, bool block)
+long hmm_range_fault(struct hmm_range *range, unsigned int flags)
 {
 	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
 	unsigned long start = range->start, end;
 	struct hmm_vma_walk hmm_vma_walk;
 	struct hmm *hmm = range->hmm;
 	struct vm_area_struct *vma;
-	struct mm_walk mm_walk;
 	int ret;
 
-	lockdep_assert_held(&hmm->mm->mmap_sem);
+	lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem);
 
 	do {
 		/* If range is no longer valid force retry. */
 		if (!range->valid)
 			return -EBUSY;
 
-		vma = find_vma(hmm->mm, start);
+		vma = find_vma(hmm->mmu_notifier.mm, start);
 		if (vma == NULL || (vma->vm_flags & device_vma))
 			return -EFAULT;
-
-		if (is_vm_hugetlb_page(vma)) {
-			if (huge_page_shift(hstate_vma(vma)) !=
-			    range->page_shift &&
-			    range->page_shift != PAGE_SHIFT)
-				return -EINVAL;
-		} else {
-			if (range->page_shift != PAGE_SHIFT)
-				return -EINVAL;
-		}
 
 		if (!(vma->vm_flags & VM_READ)) {
 			/*
···
 			return -EPERM;
 		}
 
-		range->vma = vma;
 		hmm_vma_walk.pgmap = NULL;
 		hmm_vma_walk.last = start;
-		hmm_vma_walk.fault = true;
-		hmm_vma_walk.block = block;
+		hmm_vma_walk.flags = flags;
 		hmm_vma_walk.range = range;
-		mm_walk.private = &hmm_vma_walk;
 		end = min(range->end, vma->vm_end);
 
-		mm_walk.vma = vma;
-		mm_walk.mm = vma->vm_mm;
-		mm_walk.pte_entry = NULL;
-		mm_walk.test_walk = NULL;
-		mm_walk.hugetlb_entry = NULL;
-		mm_walk.pud_entry = hmm_vma_walk_pud;
-		mm_walk.pmd_entry = hmm_vma_walk_pmd;
-		mm_walk.pte_hole = hmm_vma_walk_hole;
-		mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+		walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops,
+				&hmm_vma_walk);
 
 		do {
-			ret = walk_page_range(start, end, &mm_walk);
+			ret = walk_page_range(vma->vm_mm, start, end,
+					      &hmm_walk_ops, &hmm_vma_walk);
 			start = hmm_vma_walk.last;
 
 			/* Keep trying while the range is valid. */
···
 EXPORT_SYMBOL(hmm_range_fault);
 
 /**
- * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
- * @range: range being faulted
- * @device: device against to dma map page to
- * @daddrs: dma address of mapped pages
- * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been
- *         drop and you need to try again, some other error value otherwise
+ * hmm_range_dma_map - hmm_range_fault() and dma map page all in one.
+ * @range: range being faulted
+ * @device: device to map page to
+ * @daddrs: array of dma addresses for the mapped pages
+ * @flags: HMM_FAULT_*
  *
- * Note same usage pattern as hmm_range_fault().
+ * Return: the number of pages mapped on success (including zero), or any
+ * status return from hmm_range_fault() otherwise.
  */
-long hmm_range_dma_map(struct hmm_range *range,
-		       struct device *device,
-		       dma_addr_t *daddrs,
-		       bool block)
+long hmm_range_dma_map(struct hmm_range *range, struct device *device,
+		       dma_addr_t *daddrs, unsigned int flags)
 {
 	unsigned long i, npages, mapped;
 	long ret;
 
-	ret = hmm_range_fault(range, block);
+	ret = hmm_range_fault(range, flags);
 	if (ret <= 0)
 		return ret ? ret : -EBUSY;
 
···
 /**
  * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map()
  * @range: range being unmapped
- * @vma: the vma against which the range (optional)
  * @device: device against which dma map was done
  * @daddrs: dma address of mapped pages
  * @dirty: dirty page if it had the write flag set
···
  * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress.
  */
 long hmm_range_dma_unmap(struct hmm_range *range,
-			 struct vm_area_struct *vma,
 			 struct device *device,
 			 dma_addr_t *daddrs,
 			 bool dirty)
mm/madvise.c (+13 -29)
···
 #include <linux/file.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
+#include <linux/pagewalk.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
···
 	return 0;
 }
 
-static void force_swapin_readahead(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end)
-{
-	struct mm_walk walk = {
-		.mm = vma->vm_mm,
-		.pmd_entry = swapin_walk_pmd_entry,
-		.private = vma,
-	};
-
-	walk_page_range(start, end, &walk);
-
-	lru_add_drain();	/* Push any new pages onto the LRU now */
-}
+static const struct mm_walk_ops swapin_walk_ops = {
+	.pmd_entry		= swapin_walk_pmd_entry,
+};
 
 static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end,
···
 	*prev = vma;
 #ifdef CONFIG_SWAP
 	if (!file) {
-		force_swapin_readahead(vma, start, end);
+		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
+		lru_add_drain(); /* Push any new pages onto the LRU now */
 		return 0;
 	}
 
···
 	return 0;
 }
 
-static void madvise_free_page_range(struct mmu_gather *tlb,
-			     struct vm_area_struct *vma,
-			     unsigned long addr, unsigned long end)
-{
-	struct mm_walk free_walk = {
-		.pmd_entry = madvise_free_pte_range,
-		.mm = vma->vm_mm,
-		.private = tlb,
-	};
-
-	tlb_start_vma(tlb, vma);
-	walk_page_range(addr, end, &free_walk);
-	tlb_end_vma(tlb, vma);
-}
+static const struct mm_walk_ops madvise_free_walk_ops = {
+	.pmd_entry		= madvise_free_pte_range,
+};
 
 static int madvise_free_single_vma(struct vm_area_struct *vma,
 			unsigned long start_addr, unsigned long end_addr)
···
 	update_hiwater_rss(mm);
 
 	mmu_notifier_invalidate_range_start(&range);
-	madvise_free_page_range(&tlb, vma, range.start, range.end);
+	tlb_start_vma(&tlb, vma);
+	walk_page_range(vma->vm_mm, range.start, range.end,
+			&madvise_free_walk_ops, &tlb);
+	tlb_end_vma(&tlb, vma);
 	mmu_notifier_invalidate_range_end(&range);
 	tlb_finish_mmu(&tlb, range.start, range.end);
mm/memcontrol.c (+12 -13)
···
 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/sched/mm.h>
 #include <linux/shmem_fs.h>
 #include <linux/hugetlb.h>
···
 	return 0;
 }
 
+static const struct mm_walk_ops precharge_walk_ops = {
+	.pmd_entry	= mem_cgroup_count_precharge_pte_range,
+};
+
 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 {
 	unsigned long precharge;
 
-	struct mm_walk mem_cgroup_count_precharge_walk = {
-		.pmd_entry = mem_cgroup_count_precharge_pte_range,
-		.mm = mm,
-	};
 	down_read(&mm->mmap_sem);
-	walk_page_range(0, mm->highest_vm_end,
-			&mem_cgroup_count_precharge_walk);
+	walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
 	up_read(&mm->mmap_sem);
 
 	precharge = mc.precharge;
···
 	return ret;
 }
 
+static const struct mm_walk_ops charge_walk_ops = {
+	.pmd_entry	= mem_cgroup_move_charge_pte_range,
+};
+
 static void mem_cgroup_move_charge(void)
 {
-	struct mm_walk mem_cgroup_move_charge_walk = {
-		.pmd_entry = mem_cgroup_move_charge_pte_range,
-		.mm = mc.mm,
-	};
-
 	lru_add_drain_all();
 	/*
 	 * Signal lock_page_memcg() to take the memcg's move_lock
···
 	 * When we have consumed all precharges and failed in doing
 	 * additional charge, the page walk just aborts.
 	 */
-	walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+	walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
+			NULL);
 
 	up_read(&mc.mm->mmap_sem);
 	atomic_dec(&mc.from->moving_account);
mm/mempolicy.c (+8 -9)
···
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/mempolicy.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
 #include <linux/kernel.h>
···
 	return 1;
 }
 
+static const struct mm_walk_ops queue_pages_walk_ops = {
+	.hugetlb_entry		= queue_pages_hugetlb,
+	.pmd_entry		= queue_pages_pte_range,
+	.test_walk		= queue_pages_test_walk,
+};
+
 /*
  * Walk through page tables and collect pages to be migrated.
  *
···
 		.nmask = nodes,
 		.prev = NULL,
 	};
-	struct mm_walk queue_pages_walk = {
-		.hugetlb_entry = queue_pages_hugetlb,
-		.pmd_entry = queue_pages_pte_range,
-		.test_walk = queue_pages_test_walk,
-		.mm = mm,
-		.private = &qp,
-	};
 
-	return walk_page_range(start, end, &queue_pages_walk);
+	return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
 }
 
 /*
mm/memremap.c (+63 -42)
···
 EXPORT_SYMBOL(devmap_managed_key);
 static atomic_t devmap_managed_enable;
 
-static void devmap_managed_enable_put(void *data)
+static void devmap_managed_enable_put(void)
 {
 	if (atomic_dec_and_test(&devmap_managed_enable))
 		static_branch_disable(&devmap_managed_key);
 }
 
-static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
+static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
 {
 	if (!pgmap->ops || !pgmap->ops->page_free) {
 		WARN(1, "Missing page_free method\n");
···
 
 	if (atomic_inc_return(&devmap_managed_enable) == 1)
 		static_branch_enable(&devmap_managed_key);
-	return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL);
+	return 0;
 }
 #else
-static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
+static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
 {
 	return -EINVAL;
+}
+static void devmap_managed_enable_put(void)
+{
 }
 #endif /* CONFIG_DEV_PAGEMAP_OPS */
 
···
 	pgmap->ref = NULL;
 }
 
-static void devm_memremap_pages_release(void *data)
+void memunmap_pages(struct dev_pagemap *pgmap)
 {
-	struct dev_pagemap *pgmap = data;
-	struct device *dev = pgmap->dev;
 	struct resource *res = &pgmap->res;
 	unsigned long pfn;
 	int nid;
···
 
 	untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
 	pgmap_array_delete(res);
-	dev_WARN_ONCE(dev, pgmap->altmap.alloc,
-		      "%s: failed to free all reserved pages\n", __func__);
+	WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
+	devmap_managed_enable_put();
+}
+EXPORT_SYMBOL_GPL(memunmap_pages);
+
+static void devm_memremap_pages_release(void *data)
+{
+	memunmap_pages(data);
 }
 
 static void dev_pagemap_percpu_release(struct percpu_ref *ref)
···
 	complete(&pgmap->done);
 }
 
-/**
- * devm_memremap_pages - remap and provide memmap backing for the given resource
- * @dev: hosting device for @res
- * @pgmap: pointer to a struct dev_pagemap
- *
- * Notes:
- * 1/ At a minimum the res and type members of @pgmap must be initialized
- *    by the caller before passing it to this function
- *
- * 2/ The altmap field may optionally be initialized, in which case
- *    PGMAP_ALTMAP_VALID must be set in pgmap->flags.
- *
- * 3/ The ref field may optionally be provided, in which pgmap->ref must be
- *    'live' on entry and will be killed and reaped at
- *    devm_memremap_pages_release() time, or if this routine fails.
- *
- * 4/ res is expected to be a host memory range that could feasibly be
- *    treated as a "System RAM" range, i.e. not a device mmio range, but
- *    this is not enforced.
+/*
+ * Not device managed version of devm_memremap_pages, undone by
+ * memunmap_pages().  Please use devm_memremap_pages if you have a struct
+ * device available.
  */
-void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
+void *memremap_pages(struct dev_pagemap *pgmap, int nid)
 {
 	struct resource *res = &pgmap->res;
 	struct dev_pagemap *conflict_pgmap;
···
 		.altmap = pgmap_altmap(pgmap),
 	};
 	pgprot_t pgprot = PAGE_KERNEL;
-	int error, nid, is_ram;
+	int error, is_ram;
 	bool need_devmap_managed = true;
 
 	switch (pgmap->type) {
···
 	}
 
 	if (need_devmap_managed) {
-		error = devmap_managed_enable_get(dev, pgmap);
+		error = devmap_managed_enable_get(pgmap);
 		if (error)
 			return ERR_PTR(error);
 	}
 
 	conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL);
 	if (conflict_pgmap) {
-		dev_WARN(dev, "Conflicting mapping in same section\n");
+		WARN(1, "Conflicting mapping in same section\n");
 		put_dev_pagemap(conflict_pgmap);
 		error = -ENOMEM;
 		goto err_array;
···
 
 	conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL);
 	if (conflict_pgmap) {
-		dev_WARN(dev, "Conflicting mapping in same section\n");
+		WARN(1, "Conflicting mapping in same section\n");
 		put_dev_pagemap(conflict_pgmap);
 		error = -ENOMEM;
 		goto err_array;
···
 		goto err_array;
 	}
 
-	pgmap->dev = dev;
-
 	error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
 				PHYS_PFN(res->end), pgmap, GFP_KERNEL));
 	if (error)
 		goto err_array;
 
-	nid = dev_to_node(dev);
 	if (nid < 0)
 		nid = numa_mem_id();
 
···
 				PHYS_PFN(res->start),
 				PHYS_PFN(resource_size(res)), pgmap);
 	percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
-
-	error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
-			pgmap);
-	if (error)
-		return ERR_PTR(error);
-
 	return __va(res->start);
 
 err_add_memory:
···
 err_array:
 	dev_pagemap_kill(pgmap);
 	dev_pagemap_cleanup(pgmap);
+	devmap_managed_enable_put();
 	return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(memremap_pages);
+
+/**
+ * devm_memremap_pages - remap and provide memmap backing for the given resource
+ * @dev: hosting device for @res
+ * @pgmap: pointer to a struct dev_pagemap
+ *
+ * Notes:
+ * 1/ At a minimum the res and type members of @pgmap must be initialized
+ *    by the caller before passing it to this function
+ *
+ * 2/ The altmap field may optionally be initialized, in which case
+ *    PGMAP_ALTMAP_VALID must be set in pgmap->flags.
+ *
+ * 3/ The ref field may optionally be provided, in which case pgmap->ref must
+ *    be 'live' on entry and will be killed and reaped at
+ *    devm_memremap_pages_release() time, or if this routine fails.
+ *
+ * 4/ res is expected to be a host memory range that could feasibly be
+ *    treated as a "System RAM" range, i.e. not a device mmio range, but
+ *    this is not enforced.
+ */
+void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
+{
+	int error;
+	void *ret;
+
+	ret = memremap_pages(pgmap, dev_to_node(dev));
+	if (IS_ERR(ret))
+		return ret;
+
+	error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
+			pgmap);
+	if (error)
+		return ERR_PTR(error);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(devm_memremap_pages);
mm/migrate.c (+126 -150)
···
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
+#include <linux/pagewalk.h>
 #include <linux/pfn_t.h>
 #include <linux/memremap.h>
 #include <linux/userfaultfd_k.h>
···
 
 #endif /* CONFIG_NUMA */
 
-#if defined(CONFIG_MIGRATE_VMA_HELPER)
-struct migrate_vma {
-	struct vm_area_struct	*vma;
-	unsigned long		*dst;
-	unsigned long		*src;
-	unsigned long		cpages;
-	unsigned long		npages;
-	unsigned long		start;
-	unsigned long		end;
-};
-
+#ifdef CONFIG_DEVICE_PRIVATE
 static int migrate_vma_collect_hole(unsigned long start,
 				    unsigned long end,
 				    struct mm_walk *walk)
···
 			goto next;
 
 		page = device_private_entry_to_page(entry);
-		mpfn = migrate_pfn(page_to_pfn(page))|
-			MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+		mpfn = migrate_pfn(page_to_pfn(page)) |
+			MIGRATE_PFN_MIGRATE;
 		if (is_write_device_private_entry(entry))
 			mpfn |= MIGRATE_PFN_WRITE;
 	} else {
···
 	return 0;
 }
 
+static const struct mm_walk_ops migrate_vma_walk_ops = {
+	.pmd_entry		= migrate_vma_collect_pmd,
+	.pte_hole		= migrate_vma_collect_hole,
+};
+
 /*
  * migrate_vma_collect() - collect pages over a range of virtual addresses
  * @migrate: migrate struct containing all migration information
···
 static void migrate_vma_collect(struct migrate_vma *migrate)
 {
 	struct mmu_notifier_range range;
-	struct mm_walk mm_walk = {
-		.pmd_entry = migrate_vma_collect_pmd,
-		.pte_hole = migrate_vma_collect_hole,
-		.vma = migrate->vma,
-		.mm = migrate->vma->vm_mm,
-		.private = migrate,
-	};
 
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm,
-				migrate->start,
-				migrate->end);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL,
+			migrate->vma->vm_mm, migrate->start, migrate->end);
 	mmu_notifier_invalidate_range_start(&range);
-	walk_page_range(migrate->start, migrate->end, &mm_walk);
-	mmu_notifier_invalidate_range_end(&range);
 
+	walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
+			&migrate_vma_walk_ops, migrate);
+
+	mmu_notifier_invalidate_range_end(&range);
 	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
 }
 
···
 	}
 }
 
+/**
+ * migrate_vma_setup() - prepare to migrate a range of memory
+ * @args: contains the vma, start, and pfns arrays for the migration
+ *
+ * Returns: negative errno on failures, 0 when 0 or more pages were migrated
+ * without an error.
+ *
+ * Prepare to migrate a range of memory virtual address range by collecting all
+ * the pages backing each virtual address in the range, saving them inside the
+ * src array.  Then lock those pages and unmap them.  Once the pages are locked
+ * and unmapped, check whether each page is pinned or not.  Pages that aren't
+ * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
+ * corresponding src array entry.  Then restores any pages that are pinned, by
+ * remapping and unlocking those pages.
+ *
+ * The caller should then allocate destination memory and copy source memory to
+ * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
+ * flag set).  Once these are allocated and copied, the caller must update each
+ * corresponding entry in the dst array with the pfn value of the destination
+ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
+ * (destination pages must have their struct pages locked, via lock_page()).
+ *
+ * Note that the caller does not have to migrate all the pages that are marked
+ * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
+ * device memory to system memory.  If the caller cannot migrate a device page
+ * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
+ * consequences for the userspace process, so it must be avoided if at all
+ * possible.
+ *
+ * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
+ * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
+ * allowing the caller to allocate device memory for those unbacked virtual
+ * addresses.  For this the caller simply has to allocate device memory and
+ * properly set the destination entry like for regular migration.  Note that
+ * this can still fail, and thus the device driver must check if the migration
+ * was successful for those entries after calling migrate_vma_pages(), just
+ * like for regular migration.
+ *
+ * After that, the callers must call migrate_vma_pages() to go over each entry
+ * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
+ * set.  If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
+ * then migrate_vma_pages() tries to migrate struct page information from the
+ * source struct page to the destination struct page.  If it fails to migrate
+ * the struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in
+ * the src array.
+ *
+ * At this point all successfully migrated pages have an entry in the src
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
+ * array entry with MIGRATE_PFN_VALID flag set.
+ *
+ * Once migrate_vma_pages() returns the caller may inspect which pages were
+ * successfully migrated, and which were not.  Successfully migrated pages will
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
+ *
+ * It is safe to update device page table after migrate_vma_pages() because
+ * both destination and source page are still locked, and the mmap_sem is held
+ * in read mode (hence no one can unmap the range being migrated).
+ *
+ * Once the caller is done cleaning up things and updating its page table (if it
+ * chose to do so, this is not an obligation) it finally calls
+ * migrate_vma_finalize() to update the CPU page table to point to new pages
+ * for successfully migrated pages or otherwise restore the CPU page table to
+ * point to the original source pages.
+ */
+int migrate_vma_setup(struct migrate_vma *args)
+{
+	long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
+
+	args->start &= PAGE_MASK;
+	args->end &= PAGE_MASK;
+	if (!args->vma || is_vm_hugetlb_page(args->vma) ||
+	    (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+		return -EINVAL;
+	if (nr_pages <= 0)
+		return -EINVAL;
+	if (args->start < args->vma->vm_start ||
+	    args->start >= args->vma->vm_end)
+		return -EINVAL;
+	if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
+		return -EINVAL;
+	if (!args->src || !args->dst)
+		return -EINVAL;
+
+	memset(args->src, 0, sizeof(*args->src) * nr_pages);
+	args->cpages = 0;
+	args->npages = 0;
+
+	migrate_vma_collect(args);
+
+	if (args->cpages)
+		migrate_vma_prepare(args);
+	if (args->cpages)
+		migrate_vma_unmap(args);
+
+	/*
+	 * At this point pages are locked and unmapped, and thus they have
+	 * stable content and can safely be copied to destination memory that
+	 * is allocated by the drivers.
+	 */
+	return 0;
+
+}
+EXPORT_SYMBOL(migrate_vma_setup);
+
 static void migrate_vma_insert_page(struct migrate_vma *migrate,
 				    unsigned long addr,
 				    struct page *page,
···
 	*src &= ~MIGRATE_PFN_MIGRATE;
 }
 
-/*
+/**
  * migrate_vma_pages() - migrate meta-data from src page to dst page
  * @migrate: migrate struct containing all migration information
  *
···
  * struct page. This effectively finishes the migration from source page to the
  * destination page.
  */
-static void migrate_vma_pages(struct migrate_vma *migrate)
+void migrate_vma_pages(struct migrate_vma *migrate)
 {
 	const unsigned long npages = migrate->npages;
 	const unsigned long start = migrate->start;
···
 	if (notified)
 		mmu_notifier_invalidate_range_only_end(&range);
 }
+EXPORT_SYMBOL(migrate_vma_pages);
 
-/*
+/**
  * migrate_vma_finalize() - restore CPU page table entry
  * @migrate: migrate struct containing all migration information
  *
···
  * This also unlocks the pages and puts them back on the lru, or drops the extra
  * refcount, for device pages.
  */
-static void migrate_vma_finalize(struct migrate_vma *migrate)
+void migrate_vma_finalize(struct migrate_vma *migrate)
 {
 	const unsigned long npages = migrate->npages;
 	unsigned long i;
···
 		}
 	}
 }
-
-/*
- * migrate_vma() - migrate a range of memory inside vma
- *
- * @ops: migration callback for allocating destination memory and copying
- * @vma: virtual memory area containing the range to be migrated
- * @start: start address of the range to migrate (inclusive)
- * @end: end address of the range to migrate (exclusive)
- * @src: array of hmm_pfn_t containing source pfns
- * @dst: array of hmm_pfn_t containing destination pfns
- * @private: pointer passed back to each of the callback
- * Returns: 0 on success, error code otherwise
- *
- * This function tries to migrate a range of memory virtual address range, using
- * callbacks to allocate and copy memory from source to destination. First it
- * collects all the pages backing each virtual address in the range, saving this
- * inside the src array. Then it locks those pages and unmaps them. Once the pages
- * are locked and unmapped, it checks whether each page is pinned or not. Pages
- * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
- * in the corresponding src array entry. It then restores any pages that are
- * pinned, by remapping and unlocking those pages.
- *
- * At this point it calls the alloc_and_copy() callback. For documentation on
- * what is expected from that callback, see struct migrate_vma_ops comments in
- * include/linux/migrate.h
- *
- * After the alloc_and_copy() callback, this function goes over each entry in
- * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
- * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
- * then the function tries to migrate struct page information from the source
- * struct page to the destination struct page. If it fails to migrate the struct
- * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
- * array.
- *
- * At this point all successfully migrated pages have an entry in the src
- * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
- * array entry with MIGRATE_PFN_VALID flag set.
- *
- * It then calls the finalize_and_map() callback. See comments for "struct
- * migrate_vma_ops", in include/linux/migrate.h for details about
- * finalize_and_map() behavior.
- *
- * After the finalize_and_map() callback, for successfully migrated pages, this
- * function updates the CPU page table to point to new pages, otherwise it
- * restores the CPU page table to point to the original source pages.
- *
- * Function returns 0 after the above steps, even if no pages were migrated
- * (The function only returns an error if any of the arguments are invalid.)
- *
- * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
- * unsigned long entries.
- */
-int migrate_vma(const struct migrate_vma_ops *ops,
-		struct vm_area_struct *vma,
-		unsigned long start,
-		unsigned long end,
-		unsigned long *src,
-		unsigned long *dst,
-		void *private)
-{
-	struct migrate_vma migrate;
-
-	/* Sanity check the arguments */
-	start &= PAGE_MASK;
-	end &= PAGE_MASK;
-	if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
-			vma_is_dax(vma))
-		return -EINVAL;
-	if (start < vma->vm_start || start >= vma->vm_end)
-		return -EINVAL;
-	if (end <= vma->vm_start || end > vma->vm_end)
-		return -EINVAL;
-	if (!ops || !src || !dst || start >= end)
-		return -EINVAL;
-
-	memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
-	migrate.src = src;
-	migrate.dst = dst;
-	migrate.start = start;
-	migrate.npages = 0;
-	migrate.cpages = 0;
-	migrate.end = end;
-	migrate.vma = vma;
-
-	/* Collect, and try to unmap source pages */
-	migrate_vma_collect(&migrate);
-	if (!migrate.cpages)
-		return 0;
-
-	/* Lock and isolate page */
-	migrate_vma_prepare(&migrate);
-	if (!migrate.cpages)
-		return 0;
-
-	/* Unmap pages */
-	migrate_vma_unmap(&migrate);
-	if (!migrate.cpages)
-		return 0;
-
-	/*
-	 * At this point pages are locked and unmapped, and thus they have
-	 * stable content and can safely be copied to destination memory that
-	 * is allocated by the callback.
-	 *
-	 * Note that migration can fail in migrate_vma_struct_page() for each
-	 * individual page.
-	 */
-	ops->alloc_and_copy(vma, src, dst, start, end, private);
-
-	/* This does the real migration of struct page */
-	migrate_vma_pages(&migrate);
-
-	ops->finalize_and_map(vma, src, dst, start, end, private);
-
-	/* Unlock and remap pages */
-	migrate_vma_finalize(&migrate);
-
-	return 0;
-}
-EXPORT_SYMBOL(migrate_vma);
-#endif /* defined(MIGRATE_VMA_HELPER) */
+EXPORT_SYMBOL(migrate_vma_finalize);
+#endif /* CONFIG_DEVICE_PRIVATE */
mm/mincore.c (+8 -9)
···
  */
 #include <linux/pagemap.h>
 #include <linux/gfp.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/mman.h>
 #include <linux/syscalls.h>
 #include <linux/swap.h>
···
 		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
 }
 
+static const struct mm_walk_ops mincore_walk_ops = {
+	.pmd_entry		= mincore_pte_range,
+	.pte_hole		= mincore_unmapped_range,
+	.hugetlb_entry		= mincore_hugetlb,
+};
+
 /*
  * Do a chunk of "sys_mincore()". We've already checked
  * all the arguments, we hold the mmap semaphore: we should
···
 	struct vm_area_struct *vma;
 	unsigned long end;
 	int err;
-	struct mm_walk mincore_walk = {
-		.pmd_entry = mincore_pte_range,
-		.pte_hole = mincore_unmapped_range,
-		.hugetlb_entry = mincore_hugetlb,
-		.private = vec,
-	};
 
 	vma = find_vma(current->mm, addr);
 	if (!vma || addr < vma->vm_start)
···
 		memset(vec, 1, pages);
 		return pages;
 	}
-	mincore_walk.mm = vma->vm_mm;
-	err = walk_page_range(addr, end, &mincore_walk);
+	err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec);
 	if (err < 0)
 		return err;
 	return (end - addr) >> PAGE_SHIFT;
mm/mmu_notifier.c (+205 -64)
···
 /* global SRCU for all MMs */
 DEFINE_STATIC_SRCU(srcu);
 
-/*
- * This function allows mmu_notifier::release callback to delay a call to
- * a function that will free appropriate resources. The function must be
- * quick and must not block.
- */
-void mmu_notifier_call_srcu(struct rcu_head *rcu,
-			    void (*func)(struct rcu_head *rcu))
-{
-	call_srcu(&srcu, rcu, func);
-}
-EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu);
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
+	.name = "mmu_notifier_invalidate_range_start"
+};
+#endif
 
 /*
  * This function can't run concurrently against mmu_notifier_register
···
 	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
 		if (mn->ops->invalidate_range_start) {
-			int _ret = mn->ops->invalidate_range_start(mn, range);
+			int _ret;
+
+			if (!mmu_notifier_range_blockable(range))
+				non_block_start();
+			_ret = mn->ops->invalidate_range_start(mn, range);
+			if (!mmu_notifier_range_blockable(range))
+				non_block_end();
 			if (_ret) {
 				pr_info("%pS callback failed with %d in %sblockable context.\n",
 					mn->ops->invalidate_range_start, _ret,
 					!mmu_notifier_range_blockable(range) ? "non-" : "");
+				WARN_ON(mmu_notifier_range_blockable(range) ||
+					_ret != -EAGAIN);
 				ret = _ret;
 			}
 		}
···
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
 
 void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
 					 bool only_end)
···
 	struct mmu_notifier *mn;
 	int id;
 
+	lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
 	id = srcu_read_lock(&srcu);
 	hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
 		/*
···
 			mn->ops->invalidate_range(mn, range->mm,
 						  range->start,
 						  range->end);
-		if (mn->ops->invalidate_range_end)
+		if (mn->ops->invalidate_range_end) {
+			if (!mmu_notifier_range_blockable(range))
+				non_block_start();
 			mn->ops->invalidate_range_end(mn, range);
+			if (!mmu_notifier_range_blockable(range))
+				non_block_end();
+		}
 	}
 	srcu_read_unlock(&srcu, id);
+	lock_map_release(&__mmu_notifier_invalidate_range_start_map);
 }
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
 
 void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 				     unsigned long start, unsigned long end)
···
 	}
 	srcu_read_unlock(&srcu, id);
 }
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
 
-static int do_mmu_notifier_register(struct mmu_notifier *mn,
-				    struct mm_struct *mm,
-				    int take_mmap_sem)
+/*
+ * Same as mmu_notifier_register but here the caller must hold the
+ * mmap_sem in write mode.
+ */
+int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
 {
-	struct mmu_notifier_mm *mmu_notifier_mm;
+	struct mmu_notifier_mm *mmu_notifier_mm = NULL;
 	int ret;
 
+	lockdep_assert_held_write(&mm->mmap_sem);
 	BUG_ON(atomic_read(&mm->mm_users) <= 0);
 
-	ret = -ENOMEM;
-	mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
-	if (unlikely(!mmu_notifier_mm))
-		goto out;
+	if (IS_ENABLED(CONFIG_LOCKDEP)) {
+		fs_reclaim_acquire(GFP_KERNEL);
+		lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+		lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+		fs_reclaim_release(GFP_KERNEL);
+	}
 
-	if (take_mmap_sem)
-		down_write(&mm->mmap_sem);
+	mn->mm = mm;
+	mn->users = 1;
+
+	if (!mm->mmu_notifier_mm) {
+		/*
+		 * kmalloc cannot be called under mm_take_all_locks(), but we
+		 * know that mm->mmu_notifier_mm can't change while we hold
+		 * the write side of the mmap_sem.
+		 */
+		mmu_notifier_mm =
+			kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
+		if (!mmu_notifier_mm)
+			return -ENOMEM;
+
+		INIT_HLIST_HEAD(&mmu_notifier_mm->list);
+		spin_lock_init(&mmu_notifier_mm->lock);
+	}
+
 	ret = mm_take_all_locks(mm);
 	if (unlikely(ret))
 		goto out_clean;
 
-	if (!mm_has_notifiers(mm)) {
-		INIT_HLIST_HEAD(&mmu_notifier_mm->list);
-		spin_lock_init(&mmu_notifier_mm->lock);
-
-		mm->mmu_notifier_mm = mmu_notifier_mm;
-		mmu_notifier_mm = NULL;
-	}
+	/* Pairs with the mmdrop in mmu_notifier_unregister_* */
 	mmgrab(mm);
 
 	/*
···
 	 * We can't race against any other mmu notifier method either
 	 * thanks to mm_take_all_locks().
296 275 */ 276 + if (mmu_notifier_mm) 277 + mm->mmu_notifier_mm = mmu_notifier_mm; 278 + 297 279 spin_lock(&mm->mmu_notifier_mm->lock); 298 280 hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); 299 281 spin_unlock(&mm->mmu_notifier_mm->lock); 300 282 301 283 mm_drop_all_locks(mm); 302 - out_clean: 303 - if (take_mmap_sem) 304 - up_write(&mm->mmap_sem); 305 - kfree(mmu_notifier_mm); 306 - out: 307 284 BUG_ON(atomic_read(&mm->mm_users) <= 0); 285 + return 0; 286 + 287 + out_clean: 288 + kfree(mmu_notifier_mm); 308 289 return ret; 309 290 } 291 + EXPORT_SYMBOL_GPL(__mmu_notifier_register); 310 292 311 - /* 293 + /** 294 + * mmu_notifier_register - Register a notifier on a mm 295 + * @mn: The notifier to attach 296 + * @mm: The mm to attach the notifier to 297 + * 312 298 * Must not hold mmap_sem nor any other VM related lock when calling 313 299 * this registration function. Must also ensure mm_users can't go down 314 300 * to zero while this runs to avoid races with mmu_notifier_release, 315 301 * so mm has to be current->mm or the mm should be pinned safely such 316 302 * as with get_task_mm(). If the mm is not current->mm, the mm_users 317 303 * pin should be released by calling mmput after mmu_notifier_register 318 - * returns. mmu_notifier_unregister must be always called to 319 - * unregister the notifier. mm_count is automatically pinned to allow 320 - * mmu_notifier_unregister to safely run at any time later, before or 321 - * after exit_mmap. ->release will always be called before exit_mmap 322 - * frees the pages. 304 + * returns. 305 + * 306 + * mmu_notifier_unregister() or mmu_notifier_put() must be always called to 307 + * unregister the notifier. 308 + * 309 + * While the caller has a mmu_notifier get the mn->mm pointer will remain 310 + * valid, and can be converted to an active mm pointer via mmget_not_zero(). 
323 311 */ 324 312 int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) 325 313 { 326 - return do_mmu_notifier_register(mn, mm, 1); 314 + int ret; 315 + 316 + down_write(&mm->mmap_sem); 317 + ret = __mmu_notifier_register(mn, mm); 318 + up_write(&mm->mmap_sem); 319 + return ret; 327 320 } 328 321 EXPORT_SYMBOL_GPL(mmu_notifier_register); 329 322 330 - /* 331 - * Same as mmu_notifier_register but here the caller must hold the 332 - * mmap_sem in write mode. 333 - */ 334 - int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) 323 + static struct mmu_notifier * 324 + find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops) 335 325 { 336 - return do_mmu_notifier_register(mn, mm, 0); 326 + struct mmu_notifier *mn; 327 + 328 + spin_lock(&mm->mmu_notifier_mm->lock); 329 + hlist_for_each_entry_rcu (mn, &mm->mmu_notifier_mm->list, hlist) { 330 + if (mn->ops != ops) 331 + continue; 332 + 333 + if (likely(mn->users != UINT_MAX)) 334 + mn->users++; 335 + else 336 + mn = ERR_PTR(-EOVERFLOW); 337 + spin_unlock(&mm->mmu_notifier_mm->lock); 338 + return mn; 339 + } 340 + spin_unlock(&mm->mmu_notifier_mm->lock); 341 + return NULL; 337 342 } 338 - EXPORT_SYMBOL_GPL(__mmu_notifier_register); 343 + 344 + /** 345 + * mmu_notifier_get_locked - Return the single struct mmu_notifier for 346 + * the mm & ops 347 + * @ops: The operations struct being subscribe with 348 + * @mm : The mm to attach notifiers too 349 + * 350 + * This function either allocates a new mmu_notifier via 351 + * ops->alloc_notifier(), or returns an already existing notifier on the 352 + * list. The value of the ops pointer is used to determine when two notifiers 353 + * are the same. 354 + * 355 + * Each call to mmu_notifier_get() must be paired with a call to 356 + * mmu_notifier_put(). The caller must hold the write side of mm->mmap_sem. 
357 + * 358 + * While the caller has a mmu_notifier get the mm pointer will remain valid, 359 + * and can be converted to an active mm pointer via mmget_not_zero(). 360 + */ 361 + struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, 362 + struct mm_struct *mm) 363 + { 364 + struct mmu_notifier *mn; 365 + int ret; 366 + 367 + lockdep_assert_held_write(&mm->mmap_sem); 368 + 369 + if (mm->mmu_notifier_mm) { 370 + mn = find_get_mmu_notifier(mm, ops); 371 + if (mn) 372 + return mn; 373 + } 374 + 375 + mn = ops->alloc_notifier(mm); 376 + if (IS_ERR(mn)) 377 + return mn; 378 + mn->ops = ops; 379 + ret = __mmu_notifier_register(mn, mm); 380 + if (ret) 381 + goto out_free; 382 + return mn; 383 + out_free: 384 + mn->ops->free_notifier(mn); 385 + return ERR_PTR(ret); 386 + } 387 + EXPORT_SYMBOL_GPL(mmu_notifier_get_locked); 339 388 340 389 /* this is called after the last mmu_notifier_unregister() returned */ 341 390 void __mmu_notifier_mm_destroy(struct mm_struct *mm) ··· 466 375 } 467 376 EXPORT_SYMBOL_GPL(mmu_notifier_unregister); 468 377 469 - /* 470 - * Same as mmu_notifier_unregister but no callback and no srcu synchronization. 471 - */ 472 - void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, 473 - struct mm_struct *mm) 378 + static void mmu_notifier_free_rcu(struct rcu_head *rcu) 474 379 { 380 + struct mmu_notifier *mn = container_of(rcu, struct mmu_notifier, rcu); 381 + struct mm_struct *mm = mn->mm; 382 + 383 + mn->ops->free_notifier(mn); 384 + /* Pairs with the get in __mmu_notifier_register() */ 385 + mmdrop(mm); 386 + } 387 + 388 + /** 389 + * mmu_notifier_put - Release the reference on the notifier 390 + * @mn: The notifier to act on 391 + * 392 + * This function must be paired with each mmu_notifier_get(), it releases the 393 + * reference obtained by the get. If this is the last reference then process 394 + * to free the notifier will be run asynchronously. 
395 + * 396 + * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release 397 + * when the mm_struct is destroyed. Instead free_notifier is always called to 398 + * release any resources held by the user. 399 + * 400 + * As ops->release is not guaranteed to be called, the user must ensure that 401 + * all sptes are dropped, and no new sptes can be established before 402 + * mmu_notifier_put() is called. 403 + * 404 + * This function can be called from the ops->release callback, however the 405 + * caller must still ensure it is called pairwise with mmu_notifier_get(). 406 + * 407 + * Modules calling this function must call mmu_notifier_synchronize() in 408 + * their __exit functions to ensure the async work is completed. 409 + */ 410 + void mmu_notifier_put(struct mmu_notifier *mn) 411 + { 412 + struct mm_struct *mm = mn->mm; 413 + 475 414 spin_lock(&mm->mmu_notifier_mm->lock); 476 - /* 477 - * Can not use list_del_rcu() since __mmu_notifier_release 478 - * can delete it before we hold the lock. 479 - */ 415 + if (WARN_ON(!mn->users) || --mn->users) 416 + goto out_unlock; 480 417 hlist_del_init_rcu(&mn->hlist); 481 418 spin_unlock(&mm->mmu_notifier_mm->lock); 482 419 483 - BUG_ON(atomic_read(&mm->mm_count) <= 0); 484 - mmdrop(mm); 420 + call_srcu(&srcu, &mn->rcu, mmu_notifier_free_rcu); 421 + return; 422 + 423 + out_unlock: 424 + spin_unlock(&mm->mmu_notifier_mm->lock); 485 425 } 486 - EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); 426 + EXPORT_SYMBOL_GPL(mmu_notifier_put); 427 + 428 + /** 429 + * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed 430 + * 431 + * This function ensures that all outstanding async SRU work from 432 + * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops 433 + * associated with an unused mmu_notifier will no longer be called. 434 + * 435 + * Before using the caller must ensure that all of its mmu_notifiers have been 436 + * fully released via mmu_notifier_put(). 
437 + * 438 + * Modules using the mmu_notifier_put() API should call this in their __exit 439 + * function to avoid module unloading races. 440 + */ 441 + void mmu_notifier_synchronize(void) 442 + { 443 + synchronize_srcu(&srcu); 444 + } 445 + EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); 487 446 488 447 bool 489 448 mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
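The mmu_notifier changes above replace per-driver attachment code with a get/put idiom: `mmu_notifier_get_locked()` returns an existing notifier for the same `ops` on that mm (bumping `->users`) or allocates a new one, and `mmu_notifier_put()` drops the reference, deferring the free via SRCU. Below is a deliberately simplified userspace sketch of that refcounted lookup-by-ops pattern — no locking, no SRCU-deferred free, and every name (`notifier_get`, `notifier_put`, the list) is illustrative rather than a kernel API.

```c
#include <assert.h>
#include <stdlib.h>

struct notifier_ops { int unused; };	/* identity token, like mmu_notifier_ops */

struct notifier {
	const struct notifier_ops *ops;
	unsigned int users;
	struct notifier *next;
};

static struct notifier *notifier_list;

/* Find an existing notifier with the same ops and take a reference,
 * or allocate a fresh one, mirroring mmu_notifier_get_locked(). */
struct notifier *notifier_get(const struct notifier_ops *ops)
{
	struct notifier *n;

	for (n = notifier_list; n; n = n->next) {
		if (n->ops == ops) {
			n->users++;	/* existing attachment: just get */
			return n;
		}
	}
	n = calloc(1, sizeof(*n));
	if (!n)
		return NULL;
	n->ops = ops;
	n->users = 1;
	n->next = notifier_list;
	notifier_list = n;
	return n;
}

/* Drop a reference; only the last put unlinks and frees (the kernel
 * defers the actual free through SRCU instead of freeing inline). */
void notifier_put(struct notifier *n)
{
	struct notifier **pp;

	if (--n->users)
		return;
	for (pp = &notifier_list; *pp; pp = &(*pp)->next) {
		if (*pp == n) {
			*pp = n->next;
			break;
		}
	}
	free(n);
}
```

The key property, visible in `find_get_mmu_notifier()` above, is that the `ops` pointer itself is the identity: two drivers (or two opens of the same driver) registering with the same ops share one notifier, which is what lets the boilerplate be hoisted out of the drivers.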
+10 -16
mm/mprotect.c
··· 9 9 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved 10 10 */ 11 11 12 - #include <linux/mm.h> 12 + #include <linux/pagewalk.h> 13 13 #include <linux/hugetlb.h> 14 14 #include <linux/shm.h> 15 15 #include <linux/mman.h> ··· 329 329 return 0; 330 330 } 331 331 332 - static int prot_none_walk(struct vm_area_struct *vma, unsigned long start, 333 - unsigned long end, unsigned long newflags) 334 - { 335 - pgprot_t new_pgprot = vm_get_page_prot(newflags); 336 - struct mm_walk prot_none_walk = { 337 - .pte_entry = prot_none_pte_entry, 338 - .hugetlb_entry = prot_none_hugetlb_entry, 339 - .test_walk = prot_none_test, 340 - .mm = current->mm, 341 - .private = &new_pgprot, 342 - }; 343 - 344 - return walk_page_range(start, end, &prot_none_walk); 345 - } 332 + static const struct mm_walk_ops prot_none_walk_ops = { 333 + .pte_entry = prot_none_pte_entry, 334 + .hugetlb_entry = prot_none_hugetlb_entry, 335 + .test_walk = prot_none_test, 336 + }; 346 337 347 338 int 348 339 mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, ··· 360 369 if (arch_has_pfn_modify_check() && 361 370 (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && 362 371 (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) { 363 - error = prot_none_walk(vma, start, end, newflags); 372 + pgprot_t new_pgprot = vm_get_page_prot(newflags); 373 + 374 + error = walk_page_range(current->mm, start, end, 375 + &prot_none_walk_ops, &new_pgprot); 364 376 if (error) 365 377 return error; 366 378 }
+1 -1
mm/page_alloc.c
··· 5971 5971 } 5972 5972 } 5973 5973 5974 - pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev), 5974 + pr_info("%s initialised %lu pages in %ums\n", __func__, 5975 5975 size, jiffies_to_msecs(jiffies - start)); 5976 5976 } 5977 5977
+73 -53
mm/pagewalk.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 - #include <linux/mm.h> 2 + #include <linux/pagewalk.h> 3 3 #include <linux/highmem.h> 4 4 #include <linux/sched.h> 5 5 #include <linux/hugetlb.h> ··· 9 9 { 10 10 pte_t *pte; 11 11 int err = 0; 12 + const struct mm_walk_ops *ops = walk->ops; 12 13 13 14 pte = pte_offset_map(pmd, addr); 14 15 for (;;) { 15 - err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk); 16 + err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); 16 17 if (err) 17 18 break; 18 19 addr += PAGE_SIZE; ··· 31 30 { 32 31 pmd_t *pmd; 33 32 unsigned long next; 33 + const struct mm_walk_ops *ops = walk->ops; 34 34 int err = 0; 35 35 36 36 pmd = pmd_offset(pud, addr); ··· 39 37 again: 40 38 next = pmd_addr_end(addr, end); 41 39 if (pmd_none(*pmd) || !walk->vma) { 42 - if (walk->pte_hole) 43 - err = walk->pte_hole(addr, next, walk); 40 + if (ops->pte_hole) 41 + err = ops->pte_hole(addr, next, walk); 44 42 if (err) 45 43 break; 46 44 continue; ··· 49 47 * This implies that each ->pmd_entry() handler 50 48 * needs to know about pmd_trans_huge() pmds 51 49 */ 52 - if (walk->pmd_entry) 53 - err = walk->pmd_entry(pmd, addr, next, walk); 50 + if (ops->pmd_entry) 51 + err = ops->pmd_entry(pmd, addr, next, walk); 54 52 if (err) 55 53 break; 56 54 ··· 58 56 * Check this here so we only break down trans_huge 59 57 * pages when we _need_ to 60 58 */ 61 - if (!walk->pte_entry) 59 + if (!ops->pte_entry) 62 60 continue; 63 61 64 62 split_huge_pmd(walk->vma, pmd, addr); ··· 77 75 { 78 76 pud_t *pud; 79 77 unsigned long next; 78 + const struct mm_walk_ops *ops = walk->ops; 80 79 int err = 0; 81 80 82 81 pud = pud_offset(p4d, addr); ··· 85 82 again: 86 83 next = pud_addr_end(addr, end); 87 84 if (pud_none(*pud) || !walk->vma) { 88 - if (walk->pte_hole) 89 - err = walk->pte_hole(addr, next, walk); 85 + if (ops->pte_hole) 86 + err = ops->pte_hole(addr, next, walk); 90 87 if (err) 91 88 break; 92 89 continue; 93 90 } 94 91 95 - if (walk->pud_entry) { 92 + if 
(ops->pud_entry) { 96 93 spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); 97 94 98 95 if (ptl) { 99 - err = walk->pud_entry(pud, addr, next, walk); 96 + err = ops->pud_entry(pud, addr, next, walk); 100 97 spin_unlock(ptl); 101 98 if (err) 102 99 break; ··· 108 105 if (pud_none(*pud)) 109 106 goto again; 110 107 111 - if (walk->pmd_entry || walk->pte_entry) 108 + if (ops->pmd_entry || ops->pte_entry) 112 109 err = walk_pmd_range(pud, addr, next, walk); 113 110 if (err) 114 111 break; ··· 122 119 { 123 120 p4d_t *p4d; 124 121 unsigned long next; 122 + const struct mm_walk_ops *ops = walk->ops; 125 123 int err = 0; 126 124 127 125 p4d = p4d_offset(pgd, addr); 128 126 do { 129 127 next = p4d_addr_end(addr, end); 130 128 if (p4d_none_or_clear_bad(p4d)) { 131 - if (walk->pte_hole) 132 - err = walk->pte_hole(addr, next, walk); 129 + if (ops->pte_hole) 130 + err = ops->pte_hole(addr, next, walk); 133 131 if (err) 134 132 break; 135 133 continue; 136 134 } 137 - if (walk->pmd_entry || walk->pte_entry) 135 + if (ops->pmd_entry || ops->pte_entry) 138 136 err = walk_pud_range(p4d, addr, next, walk); 139 137 if (err) 140 138 break; ··· 149 145 { 150 146 pgd_t *pgd; 151 147 unsigned long next; 148 + const struct mm_walk_ops *ops = walk->ops; 152 149 int err = 0; 153 150 154 151 pgd = pgd_offset(walk->mm, addr); 155 152 do { 156 153 next = pgd_addr_end(addr, end); 157 154 if (pgd_none_or_clear_bad(pgd)) { 158 - if (walk->pte_hole) 159 - err = walk->pte_hole(addr, next, walk); 155 + if (ops->pte_hole) 156 + err = ops->pte_hole(addr, next, walk); 160 157 if (err) 161 158 break; 162 159 continue; 163 160 } 164 - if (walk->pmd_entry || walk->pte_entry) 161 + if (ops->pmd_entry || ops->pte_entry) 165 162 err = walk_p4d_range(pgd, addr, next, walk); 166 163 if (err) 167 164 break; ··· 188 183 unsigned long hmask = huge_page_mask(h); 189 184 unsigned long sz = huge_page_size(h); 190 185 pte_t *pte; 186 + const struct mm_walk_ops *ops = walk->ops; 191 187 int err = 0; 192 188 193 
189 do { ··· 196 190 pte = huge_pte_offset(walk->mm, addr & hmask, sz); 197 191 198 192 if (pte) 199 - err = walk->hugetlb_entry(pte, hmask, addr, next, walk); 200 - else if (walk->pte_hole) 201 - err = walk->pte_hole(addr, next, walk); 193 + err = ops->hugetlb_entry(pte, hmask, addr, next, walk); 194 + else if (ops->pte_hole) 195 + err = ops->pte_hole(addr, next, walk); 202 196 203 197 if (err) 204 198 break; ··· 226 220 struct mm_walk *walk) 227 221 { 228 222 struct vm_area_struct *vma = walk->vma; 223 + const struct mm_walk_ops *ops = walk->ops; 229 224 230 - if (walk->test_walk) 231 - return walk->test_walk(start, end, walk); 225 + if (ops->test_walk) 226 + return ops->test_walk(start, end, walk); 232 227 233 228 /* 234 229 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP ··· 241 234 */ 242 235 if (vma->vm_flags & VM_PFNMAP) { 243 236 int err = 1; 244 - if (walk->pte_hole) 245 - err = walk->pte_hole(start, end, walk); 237 + if (ops->pte_hole) 238 + err = ops->pte_hole(start, end, walk); 246 239 return err ? 
err : 1; 247 240 } 248 241 return 0; ··· 255 248 struct vm_area_struct *vma = walk->vma; 256 249 257 250 if (vma && is_vm_hugetlb_page(vma)) { 258 - if (walk->hugetlb_entry) 251 + if (walk->ops->hugetlb_entry) 259 252 err = walk_hugetlb_range(start, end, walk); 260 253 } else 261 254 err = walk_pgd_range(start, end, walk); ··· 265 258 266 259 /** 267 260 * walk_page_range - walk page table with caller specific callbacks 268 - * @start: start address of the virtual address range 269 - * @end: end address of the virtual address range 270 - * @walk: mm_walk structure defining the callbacks and the target address space 261 + * @mm: mm_struct representing the target process of page table walk 262 + * @start: start address of the virtual address range 263 + * @end: end address of the virtual address range 264 + * @ops: operation to call during the walk 265 + * @private: private data for callbacks' usage 271 266 * 272 - * Recursively walk the page table tree of the process represented by @walk->mm 267 + * Recursively walk the page table tree of the process represented by @mm 273 268 * within the virtual address range [@start, @end). During walking, we can do 274 269 * some caller-specific works for each entry, by setting up pmd_entry(), 275 270 * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these ··· 287 278 * 288 279 * Before starting to walk page table, some callers want to check whether 289 280 * they really want to walk over the current vma, typically by checking 290 - * its vm_flags. walk_page_test() and @walk->test_walk() are used for this 281 + * its vm_flags. walk_page_test() and @ops->test_walk() are used for this 291 282 * purpose. 292 283 * 293 284 * struct mm_walk keeps current values of some common data like vma and pmd, 294 285 * which are useful for the access from callbacks. If you want to pass some 295 - * caller-specific data to callbacks, @walk->private should be helpful. 
286 + * caller-specific data to callbacks, @private should be helpful. 296 287 * 297 288 * Locking: 298 - * Callers of walk_page_range() and walk_page_vma() should hold 299 - * @walk->mm->mmap_sem, because these function traverse vma list and/or 300 - * access to vma's data. 289 + * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem, 290 + * because these function traverse vma list and/or access to vma's data. 301 291 */ 302 - int walk_page_range(unsigned long start, unsigned long end, 303 - struct mm_walk *walk) 292 + int walk_page_range(struct mm_struct *mm, unsigned long start, 293 + unsigned long end, const struct mm_walk_ops *ops, 294 + void *private) 304 295 { 305 296 int err = 0; 306 297 unsigned long next; 307 298 struct vm_area_struct *vma; 299 + struct mm_walk walk = { 300 + .ops = ops, 301 + .mm = mm, 302 + .private = private, 303 + }; 308 304 309 305 if (start >= end) 310 306 return -EINVAL; 311 307 312 - if (!walk->mm) 308 + if (!walk.mm) 313 309 return -EINVAL; 314 310 315 - VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); 311 + lockdep_assert_held(&walk.mm->mmap_sem); 316 312 317 - vma = find_vma(walk->mm, start); 313 + vma = find_vma(walk.mm, start); 318 314 do { 319 315 if (!vma) { /* after the last vma */ 320 - walk->vma = NULL; 316 + walk.vma = NULL; 321 317 next = end; 322 318 } else if (start < vma->vm_start) { /* outside vma */ 323 - walk->vma = NULL; 319 + walk.vma = NULL; 324 320 next = min(end, vma->vm_start); 325 321 } else { /* inside vma */ 326 - walk->vma = vma; 322 + walk.vma = vma; 327 323 next = min(end, vma->vm_end); 328 324 vma = vma->vm_next; 329 325 330 - err = walk_page_test(start, next, walk); 326 + err = walk_page_test(start, next, &walk); 331 327 if (err > 0) { 332 328 /* 333 329 * positive return values are purely for ··· 345 331 if (err < 0) 346 332 break; 347 333 } 348 - if (walk->vma || walk->pte_hole) 349 - err = __walk_page_range(start, next, walk); 334 + if (walk.vma || 
walk.ops->pte_hole) 335 + err = __walk_page_range(start, next, &walk); 350 336 if (err) 351 337 break; 352 338 } while (start = next, start < end); 353 339 return err; 354 340 } 355 341 356 - int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) 342 + int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, 343 + void *private) 357 344 { 345 + struct mm_walk walk = { 346 + .ops = ops, 347 + .mm = vma->vm_mm, 348 + .vma = vma, 349 + .private = private, 350 + }; 358 351 int err; 359 352 360 - if (!walk->mm) 353 + if (!walk.mm) 361 354 return -EINVAL; 362 355 363 - VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); 364 - VM_BUG_ON(!vma); 365 - walk->vma = vma; 366 - err = walk_page_test(vma->vm_start, vma->vm_end, walk); 356 + lockdep_assert_held(&walk.mm->mmap_sem); 357 + 358 + err = walk_page_test(vma->vm_start, vma->vm_end, &walk); 367 359 if (err > 0) 368 360 return 0; 369 361 if (err < 0) 370 362 return err; 371 - return __walk_page_range(vma->vm_start, vma->vm_end, walk); 363 + return __walk_page_range(vma->vm_start, vma->vm_end, &walk); 372 364 }
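The pagewalk core above repeats one idiom at every level: compute `next = pXd_addr_end(addr, end)` to clamp the step to the current entry's boundary, then either report a hole or descend, looping with `addr = next`. A standalone userspace sketch of that clamping loop for a single made-up level follows; the shift, the `walk_state` struct, and all names are hypothetical, chosen only to make the pattern concrete.

```c
#include <assert.h>

/* One fake level covering 16 "addresses" per entry. */
#define L1_SHIFT 4
#define L1_SIZE  (1UL << L1_SHIFT)

/* Analogue of pmd_addr_end(): advance to the next entry boundary,
 * but never past the caller's end address. */
static unsigned long l1_addr_end(unsigned long addr, unsigned long end)
{
	unsigned long next = (addr + L1_SIZE) & ~(L1_SIZE - 1);

	return next < end ? next : end;
}

struct walk_state {
	const int *present;	/* one presence flag per level-1 entry */
	int entries;
	int holes;
};

static int walk(unsigned long addr, unsigned long end,
		struct walk_state *st)
{
	unsigned long next;

	do {
		next = l1_addr_end(addr, end);
		if (st->present[addr >> L1_SHIFT])
			st->entries++;	/* real walker descends a level here */
		else
			st->holes++;	/* like ops->pte_hole(addr, next, walk) */
	} while (addr = next, addr < end);
	return 0;
}
```

The `do { next = ...; } while (addr = next, addr < end)` shape is lifted directly from `walk_pgd_range()` and friends; the boundary clamp is what lets each level hand its children a sub-range that never crosses an entry of the level above.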
-1
tools/testing/nvdimm/test/iomap.c
··· 132 132 if (!nfit_res) 133 133 return devm_memremap_pages(dev, pgmap); 134 134 135 - pgmap->dev = dev; 136 135 if (!pgmap->ref) { 137 136 if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) 138 137 return ERR_PTR(-EINVAL);