Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull hmm updates from Jason Gunthorpe:
"Ralph has been working on nouveau's use of hmm_range_fault() and
migrate_vma(), which resulted in this small series. It adds reporting
of the page table order from hmm_range_fault() and some optimization
of migrate_vma():

- Report the size of the page table mapping out of hmm_range_fault().

This makes it easier to establish a large/huge/etc mapping in the
device's page table.

- Allow devices to ignore the invalidations during migration in cases
where the migration is not going to change the pages.

For instance, migrating pages to a device does not require the
device to invalidate pages that are already in device memory.

- Update nouveau and hmm_tests to use the above"
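
To illustrate the first point, here is a minimal sketch (not code from
this series) of how a driver fault handler might consume the reported
order for a single faulting address; my_fault_one_page() and
my_device_map() are hypothetical names and error handling is elided:

/*
 * Sketch only: map one faulting address using the CPU map order
 * reported by hmm_range_fault().  my_device_map() is a placeholder
 * for a driver-specific mapping routine.
 */
static int my_fault_one_page(struct hmm_range *range,
			     unsigned long fault_addr)
{
	unsigned long hmm_pfn = range->hmm_pfns[0];
	struct page *page;
	unsigned long size, start;

	if (!(hmm_pfn & HMM_PFN_VALID))
		return -EFAULT;

	page = hmm_pfn_to_page(hmm_pfn);
	/* Order 0 is a PAGE_SIZE mapping, order 9 is 2MiB on x86-64. */
	size = 1UL << (hmm_pfn_to_map_order(hmm_pfn) + PAGE_SHIFT);
	/* The compound page starts at the order-aligned virtual address. */
	start = fault_addr & ~(size - 1);
	page -= (fault_addr - start) >> PAGE_SHIFT;

	return my_device_map(start, size, page, hmm_pfn & HMM_PFN_WRITE);
}

nouveau_hmm_convert_pfn() in the nouveau_svm.c diff below performs the
same alignment before encoding the GPU PTE.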
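
For the second point, a sketch of an mmu_interval_notifier invalidate
callback that skips its own migration invalidations; my_owner stands in
for whatever pointer the driver stores in page->pgmap->owner and passes
as migrate_vma.pgmap_owner, and the driver's locking is elided:

static void *my_owner;	/* placeholder: the driver's pgmap owner */

static bool my_invalidate(struct mmu_interval_notifier *mni,
			  const struct mmu_notifier_range *range,
			  unsigned long cur_seq)
{
	/*
	 * The migration path updates the device mapping itself, so an
	 * invalidation for pages this driver owns can be ignored.
	 */
	if (range->event == MMU_NOTIFY_MIGRATE &&
	    range->migrate_pgmap_owner == my_owner)
		return true;

	mmu_interval_set_seq(mni, cur_seq);
	/* ... tear down the device mapping for [start, end) here ... */
	return true;
}

The nouveau_svm.c and lib/test_hmm.c hunks below add exactly this check
to their existing callbacks.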

* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
mm/hmm/test: use the new migration invalidation
nouveau/svm: use the new migration invalidation
mm/notifier: add migration invalidation type
mm/migrate: add a flags parameter to migrate_vma
nouveau: fix storing invalid ptes
nouveau/hmm: support mapping large sysmem pages
nouveau: fix mapping 2MB sysmem pages
nouveau/hmm: fault one page at a time
mm/hmm: add tests for hmm_pfn_to_map_order()
mm/hmm: provide the page mapping order in hmm_range_fault()

+411 -201
+3 -1
arch/powerpc/kvm/book3s_hv_uvmem.c
··· 400 400 mig.end = end; 401 401 mig.src = &src_pfn; 402 402 mig.dst = &dst_pfn; 403 + mig.flags = MIGRATE_VMA_SELECT_SYSTEM; 403 404 404 405 /* 405 406 * We come here with mmap_lock write lock held just for ··· 578 577 mig.end = end; 579 578 mig.src = &src_pfn; 580 579 mig.dst = &dst_pfn; 581 - mig.src_owner = &kvmppc_uvmem_pgmap; 580 + mig.pgmap_owner = &kvmppc_uvmem_pgmap; 581 + mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; 582 582 583 583 mutex_lock(&kvm->arch.uvmem_lock); 584 584 /* The requested page is already paged-out, nothing to do */
+15 -4
drivers/gpu/drm/nouveau/nouveau_dmem.c
··· 140 140 { 141 141 struct device *dev = drm->dev->dev; 142 142 struct page *dpage, *spage; 143 + struct nouveau_svmm *svmm; 143 144 144 145 spage = migrate_pfn_to_page(args->src[0]); 145 146 if (!spage || !(args->src[0] & MIGRATE_PFN_MIGRATE)) ··· 155 154 if (dma_mapping_error(dev, *dma_addr)) 156 155 goto error_free_page; 157 156 157 + svmm = spage->zone_device_data; 158 + mutex_lock(&svmm->mutex); 159 + nouveau_svmm_invalidate(svmm, args->start, args->end); 158 160 if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_HOST, *dma_addr, 159 161 NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage))) 160 162 goto error_dma_unmap; 163 + mutex_unlock(&svmm->mutex); 161 164 162 165 args->dst[0] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; 163 166 return 0; 164 167 165 168 error_dma_unmap: 169 + mutex_unlock(&svmm->mutex); 166 170 dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); 167 171 error_free_page: 168 172 __free_page(dpage); ··· 188 182 .end = vmf->address + PAGE_SIZE, 189 183 .src = &src, 190 184 .dst = &dst, 191 - .src_owner = drm->dev, 185 + .pgmap_owner = drm->dev, 186 + .flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE, 192 187 }; 193 188 194 189 /* ··· 537 530 } 538 531 539 532 static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, 540 - unsigned long src, dma_addr_t *dma_addr, u64 *pfn) 533 + struct nouveau_svmm *svmm, unsigned long src, 534 + dma_addr_t *dma_addr, u64 *pfn) 541 535 { 542 536 struct device *dev = drm->dev->dev; 543 537 struct page *dpage, *spage; ··· 568 560 goto out_free_page; 569 561 } 570 562 563 + dpage->zone_device_data = svmm; 571 564 *pfn = NVIF_VMM_PFNMAP_V0_V | NVIF_VMM_PFNMAP_V0_VRAM | 572 565 ((paddr >> PAGE_SHIFT) << NVIF_VMM_PFNMAP_V0_ADDR_SHIFT); 573 566 if (src & MIGRATE_PFN_WRITE) ··· 592 583 unsigned long addr = args->start, nr_dma = 0, i; 593 584 594 585 for (i = 0; addr < args->end; i++) { 595 - args->dst[i] = nouveau_dmem_migrate_copy_one(drm, args->src[i], 596 - dma_addrs + nr_dma, pfns + i); 586 + args->dst[i] = nouveau_dmem_migrate_copy_one(drm, svmm, 587 + args->src[i], dma_addrs + nr_dma, pfns + i); 597 588 if (!dma_mapping_error(drm->dev->dev, dma_addrs[nr_dma])) 598 589 nr_dma++; 599 590 addr += PAGE_SIZE; ··· 624 615 struct migrate_vma args = { 625 616 .vma = vma, 626 617 .start = start, 618 + .pgmap_owner = drm->dev, 619 + .flags = MIGRATE_VMA_SELECT_SYSTEM, 627 620 }; 628 621 unsigned long i; 629 622 u64 *pfns;
+108 -151
drivers/gpu/drm/nouveau/nouveau_svm.c
··· 93 93 return NULL; 94 94 } 95 95 96 - struct nouveau_svmm { 97 - struct mmu_notifier notifier; 98 - struct nouveau_vmm *vmm; 99 - struct { 100 - unsigned long start; 101 - unsigned long limit; 102 - } unmanaged; 103 - 104 - struct mutex mutex; 105 - }; 106 - 107 96 #define SVMM_DBG(s,f,a...) \ 108 97 NV_DEBUG((s)->vmm->cli->drm, "svm-%p: "f"\n", (s), ##a) 109 98 #define SVMM_ERR(s,f,a...) \ ··· 235 246 } 236 247 237 248 /* Invalidate SVMM address-range on GPU. */ 238 - static void 249 + void 239 250 nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit) 240 251 { 241 252 if (limit > start) { ··· 266 277 267 278 mutex_lock(&svmm->mutex); 268 279 if (unlikely(!svmm->vmm)) 280 + goto out; 281 + 282 + /* 283 + * Ignore invalidation callbacks for device private pages since 284 + * the invalidation is handled as part of the migration process. 285 + */ 286 + if (update->event == MMU_NOTIFY_MIGRATE && 287 + update->migrate_pgmap_owner == svmm->vmm->cli->drm->dev) 269 288 goto out; 270 289 271 290 if (limit > svmm->unmanaged.start && start < svmm->unmanaged.limit) { ··· 511 514 }; 512 515 513 516 static void nouveau_hmm_convert_pfn(struct nouveau_drm *drm, 514 - struct hmm_range *range, u64 *ioctl_addr) 517 + struct hmm_range *range, 518 + struct nouveau_pfnmap_args *args) 515 519 { 516 - unsigned long i, npages; 520 + struct page *page; 517 521 518 522 /* 519 - * The ioctl_addr prepared here is passed through nvif_object_ioctl() 523 + * The address prepared here is passed through nvif_object_ioctl() 520 524 * to an eventual DMA map in something like gp100_vmm_pgt_pfn() 521 525 * 522 526 * This is all just encoding the internal hmm representation into a 523 527 * different nouveau internal representation. 524 528 */ 525 - npages = (range->end - range->start) >> PAGE_SHIFT; 526 - for (i = 0; i < npages; ++i) { 527 - struct page *page; 528 - 529 - if (!(range->hmm_pfns[i] & HMM_PFN_VALID)) { 530 - ioctl_addr[i] = 0; 531 - continue; 532 - } 533 - 534 - page = hmm_pfn_to_page(range->hmm_pfns[i]); 535 - if (is_device_private_page(page)) 536 - ioctl_addr[i] = nouveau_dmem_page_addr(page) | 537 - NVIF_VMM_PFNMAP_V0_V | 538 - NVIF_VMM_PFNMAP_V0_VRAM; 539 - else 540 - ioctl_addr[i] = page_to_phys(page) | 541 - NVIF_VMM_PFNMAP_V0_V | 542 - NVIF_VMM_PFNMAP_V0_HOST; 543 - if (range->hmm_pfns[i] & HMM_PFN_WRITE) 544 - ioctl_addr[i] |= NVIF_VMM_PFNMAP_V0_W; 529 + if (!(range->hmm_pfns[0] & HMM_PFN_VALID)) { 530 + args->p.phys[0] = 0; 531 + return; 545 532 } 533 + 534 + page = hmm_pfn_to_page(range->hmm_pfns[0]); 535 + /* 536 + * Only map compound pages to the GPU if the CPU is also mapping the 537 + * page as a compound page. Otherwise, the PTE protections might not be 538 + * consistent (e.g., CPU only maps part of a compound page). 539 + * Note that the underlying page might still be larger than the 540 + * CPU mapping (e.g., a PUD sized compound page partially mapped with 541 + * a PMD sized page table entry). 
542 + */ 543 + if (hmm_pfn_to_map_order(range->hmm_pfns[0])) { 544 + unsigned long addr = args->p.addr; 545 + 546 + args->p.page = hmm_pfn_to_map_order(range->hmm_pfns[0]) + 547 + PAGE_SHIFT; 548 + args->p.size = 1UL << args->p.page; 549 + args->p.addr &= ~(args->p.size - 1); 550 + page -= (addr - args->p.addr) >> PAGE_SHIFT; 551 + } 552 + if (is_device_private_page(page)) 553 + args->p.phys[0] = nouveau_dmem_page_addr(page) | 554 + NVIF_VMM_PFNMAP_V0_V | 555 + NVIF_VMM_PFNMAP_V0_VRAM; 556 + else 557 + args->p.phys[0] = page_to_phys(page) | 558 + NVIF_VMM_PFNMAP_V0_V | 559 + NVIF_VMM_PFNMAP_V0_HOST; 560 + if (range->hmm_pfns[0] & HMM_PFN_WRITE) 561 + args->p.phys[0] |= NVIF_VMM_PFNMAP_V0_W; 546 562 } 547 563 548 564 static int nouveau_range_fault(struct nouveau_svmm *svmm, 549 - struct nouveau_drm *drm, void *data, u32 size, 550 - unsigned long hmm_pfns[], u64 *ioctl_addr, 565 + struct nouveau_drm *drm, 566 + struct nouveau_pfnmap_args *args, u32 size, 567 + unsigned long hmm_flags, 551 568 struct svm_notifier *notifier) 552 569 { 553 570 unsigned long timeout = 554 571 jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); 555 572 /* Have HMM fault pages within the fault window to the GPU. */ 573 + unsigned long hmm_pfns[1]; 556 574 struct hmm_range range = { 557 575 .notifier = &notifier->notifier, 558 576 .start = notifier->notifier.interval_tree.start, 559 577 .end = notifier->notifier.interval_tree.last + 1, 560 - .pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE, 578 + .default_flags = hmm_flags, 561 579 .hmm_pfns = hmm_pfns, 562 580 .dev_private_owner = drm->dev, 563 581 }; ··· 588 576 ret = hmm_range_fault(&range); 589 577 mmap_read_unlock(mm); 590 578 if (ret) { 591 - /* 592 - * FIXME: the input PFN_REQ flags are destroyed on 593 - * -EBUSY, we need to regenerate them, also for the 594 - * other continue below 595 - */ 596 579 if (ret == -EBUSY) 597 580 continue; 598 581 return ret; ··· 602 595 break; 603 596 } 604 597 605 - nouveau_hmm_convert_pfn(drm, &range, ioctl_addr); 598 + nouveau_hmm_convert_pfn(drm, &range, args); 606 599 607 600 svmm->vmm->vmm.object.client->super = true; 608 - ret = nvif_object_ioctl(&svmm->vmm->vmm.object, data, size, NULL); 601 + ret = nvif_object_ioctl(&svmm->vmm->vmm.object, args, size, NULL); 609 602 svmm->vmm->vmm.object.client->super = false; 610 603 mutex_unlock(&svmm->mutex); 611 604 ··· 622 615 struct nvif_object *device = &svm->drm->client.device.object; 623 616 struct nouveau_svmm *svmm; 624 617 struct { 625 - struct { 626 - struct nvif_ioctl_v0 i; 627 - struct nvif_ioctl_mthd_v0 m; 628 - struct nvif_vmm_pfnmap_v0 p; 629 - } i; 630 - u64 phys[16]; 618 + struct nouveau_pfnmap_args i; 619 + u64 phys[1]; 631 620 } args; 632 - unsigned long hmm_pfns[ARRAY_SIZE(args.phys)]; 633 - struct vm_area_struct *vma; 621 + unsigned long hmm_flags; 634 622 u64 inst, start, limit; 635 - int fi, fn, pi, fill; 623 + int fi, fn; 636 624 int replay = 0, ret; 637 625 638 626 /* Parse available fault buffer entries into a cache, and update ··· 694 692 * window into a single update. 
695 693 */ 696 694 start = buffer->fault[fi]->addr; 697 - limit = start + (ARRAY_SIZE(args.phys) << PAGE_SHIFT); 695 + limit = start + PAGE_SIZE; 698 696 if (start < svmm->unmanaged.limit) 699 697 limit = min_t(u64, limit, svmm->unmanaged.start); 700 - SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit); 698 + 699 + /* 700 + * Prepare the GPU-side update of all pages within the 701 + * fault window, determining required pages and access 702 + * permissions based on pending faults. 703 + */ 704 + args.i.p.addr = start; 705 + args.i.p.page = PAGE_SHIFT; 706 + args.i.p.size = PAGE_SIZE; 707 + /* 708 + * Determine required permissions based on GPU fault 709 + * access flags. 710 + * XXX: atomic? 711 + */ 712 + switch (buffer->fault[fi]->access) { 713 + case 0: /* READ. */ 714 + hmm_flags = HMM_PFN_REQ_FAULT; 715 + break; 716 + case 3: /* PREFETCH. */ 717 + hmm_flags = 0; 718 + break; 719 + default: 720 + hmm_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE; 721 + break; 722 + } 701 723 702 724 mm = svmm->notifier.mm; 703 725 if (!mmget_not_zero(mm)) { ··· 729 703 continue; 730 704 } 731 705 732 - /* Intersect fault window with the CPU VMA, cancelling 733 - * the fault if the address is invalid. 734 - */ 735 - mmap_read_lock(mm); 736 - vma = find_vma_intersection(mm, start, limit); 737 - if (!vma) { 738 - SVMM_ERR(svmm, "wndw %016llx-%016llx", start, limit); 739 - mmap_read_unlock(mm); 740 - mmput(mm); 741 - nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]); 742 - continue; 706 + notifier.svmm = svmm; 707 + ret = mmu_interval_notifier_insert(&notifier.notifier, mm, 708 + args.i.p.addr, args.i.p.size, 709 + &nouveau_svm_mni_ops); 710 + if (!ret) { 711 + ret = nouveau_range_fault(svmm, svm->drm, &args.i, 712 + sizeof(args), hmm_flags, &notifier); 713 + mmu_interval_notifier_remove(&notifier.notifier); 743 714 } 744 - start = max_t(u64, start, vma->vm_start); 745 - limit = min_t(u64, limit, vma->vm_end); 746 - mmap_read_unlock(mm); 747 - SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit); 715 + mmput(mm); 748 716 749 - if (buffer->fault[fi]->addr != start) { 750 - SVMM_ERR(svmm, "addr %016llx", buffer->fault[fi]->addr); 751 - mmput(mm); 752 - nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]); 753 - continue; 754 - } 755 - 756 - /* Prepare the GPU-side update of all pages within the 757 - * fault window, determining required pages and access 758 - * permissions based on pending faults. 759 - */ 760 - args.i.p.page = PAGE_SHIFT; 761 - args.i.p.addr = start; 762 - for (fn = fi, pi = 0;;) { 763 - /* Determine required permissions based on GPU fault 764 - * access flags. 765 - *XXX: atomic? 766 - */ 767 - switch (buffer->fault[fn]->access) { 768 - case 0: /* READ. */ 769 - hmm_pfns[pi++] = HMM_PFN_REQ_FAULT; 770 - break; 771 - case 3: /* PREFETCH. */ 772 - hmm_pfns[pi++] = 0; 773 - break; 774 - default: 775 - hmm_pfns[pi++] = HMM_PFN_REQ_FAULT | 776 - HMM_PFN_REQ_WRITE; 777 - break; 778 - } 779 - args.i.p.size = pi << PAGE_SHIFT; 780 - 717 + limit = args.i.p.addr + args.i.p.size; 718 + for (fn = fi; ++fn < buffer->fault_nr; ) { 781 719 /* It's okay to skip over duplicate addresses from the 782 720 * same SVMM as faults are ordered by access type such 783 721 * that only the first one needs to be handled. 784 722 * 785 723 * ie. WRITE faults appear first, thus any handling of 786 724 * pending READ faults will already be satisfied. 725 + * But if a large page is mapped, make sure subsequent 726 + * fault addresses have sufficient access permission. 
787 727 */ 788 - while (++fn < buffer->fault_nr && 789 - buffer->fault[fn]->svmm == svmm && 790 - buffer->fault[fn ]->addr == 791 - buffer->fault[fn - 1]->addr); 792 - 793 - /* If the next fault is outside the window, or all GPU 794 - * faults have been dealt with, we're done here. 795 - */ 796 - if (fn >= buffer->fault_nr || 797 - buffer->fault[fn]->svmm != svmm || 798 - buffer->fault[fn]->addr >= limit) 728 + if (buffer->fault[fn]->svmm != svmm || 729 + buffer->fault[fn]->addr >= limit || 730 + (buffer->fault[fi]->access == 0 /* READ. */ && 731 + !(args.phys[0] & NVIF_VMM_PFNMAP_V0_V)) || 732 + (buffer->fault[fi]->access != 0 /* READ. */ && 733 + buffer->fault[fi]->access != 3 /* PREFETCH. */ && 734 + !(args.phys[0] & NVIF_VMM_PFNMAP_V0_W))) 799 735 break; 800 - 801 - /* Fill in the gap between this fault and the next. */ 802 - fill = (buffer->fault[fn ]->addr - 803 - buffer->fault[fn - 1]->addr) >> PAGE_SHIFT; 804 - while (--fill) 805 - hmm_pfns[pi++] = 0; 806 736 } 807 737 808 - SVMM_DBG(svmm, "wndw %016llx-%016llx covering %d fault(s)", 809 - args.i.p.addr, 810 - args.i.p.addr + args.i.p.size, fn - fi); 738 + /* If handling failed completely, cancel all faults. */ 739 + if (ret) { 740 + while (fi < fn) { 741 + struct nouveau_svm_fault *fault = 742 + buffer->fault[fi++]; 811 743 812 - notifier.svmm = svmm; 813 - ret = mmu_interval_notifier_insert(&notifier.notifier, 814 - svmm->notifier.mm, 815 - args.i.p.addr, args.i.p.size, 816 - &nouveau_svm_mni_ops); 817 - if (!ret) { 818 - ret = nouveau_range_fault( 819 - svmm, svm->drm, &args, 820 - sizeof(args.i) + pi * sizeof(args.phys[0]), 821 - hmm_pfns, args.phys, &notifier); 822 - mmu_interval_notifier_remove(&notifier.notifier); 823 - } 824 - mmput(mm); 825 - 826 - /* Cancel any faults in the window whose pages didn't manage 827 - * to keep their valid bit, or stay writeable when required. 828 - * 829 - * If handling failed completely, cancel all faults. 830 - */ 831 - while (fi < fn) { 832 - struct nouveau_svm_fault *fault = buffer->fault[fi++]; 833 - pi = (fault->addr - args.i.p.addr) >> PAGE_SHIFT; 834 - if (ret || 835 - !(args.phys[pi] & NVIF_VMM_PFNMAP_V0_V) || 836 - (!(args.phys[pi] & NVIF_VMM_PFNMAP_V0_W) && 837 - fault->access != 0 && fault->access != 3)) { 838 744 nouveau_svm_fault_cancel_fault(svm, fault); 839 - continue; 840 745 } 746 + } else 841 747 replay++; 842 - } 843 748 } 844 749 845 750 /* Issue fault replay to the GPU. */
+12 -1
drivers/gpu/drm/nouveau/nouveau_svm.h
··· 1 1 #ifndef __NOUVEAU_SVM_H__ 2 2 #define __NOUVEAU_SVM_H__ 3 3 #include <nvif/os.h> 4 + #include <linux/mmu_notifier.h> 4 5 struct drm_device; 5 6 struct drm_file; 6 7 struct nouveau_drm; 7 8 8 - struct nouveau_svmm; 9 + struct nouveau_svmm { 10 + struct mmu_notifier notifier; 11 + struct nouveau_vmm *vmm; 12 + struct { 13 + unsigned long start; 14 + unsigned long limit; 15 + } unmanaged; 16 + 17 + struct mutex mutex; 18 + }; 9 19 10 20 #if IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) 11 21 void nouveau_svm_init(struct nouveau_drm *); ··· 29 19 void nouveau_svmm_part(struct nouveau_svmm *, u64 inst); 30 20 int nouveau_svmm_bind(struct drm_device *, void *, struct drm_file *); 31 21 22 + void nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit); 32 23 u64 *nouveau_pfns_alloc(unsigned long npages); 33 24 void nouveau_pfns_free(u64 *pfns); 34 25 void nouveau_pfns_map(struct nouveau_svmm *svmm, struct mm_struct *mm,
+2 -3
drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c
··· 1204 1204 /*TODO: 1205 1205 * - Avoid PT readback (for dma_unmap etc), this might end up being dealt 1206 1206 * with inside HMM, which would be a lot nicer for us to deal with. 1207 - * - Multiple page sizes (particularly for huge page support). 1208 1207 * - Support for systems without a 4KiB page size. 1209 1208 */ 1210 1209 int ··· 1219 1220 /* Only support mapping where the page size of the incoming page 1220 1221 * array matches a page size available for direct mapping. 1221 1222 */ 1222 - while (page->shift && page->shift != shift && 1223 - page->desc->func->pfn == NULL) 1223 + while (page->shift && (page->shift != shift || 1224 + page->desc->func->pfn == NULL)) 1224 1225 page++; 1225 1226 1226 1227 if (!page->shift || !IS_ALIGNED(addr, 1ULL << shift) ||
+89 -2
drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c
··· 79 79 dma_addr_t addr; 80 80 81 81 nvkm_kmap(pt->memory); 82 - while (ptes--) { 82 + for (; ptes; ptes--, map->pfn++) { 83 83 u64 data = 0; 84 + 85 + if (!(*map->pfn & NVKM_VMM_PFN_V)) 86 + continue; 87 + 84 88 if (!(*map->pfn & NVKM_VMM_PFN_W)) 85 89 data |= BIT_ULL(6); /* RO. */ 86 90 ··· 104 100 } 105 101 106 102 VMM_WO064(pt, vmm, ptei++ * 8, data); 107 - map->pfn++; 108 103 } 109 104 nvkm_done(pt->memory); 110 105 } ··· 261 258 VMM_FO128(pt, vmm, pdei * 0x10, 0ULL, 0ULL, pdes); 262 259 } 263 260 261 + static void 262 + gp100_vmm_pd0_pfn_unmap(struct nvkm_vmm *vmm, 263 + struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes) 264 + { 265 + struct device *dev = vmm->mmu->subdev.device->dev; 266 + dma_addr_t addr; 267 + 268 + nvkm_kmap(pt->memory); 269 + while (ptes--) { 270 + u32 datalo = nvkm_ro32(pt->memory, pt->base + ptei * 16 + 0); 271 + u32 datahi = nvkm_ro32(pt->memory, pt->base + ptei * 16 + 4); 272 + u64 data = (u64)datahi << 32 | datalo; 273 + 274 + if ((data & (3ULL << 1)) != 0) { 275 + addr = (data >> 8) << 12; 276 + dma_unmap_page(dev, addr, 1UL << 21, DMA_BIDIRECTIONAL); 277 + } 278 + ptei++; 279 + } 280 + nvkm_done(pt->memory); 281 + } 282 + 283 + static bool 284 + gp100_vmm_pd0_pfn_clear(struct nvkm_vmm *vmm, 285 + struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes) 286 + { 287 + bool dma = false; 288 + 289 + nvkm_kmap(pt->memory); 290 + while (ptes--) { 291 + u32 datalo = nvkm_ro32(pt->memory, pt->base + ptei * 16 + 0); 292 + u32 datahi = nvkm_ro32(pt->memory, pt->base + ptei * 16 + 4); 293 + u64 data = (u64)datahi << 32 | datalo; 294 + 295 + if ((data & BIT_ULL(0)) && (data & (3ULL << 1)) != 0) { 296 + VMM_WO064(pt, vmm, ptei * 16, data & ~BIT_ULL(0)); 297 + dma = true; 298 + } 299 + ptei++; 300 + } 301 + nvkm_done(pt->memory); 302 + return dma; 303 + } 304 + 305 + static void 306 + gp100_vmm_pd0_pfn(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, 307 + u32 ptei, u32 ptes, struct nvkm_vmm_map *map) 308 + { 309 + struct device *dev = vmm->mmu->subdev.device->dev; 310 + dma_addr_t addr; 311 + 312 + nvkm_kmap(pt->memory); 313 + for (; ptes; ptes--, map->pfn++) { 314 + u64 data = 0; 315 + 316 + if (!(*map->pfn & NVKM_VMM_PFN_V)) 317 + continue; 318 + 319 + if (!(*map->pfn & NVKM_VMM_PFN_W)) 320 + data |= BIT_ULL(6); /* RO. */ 321 + 322 + if (!(*map->pfn & NVKM_VMM_PFN_VRAM)) { 323 + addr = *map->pfn >> NVKM_VMM_PFN_ADDR_SHIFT; 324 + addr = dma_map_page(dev, pfn_to_page(addr), 0, 325 + 1UL << 21, DMA_BIDIRECTIONAL); 326 + if (!WARN_ON(dma_mapping_error(dev, addr))) { 327 + data |= addr >> 4; 328 + data |= 2ULL << 1; /* SYSTEM_COHERENT_MEMORY. */ 329 + data |= BIT_ULL(3); /* VOL. */ 330 + data |= BIT_ULL(0); /* VALID. */ 331 + } 332 + } else { 333 + data |= (*map->pfn & NVKM_VMM_PFN_ADDR) >> 4; 334 + data |= BIT_ULL(0); /* VALID. */ 335 + } 336 + 337 + VMM_WO064(pt, vmm, ptei++ * 16, data); 338 + } 339 + nvkm_done(pt->memory); 340 + } 341 + 264 342 static const struct nvkm_vmm_desc_func 265 343 gp100_vmm_desc_pd0 = { 266 344 .unmap = gp100_vmm_pd0_unmap, 267 345 .sparse = gp100_vmm_pd0_sparse, 268 346 .pde = gp100_vmm_pd0_pde, 269 347 .mem = gp100_vmm_pd0_mem, 348 + .pfn = gp100_vmm_pd0_pfn, 349 + .pfn_clear = gp100_vmm_pd0_pfn_clear, 350 + .pfn_unmap = gp100_vmm_pd0_pfn_unmap, 270 351 }; 271 352 272 353 static void
+22 -2
include/linux/hmm.h
··· 37 37 * will fail. Must be combined with HMM_PFN_REQ_FAULT. 38 38 */ 39 39 enum hmm_pfn_flags { 40 - /* Output flags */ 40 + /* Output fields and flags */ 41 41 HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1), 42 42 HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2), 43 43 HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3), 44 + HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 8), 44 45 45 46 /* Input flags */ 46 47 HMM_PFN_REQ_FAULT = HMM_PFN_VALID, 47 48 HMM_PFN_REQ_WRITE = HMM_PFN_WRITE, 48 49 49 - HMM_PFN_FLAGS = HMM_PFN_VALID | HMM_PFN_WRITE | HMM_PFN_ERROR, 50 + HMM_PFN_FLAGS = 0xFFUL << HMM_PFN_ORDER_SHIFT, 50 51 }; 51 52 52 53 /* ··· 60 59 static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn) 61 60 { 62 61 return pfn_to_page(hmm_pfn & ~HMM_PFN_FLAGS); 62 + } 63 + 64 + /* 65 + * hmm_pfn_to_map_order() - return the CPU mapping size order 66 + * 67 + * This is optionally useful to optimize processing of the pfn result 68 + * array. It indicates that the page starts at the order aligned VA and is 69 + * 1<<order bytes long. Every pfn within an high order page will have the 70 + * same pfn flags, both access protections and the map_order. The caller must 71 + * be careful with edge cases as the start and end VA of the given page may 72 + * extend past the range used with hmm_range_fault(). 73 + * 74 + * This must be called under the caller 'user_lock' after a successful 75 + * mmu_interval_read_begin(). The caller must have tested for HMM_PFN_VALID 76 + * already. 77 + */ 78 + static inline unsigned int hmm_pfn_to_map_order(unsigned long hmm_pfn) 79 + { 80 + return (hmm_pfn >> HMM_PFN_ORDER_SHIFT) & 0x1F; 63 81 } 64 82 65 83 /*
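
Assuming a 64-bit unsigned long, each hmm_pfn entry now carries the
following layout; the order field is what hmm_pfn_to_map_order()
extracts:

/*
 *  bit  63        HMM_PFN_VALID
 *  bit  62        HMM_PFN_WRITE
 *  bit  61        HMM_PFN_ERROR
 *  bits 60..56    CPU map order (hmm_pfn_to_map_order())
 *  bits 55..0     page frame number (hmm_pfn_to_page())
 */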
+12 -4
include/linux/migrate.h
··· 180 180 return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; 181 181 } 182 182 183 + enum migrate_vma_direction { 184 + MIGRATE_VMA_SELECT_SYSTEM = 1 << 0, 185 + MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1, 186 + }; 187 + 183 188 struct migrate_vma { 184 189 struct vm_area_struct *vma; 185 190 /* ··· 204 199 205 200 /* 206 201 * Set to the owner value also stored in page->pgmap->owner for 207 - * migrating out of device private memory. If set only device 208 - * private pages with this owner are migrated. If not set 209 - * device private pages are not migrated at all. 202 + * migrating out of device private memory. The flags also need to 203 + * be set to MIGRATE_VMA_SELECT_DEVICE_PRIVATE. 204 + * The caller should always set this field when using mmu notifier 205 + * callbacks to avoid device MMU invalidations for device private 206 + * pages that are not being migrated. 210 207 */ 211 - void *src_owner; 208 + void *pgmap_owner; 209 + unsigned long flags; 212 210 }; 213 211 214 212 int migrate_vma_setup(struct migrate_vma *args);
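
As a usage note, a hedged sketch of how a caller now fills in the
renamed and new fields when migrating a device private page back to
system memory; vma, addr, src_pfn, dst_pfn and my_owner are assumed
driver state, mirroring the callers updated elsewhere in this series:

struct migrate_vma args = {
	.vma		= vma,
	.start		= addr,
	.end		= addr + PAGE_SIZE,
	.src		= &src_pfn,
	.dst		= &dst_pfn,
	/* Same pointer the driver stored in page->pgmap->owner. */
	.pgmap_owner	= my_owner,
	.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
};

if (migrate_vma_setup(&args))
	return VM_FAULT_SIGBUS;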
+6
include/linux/mmu_notifier.h
··· 38 38 * 39 39 * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal 40 40 * that the mm refcount is zero and the range is no longer accessible. 41 + * 42 + * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal 43 + * a device driver to possibly ignore the invalidation if the 44 + * migrate_pgmap_owner field matches the driver's device private pgmap owner. 41 45 */ 42 46 enum mmu_notifier_event { 43 47 MMU_NOTIFY_UNMAP = 0, ··· 50 46 MMU_NOTIFY_PROTECTION_PAGE, 51 47 MMU_NOTIFY_SOFT_DIRTY, 52 48 MMU_NOTIFY_RELEASE, 49 + MMU_NOTIFY_MIGRATE, 53 50 }; 54 51 55 52 #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) ··· 269 264 unsigned long end; 270 265 unsigned flags; 271 266 enum mmu_notifier_event event; 267 + void *migrate_pgmap_owner; 272 268 }; 273 269 274 270 static inline int mm_has_notifiers(struct mm_struct *mm)
+24 -23
lib/test_hmm.c
··· 214 214 { 215 215 struct dmirror *dmirror = container_of(mni, struct dmirror, notifier); 216 216 217 + /* 218 + * Ignore invalidation callbacks for device private pages since 219 + * the invalidation is handled as part of the migration process. 220 + */ 221 + if (range->event == MMU_NOTIFY_MIGRATE && 222 + range->migrate_pgmap_owner == dmirror->mdevice) 223 + return true; 224 + 217 225 if (mmu_notifier_range_blockable(range)) 218 226 mutex_lock(&dmirror->mutex); 219 227 else if (!mutex_trylock(&dmirror->mutex)) ··· 593 585 */ 594 586 spage = migrate_pfn_to_page(*src); 595 587 596 - /* 597 - * Don't migrate device private pages from our own driver or 598 - * others. For our own we would do a device private memory copy 599 - * not a migration and for others, we would need to fault the 600 - * other device's page into system memory first. 601 - */ 602 - if (spage && is_zone_device_page(spage)) 603 - continue; 604 - 605 588 dpage = dmirror_devmem_alloc_page(mdevice); 606 589 if (!dpage) 607 590 continue; ··· 701 702 args.dst = dst_pfns; 702 703 args.start = addr; 703 704 args.end = next; 704 - args.src_owner = NULL; 705 + args.pgmap_owner = dmirror->mdevice; 706 + args.flags = MIGRATE_VMA_SELECT_SYSTEM; 705 707 ret = migrate_vma_setup(&args); 706 708 if (ret) 707 709 goto out; ··· 766 766 *perm |= HMM_DMIRROR_PROT_WRITE; 767 767 else 768 768 *perm |= HMM_DMIRROR_PROT_READ; 769 + if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT) 770 + *perm |= HMM_DMIRROR_PROT_PMD; 771 + else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT) 772 + *perm |= HMM_DMIRROR_PROT_PUD; 769 773 } 770 774 771 775 static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni, ··· 991 987 } 992 988 993 989 static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, 994 - struct dmirror_device *mdevice) 990 + struct dmirror *dmirror) 995 991 { 996 992 const unsigned long *src = args->src; 997 993 unsigned long *dst = args->dst; ··· 1013 1009 continue; 1014 1010 1015 1011 lock_page(dpage); 1012 + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); 1016 1013 copy_highpage(dpage, spage); 1017 1014 *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; 1018 1015 if (*src & MIGRATE_PFN_WRITE) 1019 1016 *dst |= MIGRATE_PFN_WRITE; 1020 1017 } 1021 1018 return 0; 1022 - } 1023 - 1024 - static void dmirror_devmem_fault_finalize_and_map(struct migrate_vma *args, 1025 - struct dmirror *dmirror) 1026 - { 1027 - /* Invalidate the device's page table mapping. */ 1028 - mutex_lock(&dmirror->mutex); 1029 - dmirror_do_update(dmirror, args->start, args->end); 1030 - mutex_unlock(&dmirror->mutex); 1031 1019 } 1032 1020 1033 1021 static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) ··· 1045 1049 args.end = args.start + PAGE_SIZE; 1046 1050 args.src = &src_pfns; 1047 1051 args.dst = &dst_pfns; 1048 - args.src_owner = dmirror->mdevice; 1052 + args.pgmap_owner = dmirror->mdevice; 1053 + args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; 1049 1054 1050 1055 if (migrate_vma_setup(&args)) 1051 1056 return VM_FAULT_SIGBUS; 1052 1057 1053 - ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror->mdevice); 1058 + ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror); 1054 1059 if (ret) 1055 1060 return ret; 1056 1061 migrate_vma_pages(&args); 1057 - dmirror_devmem_fault_finalize_and_map(&args, dmirror); 1062 + /* 1063 + * No device finalize step is needed since 1064 + * dmirror_devmem_fault_alloc_and_copy() will have already 1065 + * invalidated the device page table. 
1066 + */ 1058 1067 migrate_vma_finalize(&args); 1059 1068 return 0; 1060 1069 }
+4
lib/test_hmm_uapi.h
··· 40 40 * HMM_DMIRROR_PROT_NONE: unpopulated PTE or PTE with no access 41 41 * HMM_DMIRROR_PROT_READ: read-only PTE 42 42 * HMM_DMIRROR_PROT_WRITE: read/write PTE 43 + * HMM_DMIRROR_PROT_PMD: PMD sized page is fully mapped by same permissions 44 + * HMM_DMIRROR_PROT_PUD: PUD sized page is fully mapped by same permissions 43 45 * HMM_DMIRROR_PROT_ZERO: special read-only zero page 44 46 * HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL: Migrated device private page on the 45 47 * device the ioctl() is made ··· 53 51 HMM_DMIRROR_PROT_NONE = 0x00, 54 52 HMM_DMIRROR_PROT_READ = 0x01, 55 53 HMM_DMIRROR_PROT_WRITE = 0x02, 54 + HMM_DMIRROR_PROT_PMD = 0x04, 55 + HMM_DMIRROR_PROT_PUD = 0x08, 56 56 HMM_DMIRROR_PROT_ZERO = 0x10, 57 57 HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20, 58 58 HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
+13 -3
mm/hmm.c
··· 165 165 return hmm_pfns_fill(addr, end, range, 0); 166 166 } 167 167 168 + static inline unsigned long hmm_pfn_flags_order(unsigned long order) 169 + { 170 + return order << HMM_PFN_ORDER_SHIFT; 171 + } 172 + 168 173 static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range, 169 174 pmd_t pmd) 170 175 { 171 176 if (pmd_protnone(pmd)) 172 177 return 0; 173 - return pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; 178 + return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : 179 + HMM_PFN_VALID) | 180 + hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT); 174 181 } 175 182 176 183 #ifdef CONFIG_TRANSPARENT_HUGEPAGE ··· 396 389 { 397 390 if (!pud_present(pud)) 398 391 return 0; 399 - return pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; 392 + return (pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : 393 + HMM_PFN_VALID) | 394 + hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT); 400 395 } 401 396 402 397 static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, ··· 483 474 484 475 i = (start - range->start) >> PAGE_SHIFT; 485 476 pfn_req_flags = range->hmm_pfns[i]; 486 - cpu_flags = pte_to_hmm_pfn_flags(range, entry); 477 + cpu_flags = pte_to_hmm_pfn_flags(range, entry) | 478 + hmm_pfn_flags_order(huge_page_order(hstate_vma(vma))); 487 479 required_fault = 488 480 hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); 489 481 if (required_fault) {
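
As a concrete example, on x86-64 with 4KiB base pages a huge PMD entry
is reported with hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT), i.e. map
order 9 (a 2MiB mapping), and a PUD entry with order 18 (1GiB);
hmm_pfn_to_map_order() on the consumer side recovers exactly these
values.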
+11 -3
mm/migrate.c
··· 2276 2276 goto next; 2277 2277 2278 2278 page = device_private_entry_to_page(entry); 2279 - if (page->pgmap->owner != migrate->src_owner) 2279 + if (!(migrate->flags & 2280 + MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || 2281 + page->pgmap->owner != migrate->pgmap_owner) 2280 2282 goto next; 2281 2283 2282 2284 mpfn = migrate_pfn(page_to_pfn(page)) | ··· 2286 2284 if (is_write_device_private_entry(entry)) 2287 2285 mpfn |= MIGRATE_PFN_WRITE; 2288 2286 } else { 2289 - if (migrate->src_owner) 2287 + if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) 2290 2288 goto next; 2291 2289 pfn = pte_pfn(pte); 2292 2290 if (is_zero_pfn(pfn)) { ··· 2381 2379 { 2382 2380 struct mmu_notifier_range range; 2383 2381 2384 - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, 2382 + /* 2383 + * Note that the pgmap_owner is passed to the mmu notifier callback so 2384 + * that the registered device driver can skip invalidating device 2385 + * private page mappings that won't be migrated. 2386 + */ 2387 + mmu_notifier_range_init(&range, MMU_NOTIFY_MIGRATE, 0, migrate->vma, 2385 2388 migrate->vma->vm_mm, migrate->start, migrate->end); 2389 + range.migrate_pgmap_owner = migrate->pgmap_owner; 2386 2390 mmu_notifier_invalidate_range_start(&range); 2387 2391 2388 2392 walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
+90 -4
tools/testing/selftests/vm/hmm-tests.c
··· 881 881 } 882 882 883 883 /* 884 - * Migrate anonymous memory to device private memory and fault it back to system 885 - * memory. 884 + * Migrate anonymous memory to device private memory and fault some of it back 885 + * to system memory, then try migrating the resulting mix of system and device 886 + * private memory to the device. 886 887 */ 887 888 TEST_F(hmm, migrate_fault) 888 889 { ··· 925 924 for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) 926 925 ASSERT_EQ(ptr[i], i); 927 926 928 - /* Fault pages back to system memory and check them. */ 929 - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) 927 + /* Fault half the pages back to system memory and check them. */ 928 + for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) 929 + ASSERT_EQ(ptr[i], i); 930 + 931 + /* Migrate memory to the device again. */ 932 + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); 933 + ASSERT_EQ(ret, 0); 934 + ASSERT_EQ(buffer->cpages, npages); 935 + 936 + /* Check what the device read. */ 937 + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) 930 938 ASSERT_EQ(ptr[i], i); 931 939 932 940 hmm_buffer_free(buffer); ··· 1298 1288 HMM_DMIRROR_PROT_WRITE); 1299 1289 ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); 1300 1290 1291 + hmm_buffer_free(buffer); 1292 + } 1293 + 1294 + /* 1295 + * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that 1296 + * should be mapped by a large page table entry. 1297 + */ 1298 + TEST_F(hmm, compound) 1299 + { 1300 + struct hmm_buffer *buffer; 1301 + unsigned long npages; 1302 + unsigned long size; 1303 + int *ptr; 1304 + unsigned char *m; 1305 + int ret; 1306 + long pagesizes[4]; 1307 + int n, idx; 1308 + unsigned long i; 1309 + 1310 + /* Skip test if we can't allocate a hugetlbfs page. */ 1311 + 1312 + n = gethugepagesizes(pagesizes, 4); 1313 + if (n <= 0) 1314 + return; 1315 + for (idx = 0; --n > 0; ) { 1316 + if (pagesizes[n] < pagesizes[idx]) 1317 + idx = n; 1318 + } 1319 + size = ALIGN(TWOMEG, pagesizes[idx]); 1320 + npages = size >> self->page_shift; 1321 + 1322 + buffer = malloc(sizeof(*buffer)); 1323 + ASSERT_NE(buffer, NULL); 1324 + 1325 + buffer->ptr = get_hugepage_region(size, GHR_STRICT); 1326 + if (buffer->ptr == NULL) { 1327 + free(buffer); 1328 + return; 1329 + } 1330 + 1331 + buffer->size = size; 1332 + buffer->mirror = malloc(npages); 1333 + ASSERT_NE(buffer->mirror, NULL); 1334 + 1335 + /* Initialize the pages the device will snapshot in buffer->ptr. */ 1336 + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) 1337 + ptr[i] = i; 1338 + 1339 + /* Simulate a device snapshotting CPU pagetables. */ 1340 + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); 1341 + ASSERT_EQ(ret, 0); 1342 + ASSERT_EQ(buffer->cpages, npages); 1343 + 1344 + /* Check what the device saw. */ 1345 + m = buffer->mirror; 1346 + for (i = 0; i < npages; ++i) 1347 + ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE | 1348 + HMM_DMIRROR_PROT_PMD); 1349 + 1350 + /* Make the region read-only. */ 1351 + ret = mprotect(buffer->ptr, size, PROT_READ); 1352 + ASSERT_EQ(ret, 0); 1353 + 1354 + /* Simulate a device snapshotting CPU pagetables. */ 1355 + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); 1356 + ASSERT_EQ(ret, 0); 1357 + ASSERT_EQ(buffer->cpages, npages); 1358 + 1359 + /* Check what the device saw. 
*/ 1360 + m = buffer->mirror; 1361 + for (i = 0; i < npages; ++i) 1362 + ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ | 1363 + HMM_DMIRROR_PROT_PMD); 1364 + 1365 + free_hugepage_region(buffer->ptr); 1366 + buffer->ptr = NULL; 1301 1367 hmm_buffer_free(buffer); 1302 1368 } 1303 1369