Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm/rmap: convert make_device_exclusive_range() to make_device_exclusive()

The single "real" in-tree user of make_device_exclusive_range() always
requests making only a single address exclusive. The current
implementation is hard to extend to properly support anonymous THP /
large folios, and to avoid messing with rmap walks in weird ways.

So let's always process a single address/page and return folio + page to
minimize page -> folio lookups. This is a preparation for further
changes.

Reject any non-anonymous or hugetlb folios early, directly after GUP.

While at it, extend the documentation of make_device_exclusive() to
clarify some things.

Link: https://lkml.kernel.org/r/20250210193801.781278-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Simona Vetter <simona.vetter@ffwll.ch>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Tested-by: Alistair Popple <apopple@nvidia.com>
Cc: Alex Shi <alexs@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dave Airlie <airlied@gmail.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Karol Herbst <kherbst@redhat.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Lyude <lyude@redhat.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yanteng Si <si.yanteng@linux.dev>
Cc: Barry Song <v-songbaohua@oppo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Authored by David Hildenbrand; committed by Andrew Morton.
599b684a bc3fe680

+83 -77
+1 -1
Documentation/mm/hmm.rst
··· 400 400 Some devices have features such as atomic PTE bits that can be used to implement 401 401 atomic access to system memory. To support atomic operations to a shared virtual 402 402 memory page such a device needs access to that page which is exclusive of any 403 - userspace access from the CPU. The ``make_device_exclusive_range()`` function 403 + userspace access from the CPU. The ``make_device_exclusive()`` function 404 404 can be used to make a memory range inaccessible from userspace. 405 405 406 406 This replaces all mappings for pages in the given range with special swap
+1 -1
Documentation/translations/zh_CN/mm/hmm.rst
··· 326 326 327 327 一些设备具有诸如原子PTE位的功能,可以用来实现对系统内存的原子访问。为了支持对一 328 328 个共享的虚拟内存页的原子操作,这样的设备需要对该页的访问是排他的,而不是来自CPU 329 - 的任何用户空间访问。 ``make_device_exclusive_range()`` 函数可以用来使一 329 + 的任何用户空间访问。 ``make_device_exclusive()`` 函数可以用来使一 330 330 个内存范围不能从用户空间访问。 331 331 332 332 这将用特殊的交换条目替换给定范围内的所有页的映射。任何试图访问交换条目的行为都会
+2 -3
drivers/gpu/drm/nouveau/nouveau_svm.c
··· 610 610 611 611 notifier_seq = mmu_interval_read_begin(&notifier->notifier); 612 612 mmap_read_lock(mm); 613 - ret = make_device_exclusive_range(mm, start, start + PAGE_SIZE, 614 - &page, drm->dev); 613 + page = make_device_exclusive(mm, start, drm->dev, &folio); 615 614 mmap_read_unlock(mm); 616 - if (ret <= 0 || !page) { 615 + if (IS_ERR(page)) { 617 616 ret = -EINVAL; 618 617 goto out; 619 618 }
+1 -1
include/linux/mmu_notifier.h
··· 46 46 * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no 47 47 * longer have exclusive access to the page. When sent during creation of an 48 48 * exclusive range the owner will be initialised to the value provided by the 49 - * caller of make_device_exclusive_range(), otherwise the owner will be NULL. 49 + * caller of make_device_exclusive(), otherwise the owner will be NULL. 50 50 */ 51 51 enum mmu_notifier_event { 52 52 MMU_NOTIFY_UNMAP = 0,
+2 -3
include/linux/rmap.h
··· 663 663 void try_to_migrate(struct folio *folio, enum ttu_flags flags); 664 664 void try_to_unmap(struct folio *, enum ttu_flags flags); 665 665 666 - int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, 667 - unsigned long end, struct page **pages, 668 - void *arg); 666 + struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, 667 + void *owner, struct folio **foliop); 669 668 670 669 /* Avoid racy checks */ 671 670 #define PVMW_SYNC (1 << 0)
+15 -26
lib/test_hmm.c
··· 780 780 unsigned long start, end, addr; 781 781 unsigned long size = cmd->npages << PAGE_SHIFT; 782 782 struct mm_struct *mm = dmirror->notifier.mm; 783 - struct page *pages[64]; 784 783 struct dmirror_bounce bounce; 785 - unsigned long next; 786 - int ret; 784 + int ret = 0; 787 785 788 786 start = cmd->addr; 789 787 end = start + size; ··· 793 795 return -EINVAL; 794 796 795 797 mmap_read_lock(mm); 796 - for (addr = start; addr < end; addr = next) { 797 - unsigned long mapped = 0; 798 - int i; 798 + for (addr = start; !ret && addr < end; addr += PAGE_SIZE) { 799 + struct folio *folio; 800 + struct page *page; 799 801 800 - next = min(end, addr + (ARRAY_SIZE(pages) << PAGE_SHIFT)); 801 - 802 - ret = make_device_exclusive_range(mm, addr, next, pages, NULL); 803 - /* 804 - * Do dmirror_atomic_map() iff all pages are marked for 805 - * exclusive access to avoid accessing uninitialized 806 - * fields of pages. 807 - */ 808 - if (ret == (next - addr) >> PAGE_SHIFT) 809 - mapped = dmirror_atomic_map(addr, next, pages, dmirror); 810 - for (i = 0; i < ret; i++) { 811 - if (pages[i]) { 812 - unlock_page(pages[i]); 813 - put_page(pages[i]); 814 - } 802 + page = make_device_exclusive(mm, addr, NULL, &folio); 803 + if (IS_ERR(page)) { 804 + ret = PTR_ERR(page); 805 + break; 815 806 } 816 807 817 - if (addr + (mapped << PAGE_SHIFT) < next) { 818 - mmap_read_unlock(mm); 819 - mmput(mm); 820 - return -EBUSY; 821 - } 808 + ret = dmirror_atomic_map(addr, addr + PAGE_SIZE, &page, dmirror); 809 + ret = ret == 1 ? 0 : -EBUSY; 810 + folio_unlock(folio); 811 + folio_put(folio); 822 812 } 823 813 mmap_read_unlock(mm); 824 814 mmput(mm); 815 + 816 + if (ret) 817 + return ret; 825 818 826 819 /* Return the migrated data for verification. */ 827 820 ret = dmirror_bounce_init(&bounce, start, size);
+61 -42
mm/rmap.c
··· 2495 2495 .arg = &args, 2496 2496 }; 2497 2497 2498 - /* 2499 - * Restrict to anonymous folios for now to avoid potential writeback 2500 - * issues. 2501 - */ 2502 - if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) 2503 - return false; 2504 - 2505 2498 rmap_walk(folio, &rwc); 2506 2499 2507 2500 return args.valid && !folio_mapcount(folio); 2508 2501 } 2509 2502 2510 2503 /** 2511 - * make_device_exclusive_range() - Mark a range for exclusive use by a device 2504 + * make_device_exclusive() - Mark a page for exclusive use by a device 2512 2505 * @mm: mm_struct of associated target process 2513 - * @start: start of the region to mark for exclusive device access 2514 - * @end: end address of region 2515 - * @pages: returns the pages which were successfully marked for exclusive access 2506 + * @addr: the virtual address to mark for exclusive device access 2516 2507 * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering 2508 + * @foliop: folio pointer will be stored here on success. 2517 2509 * 2518 - * Returns: number of pages found in the range by GUP. A page is marked for 2519 - * exclusive access only if the page pointer is non-NULL. 2510 + * This function looks up the page mapped at the given address, grabs a 2511 + * folio reference, locks the folio and replaces the PTE with special 2512 + * device-exclusive PFN swap entry, preventing access through the process 2513 + * page tables. The function will return with the folio locked and referenced. 2520 2514 * 2521 - * This function finds ptes mapping page(s) to the given address range, locks 2522 - * them and replaces mappings with special swap entries preventing userspace CPU 2523 - * access. On fault these entries are replaced with the original mapping after 2524 - * calling MMU notifiers. 2515 + * On fault, the device-exclusive entries are replaced with the original PTE 2516 + * under folio lock, after calling MMU notifiers. 
2517 + * 2518 + * Only anonymous non-hugetlb folios are supported and the VMA must have 2519 + * write permissions such that we can fault in the anonymous page writable 2520 + * in order to mark it exclusive. The caller must hold the mmap_lock in read 2521 + * mode. 2525 2522 * 2526 2523 * A driver using this to program access from a device must use a mmu notifier 2527 2524 * critical section to hold a device specific lock during programming. Once 2528 - * programming is complete it should drop the page lock and reference after 2525 + * programming is complete it should drop the folio lock and reference after 2529 2526 * which point CPU access to the page will revoke the exclusive access. 2527 + * 2528 + * Notes: 2529 + * #. This function always operates on individual PTEs mapping individual 2530 + * pages. PMD-sized THPs are first remapped to be mapped by PTEs before 2531 + * the conversion happens on a single PTE corresponding to @addr. 2532 + * #. While concurrent access through the process page tables is prevented, 2533 + * concurrent access through other page references (e.g., earlier GUP 2534 + * invocation) is not handled and not supported. 2535 + * #. device-exclusive entries are considered "clean" and "old" by core-mm. 2536 + * Device drivers must update the folio state when informed by MMU 2537 + * notifiers. 2538 + * 2539 + * Returns: pointer to mapped page on success, otherwise a negative error. 
2530 2540 */ 2531 - int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, 2532 - unsigned long end, struct page **pages, 2533 - void *owner) 2541 + struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, 2542 + void *owner, struct folio **foliop) 2534 2543 { 2535 - long npages = (end - start) >> PAGE_SHIFT; 2536 - long i; 2544 + struct folio *folio; 2545 + struct page *page; 2546 + long npages; 2537 2547 2538 - npages = get_user_pages_remote(mm, start, npages, 2548 + mmap_assert_locked(mm); 2549 + 2550 + /* 2551 + * Fault in the page writable and try to lock it; note that if the 2552 + * address would already be marked for exclusive use by a device, 2553 + * the GUP call would undo that first by triggering a fault. 2554 + */ 2555 + npages = get_user_pages_remote(mm, addr, 1, 2539 2556 FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, 2540 - pages, NULL); 2541 - if (npages < 0) 2542 - return npages; 2557 + &page, NULL); 2558 + if (npages != 1) 2559 + return ERR_PTR(npages); 2560 + folio = page_folio(page); 2543 2561 2544 - for (i = 0; i < npages; i++, start += PAGE_SIZE) { 2545 - struct folio *folio = page_folio(pages[i]); 2546 - if (PageTail(pages[i]) || !folio_trylock(folio)) { 2547 - folio_put(folio); 2548 - pages[i] = NULL; 2549 - continue; 2550 - } 2551 - 2552 - if (!folio_make_device_exclusive(folio, mm, start, owner)) { 2553 - folio_unlock(folio); 2554 - folio_put(folio); 2555 - pages[i] = NULL; 2556 - } 2562 + if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) { 2563 + folio_put(folio); 2564 + return ERR_PTR(-EOPNOTSUPP); 2557 2565 } 2558 2566 2559 - return npages; 2567 + if (!folio_trylock(folio)) { 2568 + folio_put(folio); 2569 + return ERR_PTR(-EBUSY); 2570 + } 2571 + 2572 + if (!folio_make_device_exclusive(folio, mm, addr, owner)) { 2573 + folio_unlock(folio); 2574 + folio_put(folio); 2575 + return ERR_PTR(-EBUSY); 2576 + } 2577 + *foliop = folio; 2578 + return page; 2560 2579 } 2561 - 
EXPORT_SYMBOL_GPL(make_device_exclusive_range); 2580 + EXPORT_SYMBOL_GPL(make_device_exclusive); 2562 2581 #endif 2563 2582 2564 2583 void __put_anon_vma(struct anon_vma *anon_vma)