Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfio-v6.18-rc1-pt2' of https://github.com/awilliam/linux-vfio

Pull more VFIO updates from Alex Williamson:

- Optimizations for DMA map and unmap operations through the type1 vfio
IOMMU backend.

This uses various means of batching and hints from the mm structures
to improve efficiency and therefore performance, resulting in a
significant speedup for huge page use cases (Li Zhe)

- Expose supported device migration features through debugfs (Cédric Le
Goater)

* tag 'vfio-v6.18-rc1-pt2' of https://github.com/awilliam/linux-vfio:
vfio: Dump migration features under debugfs
vfio/type1: optimize vfio_unpin_pages_remote()
vfio/type1: introduce a new member has_rsvd for struct vfio_dma
vfio/type1: batch vfio_find_vpfn() in function vfio_unpin_pages_remote()
vfio/type1: optimize vfio_pin_pages_remote()
mm: introduce num_pages_contiguous()

+158 -22
+6
Documentation/ABI/testing/debugfs-vfio
··· 23 23 Description: Read the live migration status of the vfio device. 24 24 The contents of the state file reflects the migration state 25 25 relative to those defined in the vfio_device_mig_state enum 26 + 27 + What: /sys/kernel/debug/vfio/<device>/migration/features 28 + Date: Oct 2025 29 + KernelVersion: 6.18 30 + Contact: Cédric Le Goater <clg@redhat.com> 31 + Description: Read the migration features of the vfio device.
+19
drivers/vfio/debugfs.c
··· 58 58 return 0; 59 59 } 60 60 61 + static int vfio_device_features_read(struct seq_file *seq, void *data) 62 + { 63 + struct device *vf_dev = seq->private; 64 + struct vfio_device *vdev = container_of(vf_dev, struct vfio_device, device); 65 + 66 + if (vdev->migration_flags & VFIO_MIGRATION_STOP_COPY) 67 + seq_puts(seq, "stop-copy\n"); 68 + if (vdev->migration_flags & VFIO_MIGRATION_P2P) 69 + seq_puts(seq, "p2p\n"); 70 + if (vdev->migration_flags & VFIO_MIGRATION_PRE_COPY) 71 + seq_puts(seq, "pre-copy\n"); 72 + if (vdev->log_ops) 73 + seq_puts(seq, "dirty-tracking\n"); 74 + 75 + return 0; 76 + } 77 + 61 78 void vfio_device_debugfs_init(struct vfio_device *vdev) 62 79 { 63 80 struct device *dev = &vdev->device; ··· 89 72 vdev->debug_root); 90 73 debugfs_create_devm_seqfile(dev, "state", vfio_dev_migration, 91 74 vfio_device_state_read); 75 + debugfs_create_devm_seqfile(dev, "features", vfio_dev_migration, 76 + vfio_device_features_read); 92 77 } 93 78 } 94 79
+91 -21
drivers/vfio/vfio_iommu_type1.c
··· 37 37 #include <linux/vfio.h> 38 38 #include <linux/workqueue.h> 39 39 #include <linux/notifier.h> 40 + #include <linux/mm_inline.h> 40 41 #include "vfio.h" 41 42 42 43 #define DRIVER_VERSION "0.2" ··· 93 92 bool iommu_mapped; 94 93 bool lock_cap; /* capable(CAP_IPC_LOCK) */ 95 94 bool vaddr_invalid; 95 + bool has_rsvd; /* has 1 or more rsvd pfns */ 96 96 struct task_struct *task; 97 97 struct rb_root pfn_list; /* Ex-user pinned pfn list */ 98 98 unsigned long *bitmap; ··· 320 318 /* 321 319 * Helper Functions for host iova-pfn list 322 320 */ 323 - static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova) 321 + 322 + /* 323 + * Find the highest vfio_pfn that overlapping the range 324 + * [iova_start, iova_end) in rb tree. 325 + */ 326 + static struct vfio_pfn *vfio_find_vpfn_range(struct vfio_dma *dma, 327 + dma_addr_t iova_start, dma_addr_t iova_end) 324 328 { 325 329 struct vfio_pfn *vpfn; 326 330 struct rb_node *node = dma->pfn_list.rb_node; ··· 334 326 while (node) { 335 327 vpfn = rb_entry(node, struct vfio_pfn, node); 336 328 337 - if (iova < vpfn->iova) 329 + if (iova_end <= vpfn->iova) 338 330 node = node->rb_left; 339 - else if (iova > vpfn->iova) 331 + else if (iova_start > vpfn->iova) 340 332 node = node->rb_right; 341 333 else 342 334 return vpfn; 343 335 } 344 336 return NULL; 337 + } 338 + 339 + static inline struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova) 340 + { 341 + return vfio_find_vpfn_range(dma, iova, iova + 1); 345 342 } 346 343 347 344 static void vfio_link_pfn(struct vfio_dma *dma, ··· 627 614 return ret; 628 615 } 629 616 617 + 618 + static long vpfn_pages(struct vfio_dma *dma, 619 + dma_addr_t iova_start, long nr_pages) 620 + { 621 + dma_addr_t iova_end = iova_start + (nr_pages << PAGE_SHIFT); 622 + struct vfio_pfn *top = vfio_find_vpfn_range(dma, iova_start, iova_end); 623 + long ret = 1; 624 + struct vfio_pfn *vpfn; 625 + struct rb_node *prev; 626 + struct rb_node *next; 627 + 628 + if 
(likely(!top)) 629 + return 0; 630 + 631 + prev = next = &top->node; 632 + 633 + while ((prev = rb_prev(prev))) { 634 + vpfn = rb_entry(prev, struct vfio_pfn, node); 635 + if (vpfn->iova < iova_start) 636 + break; 637 + ret++; 638 + } 639 + 640 + while ((next = rb_next(next))) { 641 + vpfn = rb_entry(next, struct vfio_pfn, node); 642 + if (vpfn->iova >= iova_end) 643 + break; 644 + ret++; 645 + } 646 + 647 + return ret; 648 + } 649 + 630 650 /* 631 651 * Attempt to pin pages. We really don't want to track all the pfns and 632 652 * the iommu can only map chunks of consecutive pfns anyway, so get the ··· 733 687 * and rsvd here, and therefore continues to use the batch. 734 688 */ 735 689 while (true) { 690 + long nr_pages, acct_pages = 0; 691 + 736 692 if (pfn != *pfn_base + pinned || 737 693 rsvd != is_invalid_reserved_pfn(pfn)) 738 694 goto out; 695 + 696 + /* 697 + * Using GUP with the FOLL_LONGTERM in 698 + * vaddr_get_pfns() will not return invalid 699 + * or reserved pages. 700 + */ 701 + nr_pages = num_pages_contiguous( 702 + &batch->pages[batch->offset], 703 + batch->size); 704 + if (!rsvd) { 705 + acct_pages = nr_pages; 706 + acct_pages -= vpfn_pages(dma, iova, nr_pages); 707 + } 739 708 740 709 /* 741 710 * Reserved pages aren't counted against the user, 742 711 * externally pinned pages are already counted against 743 712 * the user. 
744 713 */ 745 - if (!rsvd && !vfio_find_vpfn(dma, iova)) { 714 + if (acct_pages) { 746 715 if (!dma->lock_cap && 747 - mm->locked_vm + lock_acct + 1 > limit) { 716 + mm->locked_vm + lock_acct + acct_pages > limit) { 748 717 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", 749 718 __func__, limit << PAGE_SHIFT); 750 719 ret = -ENOMEM; 751 720 goto unpin_out; 752 721 } 753 - lock_acct++; 722 + lock_acct += acct_pages; 754 723 } 755 724 756 - pinned++; 757 - npage--; 758 - vaddr += PAGE_SIZE; 759 - iova += PAGE_SIZE; 760 - batch->offset++; 761 - batch->size--; 725 + pinned += nr_pages; 726 + npage -= nr_pages; 727 + vaddr += PAGE_SIZE * nr_pages; 728 + iova += PAGE_SIZE * nr_pages; 729 + batch->offset += nr_pages; 730 + batch->size -= nr_pages; 762 731 763 732 if (!batch->size) 764 733 break; ··· 783 722 } 784 723 785 724 out: 725 + dma->has_rsvd |= rsvd; 786 726 ret = vfio_lock_acct(dma, lock_acct, false); 787 727 788 728 unpin_out: ··· 800 738 return pinned; 801 739 } 802 740 741 + static inline void put_valid_unreserved_pfns(unsigned long start_pfn, 742 + unsigned long npage, int prot) 743 + { 744 + unpin_user_page_range_dirty_lock(pfn_to_page(start_pfn), npage, 745 + prot & IOMMU_WRITE); 746 + } 747 + 803 748 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, 804 749 unsigned long pfn, unsigned long npage, 805 750 bool do_accounting) 806 751 { 807 - long unlocked = 0, locked = 0; 808 - long i; 752 + long unlocked = 0, locked = vpfn_pages(dma, iova, npage); 809 753 810 - for (i = 0; i < npage; i++, iova += PAGE_SIZE) { 811 - if (put_pfn(pfn++, dma->prot)) { 812 - unlocked++; 813 - if (vfio_find_vpfn(dma, iova)) 814 - locked++; 815 - } 754 + if (dma->has_rsvd) { 755 + unsigned long i; 756 + 757 + for (i = 0; i < npage; i++) 758 + if (put_pfn(pfn++, dma->prot)) 759 + unlocked++; 760 + } else { 761 + put_valid_unreserved_pfns(pfn, npage, dma->prot); 762 + unlocked = npage; 816 763 } 817 - 818 764 if (do_accounting) 819 765 vfio_lock_acct(dma, 
locked - unlocked, true); 820 766
+6 -1
include/linux/mm.h
··· 1833 1833 { 1834 1834 return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; 1835 1835 } 1836 - #endif 1836 + #else /* !SECTION_IN_PAGE_FLAGS */ 1837 + static inline unsigned long memdesc_section(memdesc_flags_t mdf) 1838 + { 1839 + return 0; 1840 + } 1841 + #endif /* SECTION_IN_PAGE_FLAGS */ 1837 1842 1838 1843 /** 1839 1844 * folio_pfn - Return the Page Frame Number of a folio.
+36
include/linux/mm_inline.h
··· 617 617 return true; 618 618 } 619 619 620 + /** 621 + * num_pages_contiguous() - determine the number of contiguous pages 622 + * that represent contiguous PFNs 623 + * @pages: an array of page pointers 624 + * @nr_pages: length of the array, at least 1 625 + * 626 + * Determine the number of contiguous pages that represent contiguous PFNs 627 + * in @pages, starting from the first page. 628 + * 629 + * In some kernel configs contiguous PFNs will not have contiguous struct 630 + * pages. In these configurations num_pages_contiguous() will return a num 631 + * smaller than ideal number. The caller should continue to check for pfn 632 + * contiguity after each call to num_pages_contiguous(). 633 + * 634 + * Returns the number of contiguous pages. 635 + */ 636 + static inline size_t num_pages_contiguous(struct page **pages, size_t nr_pages) 637 + { 638 + struct page *cur_page = pages[0]; 639 + unsigned long section = memdesc_section(cur_page->flags); 640 + size_t i; 641 + 642 + for (i = 1; i < nr_pages; i++) { 643 + if (++cur_page != pages[i]) 644 + break; 645 + /* 646 + * In unproblematic kernel configs, page_to_section() == 0 and 647 + * the whole check will get optimized out. 648 + */ 649 + if (memdesc_section(cur_page->flags) != section) 650 + break; 651 + } 652 + 653 + return i; 654 + } 655 + 620 656 #endif