Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfio-v6.15-rc1' of https://github.com/awilliam/linux-vfio

Pull VFIO updates from Alex Williamson:

- Relax IGD support code to match display-class devices rather than
specifically requiring a VGA device (Tomita Moeko)

- Accelerate DMA mapping of device MMIO by iterating at PMD and PUD
levels to take advantage of huge pfnmap support added in v6.12
(Alex Williamson)

- Extend virtio vfio-pci variant driver to include migration support
for block devices where enabled by the PF (Yishai Hadas)

- Virtualize INTx PIN register for devices where the platform does not
route legacy PCI interrupts for the device and the interrupt is
reported as IRQ_NOTCONNECTED (Alex Williamson)

* tag 'vfio-v6.15-rc1' of https://github.com/awilliam/linux-vfio:
vfio/pci: Handle INTx IRQ_NOTCONNECTED
vfio/virtio: Enable support for virtio-block live migration
vfio/type1: Use mapping page mask for pfnmaps
mm: Provide address mask in struct follow_pfnmap_args
vfio/type1: Use consistent types for page counts
vfio/type1: Use vfio_batch for vaddr_get_pfns()
vfio/type1: Convert all vaddr_get_pfns() callers to use vfio_batch
vfio/type1: Catch zero from pin_user_pages_remote()
vfio/pci: match IGD devices in display controller class

+106 -66
+1 -3
drivers/vfio/pci/vfio_pci.c
··· 111 111 if (ret) 112 112 return ret; 113 113 114 - if (vfio_pci_is_vga(pdev) && 115 - pdev->vendor == PCI_VENDOR_ID_INTEL && 116 - IS_ENABLED(CONFIG_VFIO_PCI_IGD)) { 114 + if (vfio_pci_is_intel_display(pdev)) { 117 115 ret = vfio_pci_igd_init(vdev); 118 116 if (ret && ret != -ENODEV) { 119 117 pci_warn(pdev, "Failed to setup Intel IGD regions\n");
+2 -1
drivers/vfio/pci/vfio_pci_config.c
··· 1814 1814 cpu_to_le16(PCI_COMMAND_MEMORY); 1815 1815 } 1816 1816 1817 - if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx) 1817 + if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx || 1818 + vdev->pdev->irq == IRQ_NOTCONNECTED) 1818 1819 vconfig[PCI_INTERRUPT_PIN] = 0; 1819 1820 1820 1821 ret = vfio_cap_init(vdev);
+1 -9
drivers/vfio/pci/vfio_pci_core.c
··· 727 727 static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type) 728 728 { 729 729 if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { 730 - u8 pin; 731 - 732 - if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || 733 - vdev->nointx || vdev->pdev->is_virtfn) 734 - return 0; 735 - 736 - pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); 737 - 738 - return pin ? 1 : 0; 730 + return vdev->vconfig[PCI_INTERRUPT_PIN] ? 1 : 0; 739 731 } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { 740 732 u8 pos; 741 733 u16 flags;
+6
drivers/vfio/pci/vfio_pci_igd.c
··· 435 435 return 0; 436 436 } 437 437 438 + bool vfio_pci_is_intel_display(struct pci_dev *pdev) 439 + { 440 + return (pdev->vendor == PCI_VENDOR_ID_INTEL) && 441 + ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY); 442 + } 443 + 438 444 int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) 439 445 { 440 446 int ret;
+1 -1
drivers/vfio/pci/vfio_pci_intrs.c
··· 259 259 if (!is_irq_none(vdev)) 260 260 return -EINVAL; 261 261 262 - if (!pdev->irq) 262 + if (!pdev->irq || pdev->irq == IRQ_NOTCONNECTED) 263 263 return -ENODEV; 264 264 265 265 name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev));
+6
drivers/vfio/pci/vfio_pci_priv.h
··· 67 67 u16 cmd); 68 68 69 69 #ifdef CONFIG_VFIO_PCI_IGD 70 + bool vfio_pci_is_intel_display(struct pci_dev *pdev); 70 71 int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); 71 72 #else 73 + static inline bool vfio_pci_is_intel_display(struct pci_dev *pdev) 74 + { 75 + return false; 76 + } 77 + 72 78 static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) 73 79 { 74 80 return -ENODEV;
+3 -3
drivers/vfio/pci/virtio/Kconfig
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 config VIRTIO_VFIO_PCI 3 - tristate "VFIO support for VIRTIO NET PCI VF devices" 3 + tristate "VFIO support for VIRTIO PCI VF devices" 4 4 depends on VIRTIO_PCI 5 5 select VFIO_PCI_CORE 6 6 help 7 - This provides migration support for VIRTIO NET PCI VF devices 8 - using the VFIO framework. Migration support requires the 7 + This provides migration support for VIRTIO NET and BLOCK PCI VF 8 + devices using the VFIO framework. Migration support requires the 9 9 SR-IOV PF device to support specific VIRTIO extensions, 10 10 otherwise this driver provides no additional functionality 11 11 beyond vfio-pci.
+3 -1
drivers/vfio/pci/virtio/legacy_io.c
··· 382 382 383 383 bool virtiovf_support_legacy_io(struct pci_dev *pdev) 384 384 { 385 - return virtio_pci_admin_has_legacy_io(pdev) && !virtiovf_bar0_exists(pdev); 385 + /* For now, the legacy IO functionality is supported only for virtio-net */ 386 + return pdev->device == 0x1041 && virtio_pci_admin_has_legacy_io(pdev) && 387 + !virtiovf_bar0_exists(pdev); 386 388 } 387 389 388 390 int virtiovf_init_legacy_io(struct virtiovf_pci_core_device *virtvdev)
+3 -2
drivers/vfio/pci/virtio/main.c
··· 187 187 } 188 188 189 189 static const struct pci_device_id virtiovf_pci_table[] = { 190 - /* Only virtio-net is supported/tested so far */ 190 + /* Only virtio-net and virtio-block are supported/tested so far */ 191 191 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1041) }, 192 + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1042) }, 192 193 {} 193 194 }; 194 195 ··· 222 221 MODULE_LICENSE("GPL"); 223 222 MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>"); 224 223 MODULE_DESCRIPTION( 225 - "VIRTIO VFIO PCI - User Level meta-driver for VIRTIO NET devices"); 224 + "VIRTIO VFIO PCI - User Level meta-driver for VIRTIO NET and BLOCK devices");
+77 -46
drivers/vfio/vfio_iommu_type1.c
··· 103 103 struct vfio_batch { 104 104 struct page **pages; /* for pin_user_pages_remote */ 105 105 struct page *fallback_page; /* if pages alloc fails */ 106 - int capacity; /* length of pages array */ 107 - int size; /* of batch currently */ 108 - int offset; /* of next entry in pages */ 106 + unsigned int capacity; /* length of pages array */ 107 + unsigned int size; /* of batch currently */ 108 + unsigned int offset; /* of next entry in pages */ 109 109 }; 110 110 111 111 struct vfio_iommu_group { ··· 471 471 472 472 #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *)) 473 473 474 - static void vfio_batch_init(struct vfio_batch *batch) 474 + static void __vfio_batch_init(struct vfio_batch *batch, bool single) 475 475 { 476 476 batch->size = 0; 477 477 batch->offset = 0; 478 478 479 - if (unlikely(disable_hugepages)) 479 + if (single || unlikely(disable_hugepages)) 480 480 goto fallback; 481 481 482 482 batch->pages = (struct page **) __get_free_page(GFP_KERNEL); ··· 489 489 fallback: 490 490 batch->pages = &batch->fallback_page; 491 491 batch->capacity = 1; 492 + } 493 + 494 + static void vfio_batch_init(struct vfio_batch *batch) 495 + { 496 + __vfio_batch_init(batch, false); 497 + } 498 + 499 + static void vfio_batch_init_single(struct vfio_batch *batch) 500 + { 501 + __vfio_batch_init(batch, true); 492 502 } 493 503 494 504 static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma) ··· 520 510 521 511 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, 522 512 unsigned long vaddr, unsigned long *pfn, 523 - bool write_fault) 513 + unsigned long *addr_mask, bool write_fault) 524 514 { 525 515 struct follow_pfnmap_args args = { .vma = vma, .address = vaddr }; 526 516 int ret; ··· 544 534 return ret; 545 535 } 546 536 547 - if (write_fault && !args.writable) 537 + if (write_fault && !args.writable) { 548 538 ret = -EFAULT; 549 - else 539 + } else { 550 540 *pfn = args.pfn; 541 + *addr_mask = 
args.addr_mask; 542 + } 551 543 552 544 follow_pfnmap_end(&args); 553 545 return ret; ··· 557 545 558 546 /* 559 547 * Returns the positive number of pfns successfully obtained or a negative 560 - * error code. 548 + * error code. The initial pfn is stored in the pfn arg. For page-backed 549 + * pfns, the provided batch is also updated to indicate the filled pages and 550 + * initial offset. For VM_PFNMAP pfns, only the returned number of pfns and 551 + * returned initial pfn are provided; subsequent pfns are contiguous. 561 552 */ 562 - static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, 563 - long npages, int prot, unsigned long *pfn, 564 - struct page **pages) 553 + static long vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, 554 + unsigned long npages, int prot, unsigned long *pfn, 555 + struct vfio_batch *batch) 565 556 { 557 + unsigned long pin_pages = min_t(unsigned long, npages, batch->capacity); 566 558 struct vm_area_struct *vma; 567 559 unsigned int flags = 0; 568 - int ret; 560 + long ret; 569 561 570 562 if (prot & IOMMU_WRITE) 571 563 flags |= FOLL_WRITE; 572 564 573 565 mmap_read_lock(mm); 574 - ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM, 575 - pages, NULL); 566 + ret = pin_user_pages_remote(mm, vaddr, pin_pages, flags | FOLL_LONGTERM, 567 + batch->pages, NULL); 576 568 if (ret > 0) { 577 - *pfn = page_to_pfn(pages[0]); 569 + *pfn = page_to_pfn(batch->pages[0]); 570 + batch->size = ret; 571 + batch->offset = 0; 578 572 goto done; 573 + } else if (!ret) { 574 + ret = -EFAULT; 579 575 } 580 576 581 577 vaddr = untagged_addr_remote(mm, vaddr); ··· 592 572 vma = vma_lookup(mm, vaddr); 593 573 594 574 if (vma && vma->vm_flags & VM_PFNMAP) { 595 - ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE); 575 + unsigned long addr_mask; 576 + 577 + ret = follow_fault_pfn(vma, mm, vaddr, pfn, &addr_mask, 578 + prot & IOMMU_WRITE); 596 579 if (ret == -EAGAIN) 597 580 goto retry; 598 581 599 582 if 
(!ret) { 600 - if (is_invalid_reserved_pfn(*pfn)) 601 - ret = 1; 602 - else 583 + if (is_invalid_reserved_pfn(*pfn)) { 584 + unsigned long epfn; 585 + 586 + epfn = (*pfn | (~addr_mask >> PAGE_SHIFT)) + 1; 587 + ret = min_t(long, npages, epfn - *pfn); 588 + } else { 603 589 ret = -EFAULT; 590 + } 604 591 } 605 592 } 606 593 done: ··· 621 594 * first page and all consecutive pages with the same locking. 622 595 */ 623 596 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, 624 - long npage, unsigned long *pfn_base, 597 + unsigned long npage, unsigned long *pfn_base, 625 598 unsigned long limit, struct vfio_batch *batch) 626 599 { 627 600 unsigned long pfn; ··· 643 616 *pfn_base = 0; 644 617 } 645 618 619 + if (unlikely(disable_hugepages)) 620 + npage = 1; 621 + 646 622 while (npage) { 647 623 if (!batch->size) { 648 624 /* Empty batch, so refill it. */ 649 - long req_pages = min_t(long, npage, batch->capacity); 650 - 651 - ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot, 652 - &pfn, batch->pages); 625 + ret = vaddr_get_pfns(mm, vaddr, npage, dma->prot, 626 + &pfn, batch); 653 627 if (ret < 0) 654 628 goto unpin_out; 655 - 656 - batch->size = ret; 657 - batch->offset = 0; 658 629 659 630 if (!*pfn_base) { 660 631 *pfn_base = pfn; 661 632 rsvd = is_invalid_reserved_pfn(*pfn_base); 662 633 } 634 + 635 + /* Handle pfnmap */ 636 + if (!batch->size) { 637 + if (pfn != *pfn_base + pinned || !rsvd) 638 + goto out; 639 + 640 + pinned += ret; 641 + npage -= ret; 642 + vaddr += (PAGE_SIZE * ret); 643 + iova += (PAGE_SIZE * ret); 644 + continue; 645 + } 663 646 } 664 647 665 648 /* 666 - * pfn is preset for the first iteration of this inner loop and 667 - * updated at the end to handle a VM_PFNMAP pfn. In that case, 668 - * batch->pages isn't valid (there's no struct page), so allow 669 - * batch->pages to be touched only when there's more than one 670 - * pfn to check, which guarantees the pfns are from a 671 - * !VM_PFNMAP vma. 
649 + * pfn is preset for the first iteration of this inner loop 650 + * due to the fact that vaddr_get_pfns() needs to provide the 651 + * initial pfn for pfnmaps. Therefore to reduce redundancy, 652 + * the next pfn is fetched at the end of the loop. 653 + * A PageReserved() page could still qualify as page backed 654 + * and rsvd here, and therefore continues to use the batch. 672 655 */ 673 656 while (true) { 674 657 if (pfn != *pfn_base + pinned || ··· 713 676 714 677 pfn = page_to_pfn(batch->pages[batch->offset]); 715 678 } 716 - 717 - if (unlikely(disable_hugepages)) 718 - break; 719 679 } 720 680 721 681 out: 722 682 ret = vfio_lock_acct(dma, lock_acct, false); 723 683 724 684 unpin_out: 725 - if (batch->size == 1 && !batch->offset) { 726 - /* May be a VM_PFNMAP pfn, which the batch can't remember. */ 727 - put_pfn(pfn, dma->prot); 728 - batch->size = 0; 729 - } 730 - 731 685 if (ret < 0) { 732 686 if (pinned && !rsvd) { 733 687 for (pfn = *pfn_base ; pinned ; pfn++, pinned--) ··· 733 705 } 734 706 735 707 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, 736 - unsigned long pfn, long npage, 708 + unsigned long pfn, unsigned long npage, 737 709 bool do_accounting) 738 710 { 739 711 long unlocked = 0, locked = 0; ··· 756 728 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, 757 729 unsigned long *pfn_base, bool do_accounting) 758 730 { 759 - struct page *pages[1]; 731 + struct vfio_batch batch; 760 732 struct mm_struct *mm; 761 733 int ret; 762 734 ··· 764 736 if (!mmget_not_zero(mm)) 765 737 return -ENODEV; 766 738 767 - ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages); 739 + vfio_batch_init_single(&batch); 740 + 741 + ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, &batch); 768 742 if (ret != 1) 769 743 goto out; 770 744 ··· 785 755 } 786 756 787 757 out: 758 + vfio_batch_fini(&batch); 788 759 mmput(mm); 789 760 return ret; 790 761 }
+2
include/linux/mm.h
··· 2495 2495 * Outputs: 2496 2496 * 2497 2497 * @pfn: the PFN of the address 2498 + * @addr_mask: address mask covering pfn 2498 2499 * @pgprot: the pgprot_t of the mapping 2499 2500 * @writable: whether the mapping is writable 2500 2501 * @special: whether the mapping is a special mapping (real PFN maps) 2501 2502 */ 2502 2503 unsigned long pfn; 2504 + unsigned long addr_mask; 2503 2505 pgprot_t pgprot; 2504 2506 bool writable; 2505 2507 bool special;
+1
mm/memory.c
··· 6670 6670 args->lock = lock; 6671 6671 args->ptep = ptep; 6672 6672 args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT); 6673 + args->addr_mask = addr_mask; 6673 6674 args->pgprot = pgprot; 6674 6675 args->writable = writable; 6675 6676 args->special = special;