Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfio-v6.14-rc1' of https://github.com/awilliam/linux-vfio

Pull vfio updates from Alex Williamson:

- Extend vfio-pci 8-byte read/write support to include archs defining
CONFIG_GENERIC_IOMAP, such as x86, and remove now extraneous #ifdefs
around 64-bit accessors (Ramesh Thomas)

- Update vfio-pci shadow ROM handling and allow cached ROM from setup
data to be exposed as a functional ROM BAR region when available
(Yunxiang Li)

- Update nvgrace-gpu vfio-pci variant driver for new Grace Blackwell
hardware, conditionalizing the uncached BAR workaround for previous
generation hardware based on the presence of a flag in a new DVSEC
capability, and including a delay during probe for link training to
complete, a new requirement for GB devices (Ankit Agrawal)

* tag 'vfio-v6.14-rc1' of https://github.com/awilliam/linux-vfio:
vfio/nvgrace-gpu: Add GB200 SKU to the devid table
vfio/nvgrace-gpu: Check the HBM training and C2C link status
vfio/nvgrace-gpu: Expose the blackwell device PF BAR1 to the VM
vfio/nvgrace-gpu: Read dvsec register to determine need for uncached resmem
vfio/platform: check the bounds of read/write syscalls
vfio/pci: Expose setup ROM at ROM bar when needed
vfio/pci: Remove shadow ROM specific code paths
vfio/pci: Remove #ifdef iowrite64 and #ifdef ioread64
vfio/pci: Enable iowrite64 and ioread64 for vfio pci

+196 -69
+147 -22
drivers/vfio/pci/nvgrace-gpu/main.c
··· 5 5 6 6 #include <linux/sizes.h> 7 7 #include <linux/vfio_pci_core.h> 8 + #include <linux/delay.h> 9 + #include <linux/jiffies.h> 8 10 9 11 /* 10 12 * The device memory usable to the workloads running in the VM is cached ··· 19 17 #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX 20 18 #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX 21 19 22 - /* Memory size expected as non cached and reserved by the VM driver */ 23 - #define RESMEM_SIZE SZ_1G 24 - 25 20 /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ 26 21 #define MEMBLK_SIZE SZ_512M 22 + 23 + #define DVSEC_BITMAP_OFFSET 0xA 24 + #define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0) 25 + 26 + #define GPU_CAP_DVSEC_REGISTER 3 27 + 28 + #define C2C_LINK_BAR0_OFFSET 0x1498 29 + #define HBM_TRAINING_BAR0_OFFSET 0x200BC 30 + #define STATUS_READY 0xFF 31 + 32 + #define POLL_QUANTUM_MS 1000 33 + #define POLL_TIMEOUT_MS (30 * 1000) 27 34 28 35 /* 29 36 * The state of the two device memory region - resmem and usemem - is ··· 57 46 struct mem_region resmem; 58 47 /* Lock to control device memory kernel mapping */ 59 48 struct mutex remap_lock; 49 + bool has_mig_hw_bug; 60 50 }; 61 51 62 52 static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) ··· 78 66 if (index == USEMEM_REGION_INDEX) 79 67 return &nvdev->usemem; 80 68 81 - if (index == RESMEM_REGION_INDEX) 69 + if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX) 82 70 return &nvdev->resmem; 83 71 84 72 return NULL; ··· 763 751 u64 memphys, u64 memlength) 764 752 { 765 753 int ret = 0; 754 + u64 resmem_size = 0; 766 755 767 756 /* 768 - * The VM GPU device driver needs a non-cacheable region to support 769 - * the MIG feature. Since the device memory is mapped as NORMAL cached, 770 - * carve out a region from the end with a different NORMAL_NC 771 - * property (called as reserved memory and represented as resmem). 
This 772 - * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while 773 - * exposing the rest (termed as usable memory and represented using usemem) 774 - * as cacheable 64b BAR (region 4 and 5). 757 + * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable 758 + * region to support the MIG feature owing to a hardware bug. Since the 759 + * device memory is mapped as NORMAL cached, carve out a region from the end 760 + * with a different NORMAL_NC property (called as reserved memory and 761 + * represented as resmem). This region then is exposed as a 64b BAR 762 + * (region 2 and 3) to the VM, while exposing the rest (termed as usable 763 + * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5). 775 764 * 776 765 * devmem (memlength) 777 766 * |-------------------------------------------------| 778 767 * | | 779 768 * usemem.memphys resmem.memphys 769 + * 770 + * This hardware bug is fixed on the Grace Blackwell platforms and the 771 + * presence of the bug can be determined through nvdev->has_mig_hw_bug. 772 + * Thus on systems with the hardware fix, there is no need to partition 773 + * the GPU device memory and the entire memory is usable and mapped as 774 + * NORMAL cached (i.e. resmem size is 0). 780 775 */ 776 + if (nvdev->has_mig_hw_bug) 777 + resmem_size = SZ_1G; 778 + 781 779 nvdev->usemem.memphys = memphys; 782 780 783 781 /* 784 782 * The device memory exposed to the VM is added to the kernel by the 785 - * VM driver module in chunks of memory block size. Only the usable 786 - * memory (usemem) is added to the kernel for usage by the VM 787 - * workloads. Make the usable memory size memblock aligned. 783 + * VM driver module in chunks of memory block size. Note that only the 784 + * usable memory (usemem) is added to the kernel for usage by the VM 785 + * workloads. 
788 786 */ 789 - if (check_sub_overflow(memlength, RESMEM_SIZE, 787 + if (check_sub_overflow(memlength, resmem_size, 790 788 &nvdev->usemem.memlength)) { 791 789 ret = -EOVERFLOW; 792 790 goto done; 793 791 } 794 792 795 793 /* 796 - * The USEMEM part of the device memory has to be MEMBLK_SIZE 797 - * aligned. This is a hardwired ABI value between the GPU FW and 798 - * VFIO driver. The VM device driver is also aware of it and make 799 - * use of the value for its calculation to determine USEMEM size. 794 + * The usemem region is exposed as a 64B Bar composed of region 4 and 5. 795 + * Calculate and save the BAR size for the region. 796 + */ 797 + nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); 798 + 799 + /* 800 + * If the hardware has the fix for MIG, there is no requirement 801 + * for splitting the device memory to create RESMEM. The entire 802 + * device memory is usable and will be USEMEM. Return here for 803 + * such case. 804 + */ 805 + if (!nvdev->has_mig_hw_bug) 806 + goto done; 807 + 808 + /* 809 + * When the device memory is split to workaround the MIG bug on 810 + * Grace Hopper, the USEMEM part of the device memory has to be 811 + * MEMBLK_SIZE aligned. This is a hardwired ABI value between the 812 + * GPU FW and VFIO driver. The VM device driver is also aware of it 813 + * and make use of the value for its calculation to determine USEMEM 814 + * size. Note that the device memory may not be 512M aligned. 800 815 */ 801 816 nvdev->usemem.memlength = round_down(nvdev->usemem.memlength, 802 817 MEMBLK_SIZE); ··· 842 803 } 843 804 844 805 /* 845 - * The memory regions are exposed as BARs. Calculate and save 846 - * the BAR size for them. 806 + * The resmem region is exposed as a 64b BAR composed of region 2 and 3 807 + * for Grace Hopper. Calculate and save the BAR size for the region. 
847 808 */ 848 - nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); 849 809 nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength); 850 810 done: 811 + return ret; 812 + } 813 + 814 + static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) 815 + { 816 + int pcie_dvsec; 817 + u16 dvsec_ctrl16; 818 + 819 + pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA, 820 + GPU_CAP_DVSEC_REGISTER); 821 + 822 + if (pcie_dvsec) { 823 + pci_read_config_word(pdev, 824 + pcie_dvsec + DVSEC_BITMAP_OFFSET, 825 + &dvsec_ctrl16); 826 + 827 + if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM) 828 + return false; 829 + } 830 + 831 + return true; 832 + } 833 + 834 + /* 835 + * To reduce the system bootup time, the HBM training has 836 + * been moved out of the UEFI on the Grace-Blackwell systems. 837 + * 838 + * The onus of checking whether the HBM training has completed 839 + * thus falls on the module. The HBM training status can be 840 + * determined from a BAR0 register. 841 + * 842 + * Similarly, another BAR0 register exposes the status of the 843 + * CPU-GPU chip-to-chip (C2C) cache coherent interconnect. 844 + * 845 + * Poll these register and check for 30s. If the HBM training is 846 + * not complete or if the C2C link is not ready, fail the probe. 847 + * 848 + * While the wait is not required on Grace Hopper systems, it 849 + * is beneficial to make the check to ensure the device is in an 850 + * expected state. 851 + * 852 + * Ensure that the BAR0 region is enabled before accessing the 853 + * registers. 
854 + */ 855 + static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) 856 + { 857 + unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); 858 + void __iomem *io; 859 + int ret = -ETIME; 860 + 861 + ret = pci_enable_device(pdev); 862 + if (ret) 863 + return ret; 864 + 865 + ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME); 866 + if (ret) 867 + goto request_region_exit; 868 + 869 + io = pci_iomap(pdev, 0, 0); 870 + if (!io) { 871 + ret = -ENOMEM; 872 + goto iomap_exit; 873 + } 874 + 875 + do { 876 + if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && 877 + (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) { 878 + ret = 0; 879 + goto reg_check_exit; 880 + } 881 + msleep(POLL_QUANTUM_MS); 882 + } while (!time_after(jiffies, timeout)); 883 + 884 + reg_check_exit: 885 + pci_iounmap(pdev, io); 886 + iomap_exit: 887 + pci_release_selected_regions(pdev, 1 << 0); 888 + request_region_exit: 889 + pci_disable_device(pdev); 851 890 return ret; 852 891 } 853 892 ··· 936 819 struct nvgrace_gpu_pci_core_device *nvdev; 937 820 u64 memphys, memlength; 938 821 int ret; 822 + 823 + ret = nvgrace_gpu_wait_device_ready(pdev); 824 + if (ret) 825 + return ret; 939 826 940 827 ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); 941 828 if (!ret) ··· 953 832 dev_set_drvdata(&pdev->dev, &nvdev->core_device); 954 833 955 834 if (ops == &nvgrace_gpu_pci_ops) { 835 + nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev); 836 + 956 837 /* 957 838 * Device memory properties are identified in the host ACPI 958 839 * table. Set the nvgrace_gpu_pci_core_device structure. ··· 991 868 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, 992 869 /* GH200 SKU */ 993 870 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) }, 871 + /* GB200 SKU */ 872 + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) }, 994 873 {} 995 874 }; 996 875
+4 -4
drivers/vfio/pci/vfio_pci_config.c
··· 511 511 mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); 512 512 mask |= PCI_ROM_ADDRESS_ENABLE; 513 513 *vbar &= cpu_to_le32((u32)mask); 514 - } else if (pdev->resource[PCI_ROM_RESOURCE].flags & 515 - IORESOURCE_ROM_SHADOW) { 516 - mask = ~(0x20000 - 1); 514 + } else if (pdev->rom && pdev->romlen) { 515 + mask = ~(roundup_pow_of_two(pdev->romlen) - 1); 517 516 mask |= PCI_ROM_ADDRESS_ENABLE; 518 517 *vbar &= cpu_to_le32((u32)mask); 519 - } else 518 + } else { 520 519 *vbar = 0; 520 + } 521 521 522 522 vdev->bardirty = false; 523 523 }
+18 -22
drivers/vfio/pci/vfio_pci_core.c
··· 1054 1054 1055 1055 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 1056 1056 info.flags = 0; 1057 + info.size = 0; 1057 1058 1058 - /* Report the BAR size, not the ROM size */ 1059 - info.size = pci_resource_len(pdev, info.index); 1060 - if (!info.size) { 1061 - /* Shadow ROMs appear as PCI option ROMs */ 1062 - if (pdev->resource[PCI_ROM_RESOURCE].flags & 1063 - IORESOURCE_ROM_SHADOW) 1064 - info.size = 0x20000; 1065 - else 1066 - break; 1067 - } 1068 - 1069 - /* 1070 - * Is it really there? Enable memory decode for implicit access 1071 - * in pci_map_rom(). 1072 - */ 1073 - cmd = vfio_pci_memory_lock_and_enable(vdev); 1074 - io = pci_map_rom(pdev, &size); 1075 - if (io) { 1059 + if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { 1060 + /* 1061 + * Check ROM content is valid. Need to enable memory 1062 + * decode for ROM access in pci_map_rom(). 1063 + */ 1064 + cmd = vfio_pci_memory_lock_and_enable(vdev); 1065 + io = pci_map_rom(pdev, &size); 1066 + if (io) { 1067 + info.flags = VFIO_REGION_INFO_FLAG_READ; 1068 + /* Report the BAR size, not the ROM size. */ 1069 + info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE); 1070 + pci_unmap_rom(pdev, io); 1071 + } 1072 + vfio_pci_memory_unlock_and_restore(vdev, cmd); 1073 + } else if (pdev->rom && pdev->romlen) { 1076 1074 info.flags = VFIO_REGION_INFO_FLAG_READ; 1077 - pci_unmap_rom(pdev, io); 1078 - } else { 1079 - info.size = 0; 1075 + /* Report BAR size as power of two. */ 1076 + info.size = roundup_pow_of_two(pdev->romlen); 1080 1077 } 1081 - vfio_pci_memory_unlock_and_restore(vdev, cmd); 1082 1078 1083 1079 break; 1084 1080 }
+17 -21
drivers/vfio/pci/vfio_pci_rdwr.c
··· 16 16 #include <linux/io.h> 17 17 #include <linux/vfio.h> 18 18 #include <linux/vgaarb.h> 19 + #include <linux/io-64-nonatomic-lo-hi.h> 19 20 20 21 #include "vfio_pci_priv.h" 21 22 ··· 62 61 VFIO_IOWRITE(8) 63 62 VFIO_IOWRITE(16) 64 63 VFIO_IOWRITE(32) 65 - #ifdef iowrite64 66 64 VFIO_IOWRITE(64) 67 - #endif 68 65 69 66 #define VFIO_IOREAD(size) \ 70 67 int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \ ··· 88 89 VFIO_IOREAD(8) 89 90 VFIO_IOREAD(16) 90 91 VFIO_IOREAD(32) 91 - #ifdef ioread64 92 92 VFIO_IOREAD(64) 93 - #endif 94 93 95 94 #define VFIO_IORDWR(size) \ 96 95 static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\ ··· 124 127 VFIO_IORDWR(8) 125 128 VFIO_IORDWR(16) 126 129 VFIO_IORDWR(32) 127 - #if defined(ioread64) && defined(iowrite64) 128 130 VFIO_IORDWR(64) 129 - #endif 130 131 131 132 /* 132 133 * Read or write from an __iomem region (MMIO or I/O port) with an excluded ··· 150 155 else 151 156 fillable = 0; 152 157 153 - #if defined(ioread64) && defined(iowrite64) 154 158 if (fillable >= 8 && !(off % 8)) { 155 159 ret = vfio_pci_iordwr64(vdev, iswrite, test_mem, 156 160 io, buf, off, &filled); ··· 157 163 return ret; 158 164 159 165 } else 160 - #endif 161 166 if (fillable >= 4 && !(off % 4)) { 162 167 ret = vfio_pci_iordwr32(vdev, iswrite, test_mem, 163 168 io, buf, off, &filled); ··· 237 244 238 245 if (pci_resource_start(pdev, bar)) 239 246 end = pci_resource_len(pdev, bar); 240 - else if (bar == PCI_ROM_RESOURCE && 241 - pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW) 242 - end = 0x20000; 247 + else if (bar == PCI_ROM_RESOURCE && pdev->rom && pdev->romlen) 248 + end = roundup_pow_of_two(pdev->romlen); 243 249 else 244 250 return -EINVAL; 245 251 ··· 253 261 * excluded range at the end of the actual ROM. This makes 254 262 * filling large ROM BARs much faster. 
255 263 */ 256 - io = pci_map_rom(pdev, &x_start); 257 - if (!io) { 258 - done = -ENOMEM; 259 - goto out; 264 + if (pci_resource_start(pdev, bar)) { 265 + io = pci_map_rom(pdev, &x_start); 266 + } else { 267 + io = ioremap(pdev->rom, pdev->romlen); 268 + x_start = pdev->romlen; 260 269 } 270 + if (!io) 271 + return -ENOMEM; 261 272 x_end = end; 262 273 } else { 263 274 int ret = vfio_pci_core_setup_barmap(vdev, bar); ··· 283 288 if (done >= 0) 284 289 *ppos += done; 285 290 286 - if (bar == PCI_ROM_RESOURCE) 287 - pci_unmap_rom(pdev, io); 291 + if (bar == PCI_ROM_RESOURCE) { 292 + if (pci_resource_start(pdev, bar)) 293 + pci_unmap_rom(pdev, io); 294 + else 295 + iounmap(io); 296 + } 297 + 288 298 out: 289 299 return done; 290 300 } ··· 381 381 vfio_pci_core_iowrite32(ioeventfd->vdev, test_mem, 382 382 ioeventfd->data, ioeventfd->addr); 383 383 break; 384 - #ifdef iowrite64 385 384 case 8: 386 385 vfio_pci_core_iowrite64(ioeventfd->vdev, test_mem, 387 386 ioeventfd->data, ioeventfd->addr); 388 387 break; 389 - #endif 390 388 } 391 389 } 392 390 ··· 438 440 pos >= vdev->msix_offset + vdev->msix_size)) 439 441 return -EINVAL; 440 442 441 - #ifndef iowrite64 442 443 if (count == 8) 443 444 return -EINVAL; 444 - #endif 445 445 446 446 ret = vfio_pci_core_setup_barmap(vdev, bar); 447 447 if (ret)
+10
drivers/vfio/platform/vfio_platform_common.c
··· 388 388 { 389 389 unsigned int done = 0; 390 390 391 + if (off >= reg->size) 392 + return -EINVAL; 393 + 394 + count = min_t(size_t, count, reg->size - off); 395 + 391 396 if (!reg->ioaddr) { 392 397 reg->ioaddr = 393 398 ioremap(reg->addr, reg->size); ··· 471 466 loff_t off) 472 467 { 473 468 unsigned int done = 0; 469 + 470 + if (off >= reg->size) 471 + return -EINVAL; 472 + 473 + count = min_t(size_t, count, reg->size - off); 474 474 475 475 if (!reg->ioaddr) { 476 476 reg->ioaddr =