Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfio-v4.6-rc1' of git://github.com/awilliam/linux-vfio

Pull VFIO updates from Alex Williamson:
"Various enablers for assignment of Intel graphics devices and future
support of vGPU devices (Alex Williamson). This includes

- Handling the vfio type1 interface as an API rather than a specific
implementation, allowing multiple type1 providers.

- Capability chains, similar to PCI device capabilities, that allow
extending ioctls. Extensions here include device specific regions
and sparse mmap descriptions. The former is used to expose non-PCI
regions for IGD, including the OpRegion (particularly the Video
BIOS Table), and read only PCI config access to the host and LPC
bridge as drivers often depend on identifying those devices.

Sparse mmaps here are used to describe the MSI-X vector table, which
vfio has always protected from mmap, but never had an API to
explicitly define that protection. In future vGPU support this is
expected to allow the description of PCI BARs that may mix direct
access and emulated access within a single region.

- The ability to expose the shadow ROM as an option ROM as IGD use
cases may rely on the ROM even though the physical device does not
make use of a PCI option ROM BAR"

* tag 'vfio-v4.6-rc1' of git://github.com/awilliam/linux-vfio:
vfio/pci: return -EFAULT if copy_to_user fails
vfio/pci: Expose shadow ROM as PCI option ROM
vfio/pci: Intel IGD host and LPC bridge config space access
vfio/pci: Intel IGD OpRegion support
vfio/pci: Enable virtual register in PCI config space
vfio/pci: Add infrastructure for additional device specific regions
vfio: Define device specific region type capability
vfio/pci: Include sparse mmap capability for MSI-X table regions
vfio: Define sparse mmap capability for regions
vfio: Add capability chain helpers
vfio: Define capability chains
vfio: If an IOMMU backend fails, keep looking
vfio/pci: Fix unsigned comparison overflow

+706 -37
+4
drivers/vfio/pci/Kconfig
··· 26 26 config VFIO_PCI_INTX 27 27 depends on VFIO_PCI 28 28 def_bool y if !S390 29 + 30 + config VFIO_PCI_IGD 31 + depends on VFIO_PCI 32 + def_bool y if X86
+1
drivers/vfio/pci/Makefile
··· 1 1 2 2 vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o 3 + vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o 3 4 4 5 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
+168 -7
drivers/vfio/pci/vfio_pci.c
··· 111 111 } 112 112 113 113 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); 114 + static void vfio_pci_disable(struct vfio_pci_device *vdev); 114 115 115 116 static int vfio_pci_enable(struct vfio_pci_device *vdev) 116 117 { ··· 170 169 if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) 171 170 vdev->has_vga = true; 172 171 172 + 173 + if (vfio_pci_is_vga(pdev) && 174 + pdev->vendor == PCI_VENDOR_ID_INTEL && 175 + IS_ENABLED(CONFIG_VFIO_PCI_IGD)) { 176 + ret = vfio_pci_igd_init(vdev); 177 + if (ret) { 178 + dev_warn(&vdev->pdev->dev, 179 + "Failed to setup Intel IGD regions\n"); 180 + vfio_pci_disable(vdev); 181 + return ret; 182 + } 183 + } 184 + 173 185 return 0; 174 186 } 175 187 176 188 static void vfio_pci_disable(struct vfio_pci_device *vdev) 177 189 { 178 190 struct pci_dev *pdev = vdev->pdev; 179 - int bar; 191 + int i, bar; 180 192 181 193 /* Stop the device from further DMA */ 182 194 pci_clear_master(pdev); ··· 199 185 vdev->irq_type, 0, 0, NULL); 200 186 201 187 vdev->virq_disabled = false; 188 + 189 + for (i = 0; i < vdev->num_regions; i++) 190 + vdev->region[i].ops->release(vdev, &vdev->region[i]); 191 + 192 + vdev->num_regions = 0; 193 + kfree(vdev->region); 194 + vdev->region = NULL; /* don't krealloc a freed pointer */ 202 195 203 196 vfio_config_free(vdev); 204 197 ··· 442 421 return walk.ret; 443 422 } 444 423 424 + static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, 425 + struct vfio_info_cap *caps) 426 + { 427 + struct vfio_info_cap_header *header; 428 + struct vfio_region_info_cap_sparse_mmap *sparse; 429 + size_t end, size; 430 + int nr_areas = 2, i = 0; 431 + 432 + end = pci_resource_len(vdev->pdev, vdev->msix_bar); 433 + 434 + /* If MSI-X table is aligned to the start or end, only one area */ 435 + if (((vdev->msix_offset & PAGE_MASK) == 0) || 436 + (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) >= end)) 437 + nr_areas = 1; 438 + 439 + size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas)); 440 + 441 + 
header = vfio_info_cap_add(caps, size, 442 + VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1); 443 + if (IS_ERR(header)) 444 + return PTR_ERR(header); 445 + 446 + sparse = container_of(header, 447 + struct vfio_region_info_cap_sparse_mmap, header); 448 + sparse->nr_areas = nr_areas; 449 + 450 + if (vdev->msix_offset & PAGE_MASK) { 451 + sparse->areas[i].offset = 0; 452 + sparse->areas[i].size = vdev->msix_offset & PAGE_MASK; 453 + i++; 454 + } 455 + 456 + if (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) < end) { 457 + sparse->areas[i].offset = PAGE_ALIGN(vdev->msix_offset + 458 + vdev->msix_size); 459 + sparse->areas[i].size = end - sparse->areas[i].offset; 460 + i++; 461 + } 462 + 463 + return 0; 464 + } 465 + 466 + static int region_type_cap(struct vfio_pci_device *vdev, 467 + struct vfio_info_cap *caps, 468 + unsigned int type, unsigned int subtype) 469 + { 470 + struct vfio_info_cap_header *header; 471 + struct vfio_region_info_cap_type *cap; 472 + 473 + header = vfio_info_cap_add(caps, sizeof(*cap), 474 + VFIO_REGION_INFO_CAP_TYPE, 1); 475 + if (IS_ERR(header)) 476 + return PTR_ERR(header); 477 + 478 + cap = container_of(header, struct vfio_region_info_cap_type, header); 479 + cap->type = type; 480 + cap->subtype = subtype; 481 + 482 + return 0; 483 + } 484 + 485 + int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, 486 + unsigned int type, unsigned int subtype, 487 + const struct vfio_pci_regops *ops, 488 + size_t size, u32 flags, void *data) 489 + { 490 + struct vfio_pci_region *region; 491 + 492 + region = krealloc(vdev->region, 493 + (vdev->num_regions + 1) * sizeof(*region), 494 + GFP_KERNEL); 495 + if (!region) 496 + return -ENOMEM; 497 + 498 + vdev->region = region; 499 + vdev->region[vdev->num_regions].type = type; 500 + vdev->region[vdev->num_regions].subtype = subtype; 501 + vdev->region[vdev->num_regions].ops = ops; 502 + vdev->region[vdev->num_regions].size = size; 503 + vdev->region[vdev->num_regions].flags = flags; 504 + 
vdev->region[vdev->num_regions].data = data; 505 + 506 + vdev->num_regions++; 507 + 508 + return 0; 509 + } 510 + 445 511 static long vfio_pci_ioctl(void *device_data, 446 512 unsigned int cmd, unsigned long arg) 447 513 { ··· 551 443 if (vdev->reset_works) 552 444 info.flags |= VFIO_DEVICE_FLAGS_RESET; 553 445 554 - info.num_regions = VFIO_PCI_NUM_REGIONS; 446 + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; 555 447 info.num_irqs = VFIO_PCI_NUM_IRQS; 556 448 557 449 return copy_to_user((void __user *)arg, &info, minsz) ? ··· 560 452 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { 561 453 struct pci_dev *pdev = vdev->pdev; 562 454 struct vfio_region_info info; 455 + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; 456 + int i, ret; 563 457 564 458 minsz = offsetofend(struct vfio_region_info, offset); 565 459 ··· 590 480 VFIO_REGION_INFO_FLAG_WRITE; 591 481 if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) && 592 482 pci_resource_flags(pdev, info.index) & 593 - IORESOURCE_MEM && info.size >= PAGE_SIZE) 483 + IORESOURCE_MEM && info.size >= PAGE_SIZE) { 594 484 info.flags |= VFIO_REGION_INFO_FLAG_MMAP; 485 + if (info.index == vdev->msix_bar) { 486 + ret = msix_sparse_mmap_cap(vdev, &caps); 487 + if (ret) 488 + return ret; 489 + } 490 + } 491 + 595 492 break; 596 493 case VFIO_PCI_ROM_REGION_INDEX: 597 494 { ··· 610 493 611 494 /* Report the BAR size, not the ROM size */ 612 495 info.size = pci_resource_len(pdev, info.index); 613 - if (!info.size) 614 - break; 496 + if (!info.size) { 497 + /* Shadow ROMs appear as PCI option ROMs */ 498 + if (pdev->resource[PCI_ROM_RESOURCE].flags & 499 + IORESOURCE_ROM_SHADOW) 500 + info.size = 0x20000; 501 + else 502 + break; 503 + } 615 504 616 505 /* Is it really there? 
*/ 617 506 io = pci_map_rom(pdev, &size); ··· 641 518 642 519 break; 643 520 default: 644 - return -EINVAL; 521 + if (info.index >= 522 + VFIO_PCI_NUM_REGIONS + vdev->num_regions) 523 + return -EINVAL; 524 + 525 + i = info.index - VFIO_PCI_NUM_REGIONS; 526 + 527 + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); 528 + info.size = vdev->region[i].size; 529 + info.flags = vdev->region[i].flags; 530 + 531 + ret = region_type_cap(vdev, &caps, 532 + vdev->region[i].type, 533 + vdev->region[i].subtype); 534 + if (ret) 535 + return ret; 536 + } 537 + 538 + if (caps.size) { 539 + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; 540 + if (info.argsz < sizeof(info) + caps.size) { 541 + info.argsz = sizeof(info) + caps.size; 542 + info.cap_offset = 0; 543 + } else { 544 + vfio_info_cap_shift(&caps, sizeof(info)); 545 + if (copy_to_user((void __user *)arg + 546 + sizeof(info), caps.buf, 547 + caps.size)) { 548 + kfree(caps.buf); 549 + return -EFAULT; 550 + } 551 + info.cap_offset = sizeof(info); 552 + } 553 + 554 + kfree(caps.buf); 645 555 } 646 556 647 557 return copy_to_user((void __user *)arg, &info, minsz) ? ··· 954 798 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); 955 799 struct vfio_pci_device *vdev = device_data; 956 800 957 - if (index >= VFIO_PCI_NUM_REGIONS) 801 + if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) 958 802 return -EINVAL; 959 803 960 804 switch (index) { ··· 971 815 972 816 case VFIO_PCI_VGA_REGION_INDEX: 973 817 return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); 818 + default: 819 + index -= VFIO_PCI_NUM_REGIONS; 820 + return vdev->region[index].ops->rw(vdev, buf, 821 + count, ppos, iswrite); 974 822 } 975 823 976 824 return -EINVAL; ··· 1157 997 return; 1158 998 1159 999 vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); 1000 + kfree(vdev->region); 1160 1001 kfree(vdev); 1161 1002 1162 1003 if (vfio_pci_is_vga(pdev)) {
+38 -7
drivers/vfio/pci/vfio_pci_config.c
··· 33 33 34 34 #define PCI_CFG_SPACE_SIZE 256 35 35 36 - /* Useful "pseudo" capabilities */ 36 + /* Fake capability ID for standard config space */ 37 37 #define PCI_CAP_ID_BASIC 0 38 - #define PCI_CAP_ID_INVALID 0xFF 39 38 40 39 #define is_bar(offset) \ 41 40 ((offset >= PCI_BASE_ADDRESS_0 && offset < PCI_BASE_ADDRESS_5 + 4) || \ ··· 300 301 return count; 301 302 } 302 303 304 + /* Virt access uses only virtualization */ 305 + static int vfio_virt_config_write(struct vfio_pci_device *vdev, int pos, 306 + int count, struct perm_bits *perm, 307 + int offset, __le32 val) 308 + { 309 + memcpy(vdev->vconfig + pos, &val, count); 310 + return count; 311 + } 312 + 313 + static int vfio_virt_config_read(struct vfio_pci_device *vdev, int pos, 314 + int count, struct perm_bits *perm, 315 + int offset, __le32 *val) 316 + { 317 + memcpy(val, vdev->vconfig + pos, count); 318 + return count; 319 + } 320 + 303 321 /* Default capability regions to read-only, no-virtualization */ 304 322 static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = { 305 323 [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } ··· 333 317 static struct perm_bits unassigned_perms = { 334 318 .readfn = vfio_raw_config_read, 335 319 .writefn = vfio_raw_config_write 320 + }; 321 + 322 + static struct perm_bits virt_perms = { 323 + .readfn = vfio_virt_config_read, 324 + .writefn = vfio_virt_config_write 336 325 }; 337 326 338 327 static void free_perm_bits(struct perm_bits *perm) ··· 475 454 bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS]; 476 455 477 456 /* 478 - * NB. we expose the actual BAR size here, regardless of whether 479 - * we can read it. When we report the REGION_INFO for the ROM 480 - * we report what PCI tells us is the actual ROM size. 457 + * NB. REGION_INFO will have reported zero size if we weren't able 458 + * to read the ROM, but we still return the actual BAR size here if 459 + * it exists (or the shadow ROM space). 
481 460 */ 482 461 if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { 483 462 mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); 463 + mask |= PCI_ROM_ADDRESS_ENABLE; 464 + *bar &= cpu_to_le32((u32)mask); 465 + } else if (pdev->resource[PCI_ROM_RESOURCE].flags & 466 + IORESOURCE_ROM_SHADOW) { 467 + mask = ~(0x20000 - 1); 484 468 mask |= PCI_ROM_ADDRESS_ENABLE; 485 469 *bar &= cpu_to_le32((u32)mask); 486 470 } else ··· 1358 1332 pos + i, map[pos + i], cap); 1359 1333 } 1360 1334 1335 + BUILD_BUG_ON(PCI_CAP_ID_MAX >= PCI_CAP_ID_INVALID_VIRT); 1336 + 1361 1337 memset(map + pos, cap, len); 1362 1338 ret = vfio_fill_vconfig_bytes(vdev, pos, len); 1363 1339 if (ret) ··· 1447 1419 /* 1448 1420 * Even though ecap is 2 bytes, we're currently a long way 1449 1421 * from exceeding 1 byte capabilities. If we ever make it 1450 - * up to 0xFF we'll need to up this to a two-byte, byte map. 1422 + * up to 0xFE we'll need to up this to a two-byte, byte map. 1451 1423 */ 1452 - BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID); 1424 + BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID_VIRT); 1453 1425 1454 1426 memset(map + epos, ecap, len); 1455 1427 ret = vfio_fill_vconfig_bytes(vdev, epos, len); ··· 1624 1596 1625 1597 if (cap_id == PCI_CAP_ID_INVALID) { 1626 1598 perm = &unassigned_perms; 1599 + cap_start = *ppos; 1600 + } else if (cap_id == PCI_CAP_ID_INVALID_VIRT) { 1601 + perm = &virt_perms; 1627 1602 cap_start = *ppos; 1628 1603 } else { 1629 1604 if (*ppos >= PCI_CFG_SPACE_SIZE) {
+280
drivers/vfio/pci/vfio_pci_igd.c
··· 1 + /* 2 + * VFIO PCI Intel Graphics support 3 + * 4 + * Copyright (C) 2016 Red Hat, Inc. All rights reserved. 5 + * Author: Alex Williamson <alex.williamson@redhat.com> 6 + * 7 + * This program is free software; you can redistribute it and/or modify 8 + * it under the terms of the GNU General Public License version 2 as 9 + * published by the Free Software Foundation. 10 + * 11 + * Register a device specific region through which to provide read-only 12 + * access to the Intel IGD opregion. The register defining the opregion 13 + * address is also virtualized to prevent user modification. 14 + */ 15 + 16 + #include <linux/io.h> 17 + #include <linux/pci.h> 18 + #include <linux/uaccess.h> 19 + #include <linux/vfio.h> 20 + 21 + #include "vfio_pci_private.h" 22 + 23 + #define OPREGION_SIGNATURE "IntelGraphicsMem" 24 + #define OPREGION_SIZE (8 * 1024) 25 + #define OPREGION_PCI_ADDR 0xfc 26 + 27 + static size_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf, 28 + size_t count, loff_t *ppos, bool iswrite) 29 + { 30 + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; 31 + void *base = vdev->region[i].data; 32 + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; 33 + 34 + if (pos >= vdev->region[i].size || iswrite) 35 + return -EINVAL; 36 + 37 + count = min(count, (size_t)(vdev->region[i].size - pos)); 38 + 39 + if (copy_to_user(buf, base + pos, count)) 40 + return -EFAULT; 41 + 42 + *ppos += count; 43 + 44 + return count; 45 + } 46 + 47 + static void vfio_pci_igd_release(struct vfio_pci_device *vdev, 48 + struct vfio_pci_region *region) 49 + { 50 + memunmap(region->data); 51 + } 52 + 53 + static const struct vfio_pci_regops vfio_pci_igd_regops = { 54 + .rw = vfio_pci_igd_rw, 55 + .release = vfio_pci_igd_release, 56 + }; 57 + 58 + static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) 59 + { 60 + __le32 *dwordp = (__le32 *)(vdev->vconfig + OPREGION_PCI_ADDR); 61 + u32 addr, size; 62 + void *base; 63 + int ret; 64 + 65 + ret = 
pci_read_config_dword(vdev->pdev, OPREGION_PCI_ADDR, &addr); 66 + if (ret) 67 + return ret; 68 + 69 + if (!addr || !(~addr)) 70 + return -ENODEV; 71 + 72 + base = memremap(addr, OPREGION_SIZE, MEMREMAP_WB); 73 + if (!base) 74 + return -ENOMEM; 75 + 76 + if (memcmp(base, OPREGION_SIGNATURE, 16)) { 77 + memunmap(base); 78 + return -EINVAL; 79 + } 80 + 81 + size = le32_to_cpu(*(__le32 *)(base + 16)); 82 + if (!size) { 83 + memunmap(base); 84 + return -EINVAL; 85 + } 86 + 87 + size *= 1024; /* In KB */ 88 + 89 + if (size != OPREGION_SIZE) { 90 + memunmap(base); 91 + base = memremap(addr, size, MEMREMAP_WB); 92 + if (!base) 93 + return -ENOMEM; 94 + } 95 + 96 + ret = vfio_pci_register_dev_region(vdev, 97 + PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, 98 + VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, 99 + &vfio_pci_igd_regops, size, VFIO_REGION_INFO_FLAG_READ, base); 100 + if (ret) { 101 + memunmap(base); 102 + return ret; 103 + } 104 + 105 + /* Fill vconfig with the hw value and virtualize register */ 106 + *dwordp = cpu_to_le32(addr); 107 + memset(vdev->pci_config_map + OPREGION_PCI_ADDR, 108 + PCI_CAP_ID_INVALID_VIRT, 4); 109 + 110 + return ret; 111 + } 112 + 113 + static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev, 114 + char __user *buf, size_t count, loff_t *ppos, 115 + bool iswrite) 116 + { 117 + unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; 118 + struct pci_dev *pdev = vdev->region[i].data; 119 + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; 120 + size_t size; 121 + int ret; 122 + 123 + if (pos >= vdev->region[i].size || iswrite) 124 + return -EINVAL; 125 + 126 + size = count = min(count, (size_t)(vdev->region[i].size - pos)); 127 + 128 + if ((pos & 1) && size) { 129 + u8 val; 130 + 131 + ret = pci_user_read_config_byte(pdev, pos, &val); 132 + if (ret) 133 + return pcibios_err_to_errno(ret); 134 + 135 + if (copy_to_user(buf + count - size, &val, 1)) 136 + return -EFAULT; 137 + 138 + pos++; 139 + size--; 140 + } 141 + 142 
+ if ((pos & 3) && size > 2) { 143 + u16 val; 144 + 145 + ret = pci_user_read_config_word(pdev, pos, &val); 146 + if (ret) 147 + return pcibios_err_to_errno(ret); 148 + 149 + val = cpu_to_le16(val); 150 + if (copy_to_user(buf + count - size, &val, 2)) 151 + return -EFAULT; 152 + 153 + pos += 2; 154 + size -= 2; 155 + } 156 + 157 + while (size > 3) { 158 + u32 val; 159 + 160 + ret = pci_user_read_config_dword(pdev, pos, &val); 161 + if (ret) 162 + return pcibios_err_to_errno(ret); 163 + 164 + val = cpu_to_le32(val); 165 + if (copy_to_user(buf + count - size, &val, 4)) 166 + return -EFAULT; 167 + 168 + pos += 4; 169 + size -= 4; 170 + } 171 + 172 + while (size >= 2) { 173 + u16 val; 174 + 175 + ret = pci_user_read_config_word(pdev, pos, &val); 176 + if (ret) 177 + return pcibios_err_to_errno(ret); 178 + 179 + val = cpu_to_le16(val); 180 + if (copy_to_user(buf + count - size, &val, 2)) 181 + return -EFAULT; 182 + 183 + pos += 2; 184 + size -= 2; 185 + } 186 + 187 + while (size) { 188 + u8 val; 189 + 190 + ret = pci_user_read_config_byte(pdev, pos, &val); 191 + if (ret) 192 + return pcibios_err_to_errno(ret); 193 + 194 + if (copy_to_user(buf + count - size, &val, 1)) 195 + return -EFAULT; 196 + 197 + pos++; 198 + size--; 199 + } 200 + 201 + *ppos += count; 202 + 203 + return count; 204 + } 205 + 206 + static void vfio_pci_igd_cfg_release(struct vfio_pci_device *vdev, 207 + struct vfio_pci_region *region) 208 + { 209 + struct pci_dev *pdev = region->data; 210 + 211 + pci_dev_put(pdev); 212 + } 213 + 214 + static const struct vfio_pci_regops vfio_pci_igd_cfg_regops = { 215 + .rw = vfio_pci_igd_cfg_rw, 216 + .release = vfio_pci_igd_cfg_release, 217 + }; 218 + 219 + static int vfio_pci_igd_cfg_init(struct vfio_pci_device *vdev) 220 + { 221 + struct pci_dev *host_bridge, *lpc_bridge; 222 + int ret; 223 + 224 + host_bridge = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0, 0)); 225 + if (!host_bridge) 226 + return -ENODEV; 227 + 228 + if (host_bridge->vendor != 
PCI_VENDOR_ID_INTEL || 229 + host_bridge->class != (PCI_CLASS_BRIDGE_HOST << 8)) { 230 + pci_dev_put(host_bridge); 231 + return -EINVAL; 232 + } 233 + 234 + ret = vfio_pci_register_dev_region(vdev, 235 + PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, 236 + VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, 237 + &vfio_pci_igd_cfg_regops, host_bridge->cfg_size, 238 + VFIO_REGION_INFO_FLAG_READ, host_bridge); 239 + if (ret) { 240 + pci_dev_put(host_bridge); 241 + return ret; 242 + } 243 + 244 + lpc_bridge = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0x1f, 0)); 245 + if (!lpc_bridge) 246 + return -ENODEV; 247 + 248 + if (lpc_bridge->vendor != PCI_VENDOR_ID_INTEL || 249 + lpc_bridge->class != (PCI_CLASS_BRIDGE_ISA << 8)) { 250 + pci_dev_put(lpc_bridge); 251 + return -EINVAL; 252 + } 253 + 254 + ret = vfio_pci_register_dev_region(vdev, 255 + PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, 256 + VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, 257 + &vfio_pci_igd_cfg_regops, lpc_bridge->cfg_size, 258 + VFIO_REGION_INFO_FLAG_READ, lpc_bridge); 259 + if (ret) { 260 + pci_dev_put(lpc_bridge); 261 + return ret; 262 + } 263 + 264 + return 0; 265 + } 266 + 267 + int vfio_pci_igd_init(struct vfio_pci_device *vdev) 268 + { 269 + int ret; 270 + 271 + ret = vfio_pci_igd_opregion_init(vdev); 272 + if (ret) 273 + return ret; 274 + 275 + ret = vfio_pci_igd_cfg_init(vdev); 276 + if (ret) 277 + return ret; 278 + 279 + return 0; 280 + }
+9 -8
drivers/vfio/pci/vfio_pci_intrs.c
··· 309 309 int vector, int fd, bool msix) 310 310 { 311 311 struct pci_dev *pdev = vdev->pdev; 312 - int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector; 313 - char *name = msix ? "vfio-msix" : "vfio-msi"; 314 312 struct eventfd_ctx *trigger; 315 - int ret; 313 + int irq, ret; 316 314 317 - if (vector >= vdev->num_ctx) 315 + if (vector < 0 || vector >= vdev->num_ctx) 318 316 return -EINVAL; 317 + 318 + irq = msix ? vdev->msix[vector].vector : pdev->irq + vector; 319 319 320 320 if (vdev->ctx[vector].trigger) { 321 321 free_irq(irq, vdev->ctx[vector].trigger); ··· 328 328 if (fd < 0) 329 329 return 0; 330 330 331 - vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)", 332 - name, vector, pci_name(pdev)); 331 + vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "vfio-msi%s[%d](%s)", 332 + msix ? "x" : "", vector, 333 + pci_name(pdev)); 333 334 if (!vdev->ctx[vector].name) 334 335 return -ENOMEM; 335 336 ··· 380 379 { 381 380 int i, j, ret = 0; 382 381 383 - if (start + count > vdev->num_ctx) 382 + if (start >= vdev->num_ctx || start + count > vdev->num_ctx) 384 383 return -EINVAL; 385 384 386 385 for (i = 0, j = start; i < count && !ret; i++, j++) { ··· 389 388 } 390 389 391 390 if (ret) { 392 - for (--j; j >= start; j--) 391 + for (--j; j >= (int)start; j--) 393 392 vfio_msi_set_vector_signal(vdev, j, -1, msix); 394 393 } 395 394
+39
drivers/vfio/pci/vfio_pci_private.h
··· 14 14 #include <linux/mutex.h> 15 15 #include <linux/pci.h> 16 16 #include <linux/irqbypass.h> 17 + #include <linux/types.h> 17 18 18 19 #ifndef VFIO_PCI_PRIVATE_H 19 20 #define VFIO_PCI_PRIVATE_H ··· 25 24 #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) 26 25 #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) 27 26 27 + /* Special capability IDs predefined access */ 28 + #define PCI_CAP_ID_INVALID 0xFF /* default raw access */ 29 + #define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ 30 + 28 31 struct vfio_pci_irq_ctx { 29 32 struct eventfd_ctx *trigger; 30 33 struct virqfd *unmask; ··· 36 31 char *name; 37 32 bool masked; 38 33 struct irq_bypass_producer producer; 34 + }; 35 + 36 + struct vfio_pci_device; 37 + struct vfio_pci_region; 38 + 39 + struct vfio_pci_regops { 40 + size_t (*rw)(struct vfio_pci_device *vdev, char __user *buf, 41 + size_t count, loff_t *ppos, bool iswrite); 42 + void (*release)(struct vfio_pci_device *vdev, 43 + struct vfio_pci_region *region); 44 + }; 45 + 46 + struct vfio_pci_region { 47 + u32 type; 48 + u32 subtype; 49 + const struct vfio_pci_regops *ops; 50 + void *data; 51 + size_t size; 52 + u32 flags; 39 53 }; 40 54 41 55 struct vfio_pci_device { ··· 69 45 struct vfio_pci_irq_ctx *ctx; 70 46 int num_ctx; 71 47 int irq_type; 48 + int num_regions; 49 + struct vfio_pci_region *region; 72 50 u8 msi_qmax; 73 51 u8 msix_bar; 74 52 u16 msix_size; ··· 117 91 118 92 extern int vfio_config_init(struct vfio_pci_device *vdev); 119 93 extern void vfio_config_free(struct vfio_pci_device *vdev); 94 + 95 + extern int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, 96 + unsigned int type, unsigned int subtype, 97 + const struct vfio_pci_regops *ops, 98 + size_t size, u32 flags, void *data); 99 + #ifdef CONFIG_VFIO_PCI_IGD 100 + extern int vfio_pci_igd_init(struct vfio_pci_device *vdev); 101 + #else 102 + static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) 103 + 
{ 104 + return -ENODEV; 105 + } 106 + #endif 120 107 #endif /* VFIO_PCI_PRIVATE_H */
+6 -3
drivers/vfio/pci/vfio_pci_rdwr.c
··· 124 124 void __iomem *io; 125 125 ssize_t done; 126 126 127 - if (!pci_resource_start(pdev, bar)) 127 + if (pci_resource_start(pdev, bar)) 128 + end = pci_resource_len(pdev, bar); 129 + else if (bar == PCI_ROM_RESOURCE && 130 + pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW) 131 + end = 0x20000; 132 + else 128 133 return -EINVAL; 129 - 130 - end = pci_resource_len(pdev, bar); 131 134 132 135 if (pos >= end) 133 136 return -EINVAL;
+60 -10
drivers/vfio/vfio.c
··· 1080 1080 continue; 1081 1081 } 1082 1082 1083 - /* module reference holds the driver we're working on */ 1084 - mutex_unlock(&vfio.iommu_drivers_lock); 1085 - 1086 1083 data = driver->ops->open(arg); 1087 1084 if (IS_ERR(data)) { 1088 1085 ret = PTR_ERR(data); 1089 1086 module_put(driver->ops->owner); 1090 - goto skip_drivers_unlock; 1087 + continue; 1091 1088 } 1092 1089 1093 1090 ret = __vfio_container_attach_groups(container, driver, data); 1094 - if (!ret) { 1095 - container->iommu_driver = driver; 1096 - container->iommu_data = data; 1097 - } else { 1091 + if (ret) { 1098 1092 driver->ops->release(data); 1099 1093 module_put(driver->ops->owner); 1094 + continue; 1100 1095 } 1101 1096 1102 - goto skip_drivers_unlock; 1097 + container->iommu_driver = driver; 1098 + container->iommu_data = data; 1099 + break; 1103 1100 } 1104 1101 1105 1102 mutex_unlock(&vfio.iommu_drivers_lock); 1106 - skip_drivers_unlock: 1107 1103 up_write(&container->group_lock); 1108 1104 1109 1105 return ret; ··· 1727 1731 return vfio_ioctl_check_extension(group->container, arg); 1728 1732 } 1729 1733 EXPORT_SYMBOL_GPL(vfio_external_check_extension); 1734 + 1735 + /** 1736 + * Sub-module support 1737 + */ 1738 + /* 1739 + * Helper for managing a buffer of info chain capabilities, allocate or 1740 + * reallocate a buffer with additional @size, filling in @id and @version 1741 + * of the capability. A pointer to the new capability is returned. 1742 + * 1743 + * NB. The chain is based at the head of the buffer, so new entries are 1744 + * added to the tail, vfio_info_cap_shift() should be called to fixup the 1745 + * next offsets prior to copying to the user buffer. 
1746 + */ 1747 + struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, 1748 + size_t size, u16 id, u16 version) 1749 + { 1750 + void *buf; 1751 + struct vfio_info_cap_header *header, *tmp; 1752 + 1753 + buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL); 1754 + if (!buf) { 1755 + kfree(caps->buf); 1756 + caps->size = 0; 1757 + return ERR_PTR(-ENOMEM); 1758 + } 1759 + 1760 + caps->buf = buf; 1761 + header = buf + caps->size; 1762 + 1763 + /* Eventually copied to user buffer, zero */ 1764 + memset(header, 0, size); 1765 + 1766 + header->id = id; 1767 + header->version = version; 1768 + 1769 + /* Add to the end of the capability chain */ 1770 + for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next) 1771 + ; /* nothing */ 1772 + 1773 + tmp->next = caps->size; 1774 + caps->size += size; 1775 + 1776 + return header; 1777 + } 1778 + EXPORT_SYMBOL_GPL(vfio_info_cap_add); 1779 + 1780 + void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) 1781 + { 1782 + struct vfio_info_cap_header *tmp; 1783 + 1784 + for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next - offset) 1785 + tmp->next += offset; 1786 + } 1787 + EXPORT_SYMBOL_GPL(vfio_info_cap_shift); 1730 1788 1731 1789 /** 1732 1790 * Module/class support
+11
include/linux/vfio.h
··· 92 92 extern long vfio_external_check_extension(struct vfio_group *group, 93 93 unsigned long arg); 94 94 95 + /* 96 + * Sub-module helpers 97 + */ 98 + struct vfio_info_cap { 99 + struct vfio_info_cap_header *buf; 100 + size_t size; 101 + }; 102 + extern struct vfio_info_cap_header *vfio_info_cap_add( 103 + struct vfio_info_cap *caps, size_t size, u16 id, u16 version); 104 + extern void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset); 105 + 95 106 struct pci_dev; 96 107 #ifdef CONFIG_EEH 97 108 extern void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
+90 -2
include/uapi/linux/vfio.h
··· 59 59 #define VFIO_TYPE (';') 60 60 #define VFIO_BASE 100 61 61 62 + /* 63 + * For extension of INFO ioctls, VFIO makes use of a capability chain 64 + * designed after PCI/e capabilities. A flag bit indicates whether 65 + * this capability chain is supported and a field defined in the fixed 66 + * structure defines the offset of the first capability in the chain. 67 + * This field is only valid when the corresponding bit in the flags 68 + * bitmap is set. This offset field is relative to the start of the 69 + * INFO buffer, as is the next field within each capability header. 70 + * The id within the header is a shared address space per INFO ioctl, 71 + * while the version field is specific to the capability id. The 72 + * contents following the header are specific to the capability id. 73 + */ 74 + struct vfio_info_cap_header { 75 + __u16 id; /* Identifies capability */ 76 + __u16 version; /* Version specific to the capability ID */ 77 + __u32 next; /* Offset of next capability */ 78 + }; 79 + 80 + /* 81 + * Callers of INFO ioctls passing insufficiently sized buffers will see 82 + * the capability chain flag bit set, a zero value for the first capability 83 + * offset (if available within the provided argsz), and argsz will be 84 + * updated to report the necessary buffer size. For compatibility, the 85 + * INFO ioctl will not report error in this case, but the capability chain 86 + * will not be available. 
87 + */ 88 + 62 89 /* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */ 63 90 64 91 /** ··· 221 194 #define VFIO_REGION_INFO_FLAG_READ (1 << 0) /* Region supports read */ 222 195 #define VFIO_REGION_INFO_FLAG_WRITE (1 << 1) /* Region supports write */ 223 196 #define VFIO_REGION_INFO_FLAG_MMAP (1 << 2) /* Region supports mmap */ 197 + #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ 224 198 __u32 index; /* Region index */ 225 - __u32 resv; /* Reserved for alignment */ 199 + __u32 cap_offset; /* Offset within info struct of first cap */ 226 200 __u64 size; /* Region size (bytes) */ 227 201 __u64 offset; /* Region offset from start of device fd */ 228 202 }; 229 203 #define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8) 204 + 205 + /* 206 + * The sparse mmap capability allows finer granularity of specifying areas 207 + * within a region with mmap support. When specified, the user should only 208 + * mmap the offset ranges specified by the areas array. mmaps outside of the 209 + * areas specified may fail (such as the range covering a PCI MSI-X table) or 210 + * may result in improper device behavior. 211 + * 212 + * The structures below define version 1 of this capability. 213 + */ 214 + #define VFIO_REGION_INFO_CAP_SPARSE_MMAP 1 215 + 216 + struct vfio_region_sparse_mmap_area { 217 + __u64 offset; /* Offset of mmap'able area within region */ 218 + __u64 size; /* Size of mmap'able area */ 219 + }; 220 + 221 + struct vfio_region_info_cap_sparse_mmap { 222 + struct vfio_info_cap_header header; 223 + __u32 nr_areas; 224 + __u32 reserved; 225 + struct vfio_region_sparse_mmap_area areas[]; 226 + }; 227 + 228 + /* 229 + * The device specific type capability allows regions unique to a specific 230 + * device or class of devices to be exposed. 
This helps solve the problem for 231 + * vfio bus drivers of defining which region indexes correspond to which region 232 + * on the device, without needing to resort to static indexes, as done by 233 + * vfio-pci. For instance, if we were to go back in time, we might remove 234 + * VFIO_PCI_VGA_REGION_INDEX and let vfio-pci simply define that all indexes 235 + * greater than or equal to VFIO_PCI_NUM_REGIONS are device specific and we'd 236 + * make a "VGA" device specific type to describe the VGA access space. This 237 + * means that non-VGA devices wouldn't need to waste this index, and thus the 238 + * address space associated with it due to implementation of device file 239 + * descriptor offsets in vfio-pci. 240 + * 241 + * The current implementation is now part of the user ABI, so we can't use this 242 + * for VGA, but there are other upcoming use cases, such as opregions for Intel 243 + * IGD devices and framebuffers for vGPU devices. We missed VGA, but we'll 244 + * use this for future additions. 245 + * 246 + * The structure below defines version 1 of this capability. 247 + */ 248 + #define VFIO_REGION_INFO_CAP_TYPE 2 249 + 250 + struct vfio_region_info_cap_type { 251 + struct vfio_info_cap_header header; 252 + __u32 type; /* global per bus driver */ 253 + __u32 subtype; /* type specific */ 254 + }; 255 + 256 + #define VFIO_REGION_TYPE_PCI_VENDOR_TYPE (1 << 31) 257 + #define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff) 258 + 259 + /* 8086 Vendor sub-types */ 260 + #define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION (1) 261 + #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) 262 + #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) 230 263 231 264 /** 232 265 * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, ··· 423 336 * between described ranges are unimplemented. 
424 337 */ 425 338 VFIO_PCI_VGA_REGION_INDEX, 426 - VFIO_PCI_NUM_REGIONS 339 + VFIO_PCI_NUM_REGIONS = 9 /* Fixed user ABI, region indexes >=9 use */ 340 + /* device specific cap to define content. */ 427 341 }; 428 342 429 343 enum {