Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

um: Add VFIO-based virtual PCI driver

Implement a new virtual PCI driver based on the VFIO framework.
This driver allows users to pass through PCI devices to UML via
VFIO. Currently, only MSI-X capable devices are supported, and
it is assumed that drivers will use MSI-X.

Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Link: https://patch.msgid.link/20250413154421.517878-1-tiwei.btw@antgroup.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>

authored by

Tiwei Bie and committed by
Johannes Berg
a0e2cb6a 6767e878

+1023
+8
arch/um/drivers/Kconfig
··· 367 367 There's no official device ID assigned (yet), set the one you 368 368 wish to use for experimentation here. The default of -1 is 369 369 not valid and will cause the driver to fail at probe. 370 + 371 + config UML_PCI_OVER_VFIO 372 + bool "Enable VFIO-based PCI passthrough" 373 + select UML_PCI 374 + help 375 + This driver provides support for VFIO-based PCI passthrough. 376 + Currently, only MSI-X capable devices are supported, and it 377 + is assumed that drivers will use MSI-X.
+2
arch/um/drivers/Makefile
··· 19 19 harddog-objs := harddog_kern.o 20 20 harddog-builtin-$(CONFIG_UML_WATCHDOG) := harddog_user.o harddog_user_exp.o 21 21 rtc-objs := rtc_kern.o rtc_user.o 22 + vfio_uml-objs := vfio_kern.o vfio_user.o 22 23 23 24 LDFLAGS_vde.o = $(shell $(CC) $(CFLAGS) -print-file-name=libvdeplug.a) 24 25 ··· 63 62 obj-$(CONFIG_UML_RTC) += rtc.o 64 63 obj-$(CONFIG_UML_PCI) += virt-pci.o 65 64 obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virtio_pcidev.o 65 + obj-$(CONFIG_UML_PCI_OVER_VFIO) += vfio_uml.o 66 66 67 67 # pcap_user.o must be added explicitly. 68 68 USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o vde_user.o vector_user.o
+642
arch/um/drivers/vfio_kern.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2025 Ant Group
 * Author: Tiwei Bie <tiwei.btw@antgroup.com>
 *
 * Kernel-side half of the VFIO-based virtual PCI driver for UML.
 * Host-facing syscalls/ioctls live in vfio_user.c (USER_OBJS side).
 */

#define pr_fmt(fmt) "vfio-uml: " fmt

#include <linux/module.h>
#include <linux/logic_iomem.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/string.h>
#include <linux/unaligned.h>
#include <irq_kern.h>
#include <init.h>
#include <os.h>

#include "virt-pci.h"
#include "vfio_user.h"

#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev)

/* Per-MSI-X-vector interrupt context, one entry per vector. */
struct uml_vfio_intr_ctx {
	struct uml_vfio_device *dev;	/* backpointer to owning device */
	int irq;			/* UML IRQ number, -1 while inactive */
};

/* One passed-through PCI device. */
struct uml_vfio_device {
	const char *name;	/* D:B:S.F string from the command line */
	int group;		/* VFIO group fd this device belongs to */

	struct um_pci_device pdev;		/* virt-pci registration */
	struct uml_vfio_user_device udev;	/* host-side fds/regions */
	struct uml_vfio_intr_ctx *intr_ctx;	/* irq_count entries */

	int msix_cap;		/* config-space offset of the MSI-X cap */
	int msix_bar;		/* BAR index holding the MSI-X table */
	int msix_offset;	/* table offset within that BAR */
	int msix_size;		/* table size in bytes */
	u32 *msix_data;		/* shadow of each vector's DATA word */

	struct list_head list;	/* link in uml_vfio_devices */
};

/* Refcounted, cached open VFIO group fd. */
struct uml_vfio_group {
	int id;
	int fd;
	int users;
	struct list_head list;
};

/* Single shared VFIO container; -1 until first device is configured. */
static struct {
	int fd;
	int users;
} uml_vfio_container = { .fd = -1 };
static DEFINE_MUTEX(uml_vfio_container_mtx);

static LIST_HEAD(uml_vfio_groups);
static DEFINE_MUTEX(uml_vfio_groups_mtx);

static LIST_HEAD(uml_vfio_devices);

/*
 * Attach a group to the shared container. The first attachment also
 * configures the IOMMU and maps UML physical memory for DMA; on failure
 * of that step the group is detached again so the refcount stays exact.
 */
static int uml_vfio_set_container(int group_fd)
{
	int err;

	guard(mutex)(&uml_vfio_container_mtx);

	err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd);
	if (err)
		return err;

	uml_vfio_container.users++;
	if (uml_vfio_container.users > 1)
		return 0;

	err = uml_vfio_user_setup_iommu(uml_vfio_container.fd);
	if (err) {
		uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
		uml_vfio_container.users--;
	}
	return err;
}

/* Detach a group from the shared container and drop its reference. */
static void uml_vfio_unset_container(int group_fd)
{
	guard(mutex)(&uml_vfio_container_mtx);

	uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
	uml_vfio_container.users--;
}

/*
 * Return an fd for the given VFIO group id, reusing a cached entry if
 * the group is already open (multiple devices may share one group).
 */
static int uml_vfio_open_group(int group_id)
{
	struct uml_vfio_group *group;
	int err;

	guard(mutex)(&uml_vfio_groups_mtx);

	list_for_each_entry(group, &uml_vfio_groups, list) {
		if (group->id == group_id) {
			group->users++;
			return group->fd;
		}
	}

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return -ENOMEM;

	group->fd = uml_vfio_user_open_group(group_id);
	if (group->fd < 0) {
		err = group->fd;
		goto free_group;
	}

	err = uml_vfio_set_container(group->fd);
	if (err)
		goto close_group;

	group->id = group_id;
	group->users = 1;

	list_add(&group->list, &uml_vfio_groups);

	return group->fd;

close_group:
	os_close_file(group->fd);
free_group:
	kfree(group);
	return err;
}

/*
 * Drop one reference on a cached group; the last reference detaches it
 * from the container, closes the fd and frees the cache entry.
 */
static int uml_vfio_release_group(int group_fd)
{
	struct uml_vfio_group *group;

	guard(mutex)(&uml_vfio_groups_mtx);

	list_for_each_entry(group, &uml_vfio_groups, list) {
		if (group->fd == group_fd) {
			group->users--;
			if (group->users == 0) {
				uml_vfio_unset_container(group_fd);
				os_close_file(group_fd);
				list_del(&group->list);
				kfree(group);
			}
			return 0;
		}
	}

	return -ENOENT;
}

/*
 * IRQ handler for one MSI-X vector: drain the nonblocking eventfd and
 * inject one guest interrupt per 8-byte counter read. The vector's
 * guest IRQ number was captured from the MSI-X table DATA write.
 */
static irqreturn_t uml_vfio_interrupt(int unused, void *opaque)
{
	struct uml_vfio_intr_ctx *ctx = opaque;
	struct uml_vfio_device *dev = ctx->dev;
	int index = ctx - dev->intr_ctx;	/* vector = position in array */
	int irqfd = dev->udev.irqfd[index];
	int irq = dev->msix_data[index];
	uint64_t v;
	int r;

	do {
		r = os_read_file(irqfd, &v, sizeof(v));
		if (r == sizeof(v))
			generic_handle_irq(irq);
	} while (r == sizeof(v) || r == -EINTR);
	/* a drained nonblocking eventfd must end with -EAGAIN */
	WARN(r != -EAGAIN, "read returned %d\n", r);

	return IRQ_HANDLED;
}

/*
 * Create the eventfd for a vector, wire it to a UML IRQ and register it
 * for SIGIO notification. Idempotent: a no-op if already active.
 */
static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index)
{
	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
	int err, irqfd;

	if (ctx->irq >= 0)
		return 0;

	irqfd = uml_vfio_user_activate_irq(&dev->udev, index);
	if (irqfd < 0)
		return irqfd;

	ctx->irq = um_request_irq(UM_IRQ_ALLOC, irqfd, IRQ_READ,
				  uml_vfio_interrupt, 0,
				  "vfio-uml", ctx);
	if (ctx->irq < 0) {
		err = ctx->irq;
		goto deactivate;
	}

	err = add_sigio_fd(irqfd);
	if (err)
		goto free_irq;

	return 0;

free_irq:
	um_free_irq(ctx->irq, ctx);
	ctx->irq = -1;
deactivate:
	uml_vfio_user_deactivate_irq(&dev->udev, index);
	return err;
}

/* Tear down a vector's eventfd/IRQ wiring; no-op if already inactive. */
static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index)
{
	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];

	if (ctx->irq >= 0) {
		ignore_sigio_fd(dev->udev.irqfd[index]);
		um_free_irq(ctx->irq, ctx);
		uml_vfio_user_deactivate_irq(&dev->udev, index);
		ctx->irq = -1;
	}
	return 0;
}

/*
 * Intercept guest writes to the MSI-X capability: a 16-bit write to the
 * Message Control word that flips only the Enable bit (QSIZE masked off)
 * pushes the current eventfd set to VFIO. Everything else is ignored.
 */
static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev,
				    unsigned int offset, int size,
				    unsigned long val)
{
	/*
	 * Here, we handle only the operations we care about,
	 * ignoring the rest.
	 */
	if (size == 2 && offset == dev->msix_cap + PCI_MSIX_FLAGS) {
		switch (val & ~PCI_MSIX_FLAGS_QSIZE) {
		case PCI_MSIX_FLAGS_ENABLE:
		case 0:
			return uml_vfio_user_update_irqs(&dev->udev);
		}
	}
	return 0;
}

/*
 * Intercept guest writes landing in the MSI-X table. Only aligned
 * 4-byte writes to an entry's DATA word matter: the unsigned subtraction
 * below wraps for any other offset, so the modulo check filters it out.
 * A non-zero DATA value activates the vector, zero deactivates it.
 */
static int uml_vfio_update_msix_table(struct uml_vfio_device *dev,
				      unsigned int offset, int size,
				      unsigned long val)
{
	int index;

	/*
	 * Here, we handle only the operations we care about,
	 * ignoring the rest.
	 */
	offset -= dev->msix_offset + PCI_MSIX_ENTRY_DATA;

	if (size != 4 || offset % PCI_MSIX_ENTRY_SIZE != 0)
		return 0;

	index = offset / PCI_MSIX_ENTRY_SIZE;
	if (index >= dev->udev.irq_count)
		return -EINVAL;

	dev->msix_data[index] = val;

	return val ? uml_vfio_activate_irq(dev, index) :
		uml_vfio_deactivate_irq(dev, index);
}

/*
 * Read 1/2/4(/8 on 64-bit) bytes from config space, converting from the
 * little-endian wire format. Returns all-ones (ULONG_MAX) on error,
 * mimicking a failed PCI read.
 */
static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev,
					      unsigned int offset, int size)
{
	u8 data[8];

	memset(data, 0xff, sizeof(data));

	if (uml_vfio_user_cfgspace_read(&dev->udev, offset, data, size))
		return ULONG_MAX;

	switch (size) {
	case 1:
		return data[0];
	case 2:
		return le16_to_cpup((void *)data);
	case 4:
		return le32_to_cpup((void *)data);
#ifdef CONFIG_64BIT
	case 8:
		return le64_to_cpup((void *)data);
#endif
	default:
		return ULONG_MAX;
	}
}

/* um_pci_ops hook: config-space read. */
static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev,
					    unsigned int offset, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	return __uml_vfio_cfgspace_read(dev, offset, size);
}

/* Serialize val to little-endian and write it to config space. */
static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev,
				      unsigned int offset, int size,
				      unsigned long val)
{
	u8 data[8];

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	}

	WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size));
}

/*
 * um_pci_ops hook: config-space write. Writes overlapping the MSI-X
 * capability are snooped first, then forwarded to the device.
 */
static void uml_vfio_cfgspace_write(struct um_pci_device *pdev,
				    unsigned int offset, int size,
				    unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	if (offset < dev->msix_cap + PCI_CAP_MSIX_SIZEOF &&
	    offset + size > dev->msix_cap)
		WARN_ON(uml_vfio_update_msix_cap(dev, offset, size, val));

	__uml_vfio_cfgspace_write(dev, offset, size, val);
}

/*
 * um_pci_ops hook: bulk BAR read. The buffer is pre-filled with 0xff so
 * a failed host read looks like a failed PCI read.
 */
static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar,
				   void *buffer, unsigned int offset, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	memset(buffer, 0xff, size);
	uml_vfio_user_bar_read(&dev->udev, bar, offset, buffer, size);
}

/* um_pci_ops hook: sized BAR read, little-endian like config space. */
static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar,
				       unsigned int offset, int size)
{
	u8 data[8];

	uml_vfio_bar_copy_from(pdev, bar, data, offset, size);

	switch (size) {
	case 1:
		return data[0];
	case 2:
		return le16_to_cpup((void *)data);
	case 4:
		return le32_to_cpup((void *)data);
#ifdef CONFIG_64BIT
	case 8:
		return le64_to_cpup((void *)data);
#endif
	default:
		return ULONG_MAX;
	}
}

/* um_pci_ops hook: bulk BAR write, forwarded verbatim to the host. */
static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar,
				 unsigned int offset, const void *buffer,
				 int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);

	uml_vfio_user_bar_write(&dev->udev, bar, offset, buffer, size);
}

/*
 * um_pci_ops hook: sized BAR write. Writes hitting the MSI-X table are
 * snooped (to track vector DATA words and toggle eventfds) before being
 * forwarded to the device.
 */
static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar,
			       unsigned int offset, int size,
			       unsigned long val)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	u8 data[8];

	if (bar == dev->msix_bar && offset + size > dev->msix_offset &&
	    offset < dev->msix_offset + dev->msix_size)
		WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val));

	switch (size) {
	case 1:
		data[0] = (u8)val;
		break;
	case 2:
		put_unaligned_le16(val, (void *)data);
		break;
	case 4:
		put_unaligned_le32(val, (void *)data);
		break;
#ifdef CONFIG_64BIT
	case 8:
		put_unaligned_le64(val, (void *)data);
		break;
#endif
	}

	uml_vfio_bar_copy_to(pdev, bar, offset, data, size);
}

/* um_pci_ops hook: memset-like fill, emulated with byte-wise writes. */
static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar,
			     unsigned int offset, u8 value, int size)
{
	struct uml_vfio_device *dev = to_vdev(pdev);
	int i;

	for (i = 0; i < size; i++)
		uml_vfio_user_bar_write(&dev->udev, bar, offset + i, &value, 1);
}

static const struct um_pci_ops uml_vfio_um_pci_ops = {
	.cfgspace_read = uml_vfio_cfgspace_read,
	.cfgspace_write = uml_vfio_cfgspace_write,
	.bar_read = uml_vfio_bar_read,
	.bar_write = uml_vfio_bar_write,
	.bar_copy_from = uml_vfio_bar_copy_from,
	.bar_copy_to = uml_vfio_bar_copy_to,
	.bar_set = uml_vfio_bar_set,
};

/*
 * Walk the device's capability list looking for capability id `cap`.
 * Returns its config-space offset, or 0 if not found. The ttl bound
 * guards against malformed/looping lists, mirroring the PCI core.
 */
static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap)
{
	u8 id, pos;
	u16 ent;
	int ttl = 48; /* PCI_FIND_CAP_TTL */

	pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos));

	while (pos && ttl--) {
		ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent));

		id = ent & 0xff;
		if (id == 0xff)		/* also hit on failed reads (all-ones) */
			break;
		if (id == cap)
			return pos;

		pos = ent >> 8;		/* next pointer is the high byte */
	}

	return 0;
}

/*
 * Locate the MSI-X capability and cache the table geometry (BAR, offset,
 * size) plus a zeroed shadow buffer for vector DATA words. Returns
 * -ENOTSUPP for devices without MSI-X (the only mode supported).
 *
 * NOTE(review): msix_data is indexed as u32 per vector but allocated at
 * msix_size (16 bytes per entry) — over-sized, not a correctness issue.
 */
static int uml_vfio_read_msix_table(struct uml_vfio_device *dev)
{
	unsigned int off;
	u16 flags;
	u32 tbl;

	off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX);
	if (!off)
		return -ENOTSUPP;

	dev->msix_cap = off;

	tbl = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_TABLE, sizeof(tbl));
	flags = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_FLAGS, sizeof(flags));

	dev->msix_bar = tbl & PCI_MSIX_TABLE_BIR;
	dev->msix_offset = tbl & PCI_MSIX_TABLE_OFFSET;
	dev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * PCI_MSIX_ENTRY_SIZE;

	dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL);
	if (!dev->msix_data)
		return -ENOMEM;

	return 0;
}

/*
 * Bring up one command-line-declared device: resolve its IOMMU group,
 * open/attach the group, fetch region/IRQ info from VFIO, read the
 * MSI-X geometry and register with the virt-pci core. On any failure
 * the device is unwound completely and freed — callers must not touch
 * `dev` afterwards (hence the void return).
 */
static void uml_vfio_open_device(struct uml_vfio_device *dev)
{
	struct uml_vfio_intr_ctx *ctx;
	int err, group_id, i;

	group_id = uml_vfio_user_get_group_id(dev->name);
	if (group_id < 0) {
		pr_err("Failed to get group id (%s), error %d\n",
		       dev->name, group_id);
		goto free_dev;
	}

	dev->group = uml_vfio_open_group(group_id);
	if (dev->group < 0) {
		pr_err("Failed to open group %d (%s), error %d\n",
		       group_id, dev->name, dev->group);
		goto free_dev;
	}

	err = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name);
	if (err) {
		pr_err("Failed to setup device (%s), error %d\n",
		       dev->name, err);
		goto release_group;
	}

	err = uml_vfio_read_msix_table(dev);
	if (err) {
		pr_err("Failed to read MSI-X table (%s), error %d\n",
		       dev->name, err);
		goto teardown_udev;
	}

	dev->intr_ctx = kmalloc_array(dev->udev.irq_count,
				      sizeof(struct uml_vfio_intr_ctx),
				      GFP_KERNEL);
	if (!dev->intr_ctx) {
		pr_err("Failed to allocate interrupt context (%s)\n",
		       dev->name);
		goto free_msix;
	}

	for (i = 0; i < dev->udev.irq_count; i++) {
		ctx = &dev->intr_ctx[i];
		ctx->dev = dev;
		ctx->irq = -1;	/* vector not active yet */
	}

	dev->pdev.ops = &uml_vfio_um_pci_ops;

	err = um_pci_device_register(&dev->pdev);
	if (err) {
		pr_err("Failed to register UM PCI device (%s), error %d\n",
		       dev->name, err);
		goto free_intr_ctx;
	}

	return;

free_intr_ctx:
	kfree(dev->intr_ctx);
free_msix:
	kfree(dev->msix_data);
teardown_udev:
	uml_vfio_user_teardown_device(&dev->udev);
release_group:
	uml_vfio_release_group(dev->group);
free_dev:
	list_del(&dev->list);
	kfree(dev->name);
	kfree(dev);
}

/*
 * Full teardown of a successfully opened device, in reverse order of
 * uml_vfio_open_device(); frees `dev` itself at the end.
 */
static void uml_vfio_release_device(struct uml_vfio_device *dev)
{
	int i;

	for (i = 0; i < dev->udev.irq_count; i++)
		uml_vfio_deactivate_irq(dev, i);
	uml_vfio_user_update_irqs(&dev->udev);

	um_pci_device_unregister(&dev->pdev);
	kfree(dev->intr_ctx);
	kfree(dev->msix_data);
	uml_vfio_user_teardown_device(&dev->udev);
	uml_vfio_release_group(dev->group);
	list_del(&dev->list);
	kfree(dev->name);
	kfree(dev);
}

/*
 * "vfio_uml.device=" parameter handler. Runs early (kernel command line
 * parsing), so it only records the device name and lazily opens the
 * shared container; the actual device bring-up happens in the
 * late_initcall once the system is far enough along.
 */
static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp)
{
	struct uml_vfio_device *dev;
	int fd;

	if (uml_vfio_container.fd < 0) {
		fd = uml_vfio_user_open_container();
		if (fd < 0)
			return fd;
		uml_vfio_container.fd = fd;
	}

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;

	dev->name = kstrdup(device, GFP_KERNEL);
	if (!dev->name) {
		kfree(dev);
		return -ENOMEM;
	}

	list_add_tail(&dev->list, &uml_vfio_devices);
	return 0;
}

/* The parameter is write-only; reading it back yields nothing. */
static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp)
{
	return 0;
}
603 + 604 + static const struct kernel_param_ops uml_vfio_cmdline_param_ops = { 605 + .set = uml_vfio_cmdline_set, 606 + .get = uml_vfio_cmdline_get, 607 + }; 608 + 609 + device_param_cb(device, &uml_vfio_cmdline_param_ops, NULL, 0400); 610 + __uml_help(uml_vfio_cmdline_param_ops, 611 + "vfio_uml.device=<domain:bus:slot.function>\n" 612 + " Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n" 613 + " capable devices are supported, and it is assumed that drivers will\n" 614 + " use MSI-X. This parameter can be specified multiple times to pass\n" 615 + " through multiple PCI devices to UML.\n\n" 616 + ); 617 + 618 + static int __init uml_vfio_init(void) 619 + { 620 + struct uml_vfio_device *dev, *n; 621 + 622 + sigio_broken(); 623 + 624 + /* If the opening fails, the device will be released. */ 625 + list_for_each_entry_safe(dev, n, &uml_vfio_devices, list) 626 + uml_vfio_open_device(dev); 627 + 628 + return 0; 629 + } 630 + late_initcall(uml_vfio_init); 631 + 632 + static void __exit uml_vfio_exit(void) 633 + { 634 + struct uml_vfio_device *dev, *n; 635 + 636 + list_for_each_entry_safe(dev, n, &uml_vfio_devices, list) 637 + uml_vfio_release_device(dev); 638 + 639 + if (uml_vfio_container.fd >= 0) 640 + os_close_file(uml_vfio_container.fd); 641 + } 642 + module_exit(uml_vfio_exit);
+327
arch/um/drivers/vfio_user.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2025 Ant Group
 * Author: Tiwei Bie <tiwei.btw@antgroup.com>
 *
 * Host-side (userspace) half of the UML VFIO driver: raw open/ioctl/
 * pread/pwrite access to /dev/vfio. All functions return 0 or a
 * positive fd on success and -errno-style negatives on failure, so the
 * kernel-side caller can propagate them directly.
 */
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <linux/limits.h>
#include <linux/vfio.h>
#include <linux/pci_regs.h>
#include <as-layout.h>
#include <um_malloc.h>

#include "vfio_user.h"

/*
 * Open /dev/vfio/vfio and verify the API version and Type1 IOMMU
 * support. Returns the container fd, or a negative error.
 */
int uml_vfio_user_open_container(void)
{
	int r, fd;

	fd = open("/dev/vfio/vfio", O_RDWR);
	if (fd < 0)
		return -errno;

	r = ioctl(fd, VFIO_GET_API_VERSION);
	if (r != VFIO_API_VERSION) {
		r = r < 0 ? -errno : -EINVAL;
		goto error;
	}

	r = ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
	if (r <= 0) {
		r = r < 0 ? -errno : -EINVAL;
		goto error;
	}

	return fd;

error:
	close(fd);
	return r;
}

/*
 * Select the Type1 IOMMU and map UML's usable physical memory for DMA,
 * so device DMA targeting guest-physical addresses hits UML memory.
 */
int uml_vfio_user_setup_iommu(int container)
{
	/*
	 * This is a bit tricky. See the big comment in
	 * vhost_user_set_mem_table() in virtio_uml.c.
	 */
	unsigned long reserved = uml_reserved - uml_physmem;
	struct vfio_iommu_type1_dma_map dma_map = {
		.argsz = sizeof(dma_map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = uml_reserved,
		.iova = reserved,
		.size = physmem_size - reserved,
	};

	if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) < 0)
		return -errno;

	if (ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map) < 0)
		return -errno;

	return 0;
}

/*
 * Resolve a device's IOMMU group id by reading the iommu_group symlink
 * under sysfs and parsing its basename as a decimal number.
 *
 * NOTE(review): basename() here relies on the GNU declaration coming
 * from <string.h> (no <libgen.h> include) — confirm _GNU_SOURCE is set
 * by the UML userspace build flags.
 */
int uml_vfio_user_get_group_id(const char *device)
{
	char *path, *buf, *end;
	const char *name;
	int r;

	path = uml_kmalloc(PATH_MAX, UM_GFP_KERNEL);
	if (!path)
		return -ENOMEM;

	sprintf(path, "/sys/bus/pci/devices/%s/iommu_group", device);

	buf = uml_kmalloc(PATH_MAX + 1, UM_GFP_KERNEL);
	if (!buf) {
		r = -ENOMEM;
		goto free_path;
	}

	r = readlink(path, buf, PATH_MAX);
	if (r < 0) {
		r = -errno;
		goto free_buf;
	}
	buf[r] = '\0';	/* readlink does not NUL-terminate */

	name = basename(buf);

	r = strtoul(name, &end, 10);
	if (*end != '\0' || end == name) {
		r = -EINVAL;
		goto free_buf;
	}

free_buf:
	kfree(buf);
free_path:
	kfree(path);
	return r;
}

/* Open /dev/vfio/<group_id>; returns the fd or a negative error. */
int uml_vfio_user_open_group(int group_id)
{
	char *path;
	int fd;

	path = uml_kmalloc(PATH_MAX, UM_GFP_KERNEL);
	if (!path)
		return -ENOMEM;

	sprintf(path, "/dev/vfio/%d", group_id);

	fd = open(path, O_RDWR);
	if (fd < 0) {
		fd = -errno;
		goto out;
	}

out:
	kfree(path);
	return fd;
}

/* Attach a group to a container (VFIO_GROUP_SET_CONTAINER). */
int uml_vfio_user_set_container(int container, int group)
{
	if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &container) < 0)
		return -errno;
	return 0;
}

/* Detach a group from a container (VFIO_GROUP_UNSET_CONTAINER). */
int uml_vfio_user_unset_container(int container, int group)
{
	if (ioctl(group, VFIO_GROUP_UNSET_CONTAINER, &container) < 0)
		return -errno;
	return 0;
}

/*
 * Program `count` MSI-X vectors starting at `start` with the given
 * eventfds via VFIO_DEVICE_SET_IRQS; an fd of -1 leaves that vector
 * untriggered. The variable-size ioctl argument is heap-built.
 */
static int vfio_set_irqs(int device, int start, int count, int *irqfd)
{
	struct vfio_irq_set *irq_set;
	int argsz = sizeof(*irq_set) + sizeof(*irqfd) * count;
	int err = 0;

	irq_set = uml_kmalloc(argsz, UM_GFP_KERNEL);
	if (!irq_set)
		return -ENOMEM;

	irq_set->argsz = argsz;
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = start;
	irq_set->count = count;
	memcpy(irq_set->data, irqfd, sizeof(*irqfd) * count);

	if (ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set) < 0) {
		err = -errno;
		goto out;
	}

out:
	kfree(irq_set);
	return err;
}

/*
 * Fetch the device fd from its group, cache region offsets/sizes up to
 * and including the config-space region, query the MSI-X vector count,
 * and program all vectors as unassigned (-1). Fills in *dev; undoes
 * everything on failure.
 */
int uml_vfio_user_setup_device(struct uml_vfio_user_device *dev,
			       int group, const char *device)
{
	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
	struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
	int err, i;

	dev->device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, device);
	if (dev->device < 0)
		return -errno;

	if (ioctl(dev->device, VFIO_DEVICE_GET_INFO, &device_info) < 0) {
		err = -errno;
		goto close_device;
	}

	/* Only the BARs plus the config region are interesting. */
	dev->num_regions = device_info.num_regions;
	if (dev->num_regions > VFIO_PCI_CONFIG_REGION_INDEX + 1)
		dev->num_regions = VFIO_PCI_CONFIG_REGION_INDEX + 1;

	dev->region = uml_kmalloc(sizeof(*dev->region) * dev->num_regions,
				  UM_GFP_KERNEL);
	if (!dev->region) {
		err = -ENOMEM;
		goto close_device;
	}

	for (i = 0; i < dev->num_regions; i++) {
		struct vfio_region_info region = {
			.argsz = sizeof(region),
			.index = i,
		};
		if (ioctl(dev->device, VFIO_DEVICE_GET_REGION_INFO, &region) < 0) {
			err = -errno;
			goto free_region;
		}
		dev->region[i].size = region.size;
		dev->region[i].offset = region.offset;
	}

	/* Only MSI-X is supported currently. */
	irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX;
	if (ioctl(dev->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0) {
		err = -errno;
		goto free_region;
	}

	dev->irq_count = irq_info.count;

	dev->irqfd = uml_kmalloc(sizeof(int) * dev->irq_count, UM_GFP_KERNEL);
	if (!dev->irqfd) {
		err = -ENOMEM;
		goto free_region;
	}

	/* 0xff byte-fill makes every int element -1 (no eventfd). */
	memset(dev->irqfd, -1, sizeof(int) * dev->irq_count);

	err = vfio_set_irqs(dev->device, 0, dev->irq_count, dev->irqfd);
	if (err)
		goto free_irqfd;

	return 0;

free_irqfd:
	kfree(dev->irqfd);
free_region:
	kfree(dev->region);
close_device:
	close(dev->device);
	return err;
}

/* Free everything uml_vfio_user_setup_device() allocated. */
void uml_vfio_user_teardown_device(struct uml_vfio_user_device *dev)
{
	kfree(dev->irqfd);
	kfree(dev->region);
	close(dev->device);
}

/*
 * Create a nonblocking eventfd for one vector and record it. The fd is
 * only pushed to the device by a later uml_vfio_user_update_irqs() call.
 */
int uml_vfio_user_activate_irq(struct uml_vfio_user_device *dev, int index)
{
	int irqfd;

	irqfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
	if (irqfd < 0)
		return -errno;

	dev->irqfd[index] = irqfd;
	return irqfd;
}

/* Close a vector's eventfd and mark the slot unassigned. */
void uml_vfio_user_deactivate_irq(struct uml_vfio_user_device *dev, int index)
{
	close(dev->irqfd[index]);
	dev->irqfd[index] = -1;
}

/* Re-program the full eventfd set (current irqfd[] array) into VFIO. */
int uml_vfio_user_update_irqs(struct uml_vfio_user_device *dev)
{
	return vfio_set_irqs(dev->device, 0, dev->irq_count, dev->irqfd);
}

/*
 * Bounds-checked pread from a VFIO region at its kernel-provided file
 * offset. NOTE(review): a short (partial) read is treated as success.
 */
static int vfio_region_read(struct uml_vfio_user_device *dev, unsigned int index,
			    uint64_t offset, void *buf, uint64_t size)
{
	if (index >= dev->num_regions || offset + size > dev->region[index].size)
		return -EINVAL;

	if (pread(dev->device, buf, size, dev->region[index].offset + offset) < 0)
		return -errno;

	return 0;
}

/* Bounds-checked pwrite counterpart of vfio_region_read(). */
static int vfio_region_write(struct uml_vfio_user_device *dev, unsigned int index,
			     uint64_t offset, const void *buf, uint64_t size)
{
	if (index >= dev->num_regions || offset + size > dev->region[index].size)
		return -EINVAL;

	if (pwrite(dev->device, buf, size, dev->region[index].offset + offset) < 0)
		return -errno;

	return 0;
}

/* Read from the PCI config-space region. */
int uml_vfio_user_cfgspace_read(struct uml_vfio_user_device *dev,
				unsigned int offset, void *buf, int size)
{
	return vfio_region_read(dev, VFIO_PCI_CONFIG_REGION_INDEX,
				offset, buf, size);
}

/* Write to the PCI config-space region. */
int uml_vfio_user_cfgspace_write(struct uml_vfio_user_device *dev,
				 unsigned int offset, const void *buf, int size)
{
	return vfio_region_write(dev, VFIO_PCI_CONFIG_REGION_INDEX,
				 offset, buf, size);
}

/* Read from a BAR region (bar doubles as the VFIO region index). */
int uml_vfio_user_bar_read(struct uml_vfio_user_device *dev, int bar,
			   unsigned int offset, void *buf, int size)
{
	return vfio_region_read(dev, bar, offset, buf, size);
}

/* Write to a BAR region (bar doubles as the VFIO region index). */
int uml_vfio_user_bar_write(struct uml_vfio_user_device *dev, int bar,
			    unsigned int offset, const void *buf, int size)
{
	return vfio_region_write(dev, bar, offset, buf, size);
}
+44
arch/um/drivers/vfio_user.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef __UM_VFIO_USER_H 3 + #define __UM_VFIO_USER_H 4 + 5 + struct uml_vfio_user_device { 6 + int device; 7 + 8 + struct { 9 + uint64_t size; 10 + uint64_t offset; 11 + } *region; 12 + int num_regions; 13 + 14 + int32_t *irqfd; 15 + int irq_count; 16 + }; 17 + 18 + int uml_vfio_user_open_container(void); 19 + int uml_vfio_user_setup_iommu(int container); 20 + 21 + int uml_vfio_user_get_group_id(const char *device); 22 + int uml_vfio_user_open_group(int group_id); 23 + int uml_vfio_user_set_container(int container, int group); 24 + int uml_vfio_user_unset_container(int container, int group); 25 + 26 + int uml_vfio_user_setup_device(struct uml_vfio_user_device *dev, 27 + int group, const char *device); 28 + void uml_vfio_user_teardown_device(struct uml_vfio_user_device *dev); 29 + 30 + int uml_vfio_user_activate_irq(struct uml_vfio_user_device *dev, int index); 31 + void uml_vfio_user_deactivate_irq(struct uml_vfio_user_device *dev, int index); 32 + int uml_vfio_user_update_irqs(struct uml_vfio_user_device *dev); 33 + 34 + int uml_vfio_user_cfgspace_read(struct uml_vfio_user_device *dev, 35 + unsigned int offset, void *buf, int size); 36 + int uml_vfio_user_cfgspace_write(struct uml_vfio_user_device *dev, 37 + unsigned int offset, const void *buf, int size); 38 + 39 + int uml_vfio_user_bar_read(struct uml_vfio_user_device *dev, int bar, 40 + unsigned int offset, void *buf, int size); 41 + int uml_vfio_user_bar_write(struct uml_vfio_user_device *dev, int bar, 42 + unsigned int offset, const void *buf, int size); 43 + 44 + #endif /* __UM_VFIO_USER_H */