Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

vfio: Include No-IOMMU mode

There is really no way to safely give a user full access to a DMA
capable device without an IOMMU to protect the host system. There is
also no way to provide DMA translation, for use cases such as device
assignment to virtual machines. However, there are still those users
that want userspace drivers even under those conditions. The UIO
driver exists for this use case, but does not provide the degree of
device access and programming that VFIO has. In an effort to avoid
code duplication, this introduces a No-IOMMU mode for VFIO.

This mode requires building VFIO with CONFIG_VFIO_NOIOMMU and enabling
the "enable_unsafe_noiommu_mode" option on the vfio driver. This
should make it very clear that this mode is not safe. Additionally,
CAP_SYS_RAWIO privileges are necessary to work with groups and
containers using this mode. Groups making use of this support are
named /dev/vfio/noiommu-$GROUP and can only make use of the special
VFIO_NOIOMMU_IOMMU for the container. Use of this mode, specifically
binding a device without a native IOMMU group to a VFIO bus driver,
will taint the kernel and should therefore not be considered
supported. This patch includes no-iommu support for the vfio-pci bus
driver only.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>

+210 -7
+15
drivers/vfio/Kconfig
··· 31 31 32 32 If you don't know what to do here, say N. 33 33 34 + menuconfig VFIO_NOIOMMU 35 + bool "VFIO No-IOMMU support" 36 + depends on VFIO 37 + help 38 + VFIO is built on the ability to isolate devices using the IOMMU. 39 + Only with an IOMMU can userspace access to DMA capable devices be 40 + considered secure. VFIO No-IOMMU mode enables IOMMU groups for 41 + devices without IOMMU backing for the purpose of re-using the VFIO 42 + infrastructure in a non-secure mode. Use of this mode will result 43 + in an unsupportable kernel and will therefore taint the kernel. 44 + Device assignment to virtual machines is also not possible with 45 + this mode since there is no IOMMU to provide DMA translation. 46 + 47 + If you don't know what to do here, say N. 48 + 34 49 source "drivers/vfio/pci/Kconfig" 35 50 source "drivers/vfio/platform/Kconfig" 36 51 source "virt/lib/Kconfig"
+4 -4
drivers/vfio/pci/vfio_pci.c
··· 940 940 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) 941 941 return -EINVAL; 942 942 943 - group = iommu_group_get(&pdev->dev); 943 + group = vfio_iommu_group_get(&pdev->dev); 944 944 if (!group) 945 945 return -EINVAL; 946 946 947 947 vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); 948 948 if (!vdev) { 949 - iommu_group_put(group); 949 + vfio_iommu_group_put(group, &pdev->dev); 950 950 return -ENOMEM; 951 951 } 952 952 ··· 957 957 958 958 ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); 959 959 if (ret) { 960 - iommu_group_put(group); 960 + vfio_iommu_group_put(group, &pdev->dev); 961 961 kfree(vdev); 962 962 return ret; 963 963 } ··· 993 993 if (!vdev) 994 994 return; 995 995 996 - iommu_group_put(pdev->dev.iommu_group); 996 + vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); 997 997 kfree(vdev); 998 998 999 999 if (vfio_pci_is_vga(pdev)) {
+181 -3
drivers/vfio/vfio.c
··· 62 62 struct rw_semaphore group_lock; 63 63 struct vfio_iommu_driver *iommu_driver; 64 64 void *iommu_data; 65 + bool noiommu; 65 66 }; 66 67 67 68 struct vfio_unbound_dev { ··· 85 84 struct list_head unbound_list; 86 85 struct mutex unbound_lock; 87 86 atomic_t opened; 87 + bool noiommu; 88 88 }; 89 89 90 90 struct vfio_device { ··· 96 94 struct list_head group_next; 97 95 void *device_data; 98 96 }; 97 + 98 + #ifdef CONFIG_VFIO_NOIOMMU 99 + static bool noiommu __read_mostly; 100 + module_param_named(enable_unsafe_noiommu_mode, 101 + noiommu, bool, S_IRUGO | S_IWUSR); 102 + MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); 103 + #endif 104 + 105 + /* 106 + * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe 107 + * and remove functions, any use cases other than acquiring the first 108 + * reference for the purpose of calling vfio_add_group_dev() or removing 109 + * that symmetric reference after vfio_del_group_dev() should use the raw 110 + * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put() 111 + * removes the device from the dummy group and cannot be nested. 112 + */ 113 + struct iommu_group *vfio_iommu_group_get(struct device *dev) 114 + { 115 + struct iommu_group *group; 116 + int __maybe_unused ret; 117 + 118 + group = iommu_group_get(dev); 119 + 120 + #ifdef CONFIG_VFIO_NOIOMMU 121 + /* 122 + * With noiommu enabled, an IOMMU group will be created for a device 123 + * that doesn't already have one and doesn't have an iommu_ops on their 124 + * bus. We use iommu_present() again in the main code to detect these 125 + * fake groups. 
126 + */ 127 + if (group || !noiommu || iommu_present(dev->bus)) 128 + return group; 129 + 130 + group = iommu_group_alloc(); 131 + if (IS_ERR(group)) 132 + return NULL; 133 + 134 + iommu_group_set_name(group, "vfio-noiommu"); 135 + ret = iommu_group_add_device(group, dev); 136 + iommu_group_put(group); 137 + if (ret) 138 + return NULL; 139 + 140 + /* 141 + * Where to taint? At this point we've added an IOMMU group for a 142 + * device that is not backed by iommu_ops, therefore any iommu_ 143 + * callback using iommu_ops can legitimately Oops. So, while we may 144 + * be about to give a DMA capable device to a user without IOMMU 145 + * protection, which is clearly taint-worthy, let's go ahead and do 146 + * it here. 147 + */ 148 + add_taint(TAINT_USER, LOCKDEP_STILL_OK); 149 + dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); 150 + #endif 151 + 152 + return group; 153 + } 154 + EXPORT_SYMBOL_GPL(vfio_iommu_group_get); 155 + 156 + void vfio_iommu_group_put(struct iommu_group *group, struct device *dev) 157 + { 158 + #ifdef CONFIG_VFIO_NOIOMMU 159 + if (!iommu_present(dev->bus)) 160 + iommu_group_remove_device(dev); 161 + #endif 162 + 163 + iommu_group_put(group); 164 + } 165 + EXPORT_SYMBOL_GPL(vfio_iommu_group_put); 166 + 167 + #ifdef CONFIG_VFIO_NOIOMMU 168 + static void *vfio_noiommu_open(unsigned long arg) 169 + { 170 + if (arg != VFIO_NOIOMMU_IOMMU) 171 + return ERR_PTR(-EINVAL); 172 + if (!capable(CAP_SYS_RAWIO)) 173 + return ERR_PTR(-EPERM); 174 + 175 + return NULL; 176 + } 177 + 178 + static void vfio_noiommu_release(void *iommu_data) 179 + { 180 + } 181 + 182 + static long vfio_noiommu_ioctl(void *iommu_data, 183 + unsigned int cmd, unsigned long arg) 184 + { 185 + if (cmd == VFIO_CHECK_EXTENSION) 186 + return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0; 187 + 188 + return -ENOTTY; 189 + } 190 + 191 + static int vfio_iommu_present(struct device *dev, void *unused) 192 + { 193 + return iommu_present(dev->bus) ? 
1 : 0; 194 + } 195 + 196 + static int vfio_noiommu_attach_group(void *iommu_data, 197 + struct iommu_group *iommu_group) 198 + { 199 + return iommu_group_for_each_dev(iommu_group, NULL, 200 + vfio_iommu_present) ? -EINVAL : 0; 201 + } 202 + 203 + static void vfio_noiommu_detach_group(void *iommu_data, 204 + struct iommu_group *iommu_group) 205 + { 206 + } 207 + 208 + static const struct vfio_iommu_driver_ops vfio_noiommu_ops = { 209 + .name = "vfio-noiommu", 210 + .owner = THIS_MODULE, 211 + .open = vfio_noiommu_open, 212 + .release = vfio_noiommu_release, 213 + .ioctl = vfio_noiommu_ioctl, 214 + .attach_group = vfio_noiommu_attach_group, 215 + .detach_group = vfio_noiommu_detach_group, 216 + }; 217 + #endif 218 + 99 219 100 220 /** 101 221 * IOMMU driver registration ··· 323 199 /** 324 200 * Group objects - create, release, get, put, search 325 201 */ 326 - static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) 202 + static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, 203 + bool iommu_present) 327 204 { 328 205 struct vfio_group *group, *tmp; 329 206 struct device *dev; ··· 342 217 atomic_set(&group->container_users, 0); 343 218 atomic_set(&group->opened, 0); 344 219 group->iommu_group = iommu_group; 220 + group->noiommu = !iommu_present; 345 221 346 222 group->nb.notifier_call = vfio_iommu_group_notifier; 347 223 ··· 378 252 379 253 dev = device_create(vfio.class, NULL, 380 254 MKDEV(MAJOR(vfio.group_devt), minor), 381 - group, "%d", iommu_group_id(iommu_group)); 255 + group, "%s%d", group->noiommu ? 
"noiommu-" : "", 256 + iommu_group_id(iommu_group)); 382 257 if (IS_ERR(dev)) { 383 258 vfio_free_group_minor(minor); 384 259 vfio_group_unlock_and_free(group); ··· 767 640 768 641 group = vfio_group_get_from_iommu(iommu_group); 769 642 if (!group) { 770 - group = vfio_create_group(iommu_group); 643 + group = vfio_create_group(iommu_group, iommu_present(dev->bus)); 771 644 if (IS_ERR(group)) { 772 645 iommu_group_put(iommu_group); 773 646 return PTR_ERR(group); ··· 981 854 mutex_lock(&vfio.iommu_drivers_lock); 982 855 list_for_each_entry(driver, &vfio.iommu_drivers_list, 983 856 vfio_next) { 857 + 858 + #ifdef CONFIG_VFIO_NOIOMMU 859 + if (!list_empty(&container->group_list) && 860 + (container->noiommu != 861 + (driver->ops == &vfio_noiommu_ops))) 862 + continue; 863 + #endif 864 + 984 865 if (!try_module_get(driver->ops->owner)) 985 866 continue; 986 867 ··· 1059 924 mutex_lock(&vfio.iommu_drivers_lock); 1060 925 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { 1061 926 void *data; 927 + 928 + #ifdef CONFIG_VFIO_NOIOMMU 929 + /* 930 + * Only noiommu containers can use vfio-noiommu and noiommu 931 + * containers can only use vfio-noiommu. 
932 + */ 933 + if (container->noiommu != (driver->ops == &vfio_noiommu_ops)) 934 + continue; 935 + #endif 1062 936 1063 937 if (!try_module_get(driver->ops->owner)) 1064 938 continue; ··· 1331 1187 if (atomic_read(&group->container_users)) 1332 1188 return -EINVAL; 1333 1189 1190 + if (group->noiommu && !capable(CAP_SYS_RAWIO)) 1191 + return -EPERM; 1192 + 1334 1193 f = fdget(container_fd); 1335 1194 if (!f.file) 1336 1195 return -EBADF; ··· 1349 1202 1350 1203 down_write(&container->group_lock); 1351 1204 1205 + /* Real groups and fake groups cannot mix */ 1206 + if (!list_empty(&container->group_list) && 1207 + container->noiommu != group->noiommu) { 1208 + ret = -EPERM; 1209 + goto unlock_out; 1210 + } 1211 + 1352 1212 driver = container->iommu_driver; 1353 1213 if (driver) { 1354 1214 ret = driver->ops->attach_group(container->iommu_data, ··· 1365 1211 } 1366 1212 1367 1213 group->container = container; 1214 + container->noiommu = group->noiommu; 1368 1215 list_add(&group->container_next, &container->group_list); 1369 1216 1370 1217 /* Get a reference on the container and mark a user within the group */ ··· 1395 1240 if (0 == atomic_read(&group->container_users) || 1396 1241 !group->container->iommu_driver || !vfio_group_viable(group)) 1397 1242 return -EINVAL; 1243 + 1244 + if (group->noiommu && !capable(CAP_SYS_RAWIO)) 1245 + return -EPERM; 1398 1246 1399 1247 device = vfio_device_get_from_name(group, buf); 1400 1248 if (!device) ··· 1440 1282 atomic_inc(&group->container_users); 1441 1283 1442 1284 fd_install(ret, filep); 1285 + 1286 + if (group->noiommu) 1287 + dev_warn(device->dev, "vfio-noiommu device opened by user " 1288 + "(%s:%d)\n", current->comm, task_pid_nr(current)); 1443 1289 1444 1290 return ret; 1445 1291 } ··· 1532 1370 group = vfio_group_get_from_minor(iminor(inode)); 1533 1371 if (!group) 1534 1372 return -ENODEV; 1373 + 1374 + if (group->noiommu && !capable(CAP_SYS_RAWIO)) { 1375 + vfio_group_put(group); 1376 + return -EPERM; 1377 + } 1535 
1378 1536 1379 /* Do we need multiple instances of the group open? Seems not. */ 1537 1380 opened = atomic_cmpxchg(&group->opened, 0, 1); ··· 1700 1533 if (!atomic_inc_not_zero(&group->container_users)) 1701 1534 return ERR_PTR(-EINVAL); 1702 1535 1536 + if (group->noiommu) { 1537 + atomic_dec(&group->container_users); 1538 + return ERR_PTR(-EPERM); 1539 + } 1540 + 1703 1541 if (!group->container->iommu_driver || 1704 1542 !vfio_group_viable(group)) { 1705 1543 atomic_dec(&group->container_users); ··· 1797 1625 request_module_nowait("vfio_iommu_type1"); 1798 1626 request_module_nowait("vfio_iommu_spapr_tce"); 1799 1627 1628 + #ifdef CONFIG_VFIO_NOIOMMU 1629 + vfio_register_iommu_driver(&vfio_noiommu_ops); 1630 + #endif 1800 1631 return 0; 1801 1632 1802 1633 err_cdev_add: ··· 1816 1641 { 1817 1642 WARN_ON(!list_empty(&vfio.group_list)); 1818 1643 1644 + #ifdef CONFIG_VFIO_NOIOMMU 1645 + vfio_unregister_iommu_driver(&vfio_noiommu_ops); 1646 + #endif 1819 1647 idr_destroy(&vfio.group_idr); 1820 1648 cdev_del(&vfio.group_cdev); 1821 1649 unregister_chrdev_region(vfio.group_devt, MINORMASK);
+3
include/linux/vfio.h
··· 44 44 void (*request)(void *device_data, unsigned int count); 45 45 }; 46 46 47 + extern struct iommu_group *vfio_iommu_group_get(struct device *dev); 48 + extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); 49 + 47 50 extern int vfio_add_group_dev(struct device *dev, 48 51 const struct vfio_device_ops *ops, 49 52 void *device_data);
+7
include/uapi/linux/vfio.h
··· 39 39 #define VFIO_SPAPR_TCE_v2_IOMMU 7 40 40 41 41 /* 42 + * The No-IOMMU IOMMU offers no translation or isolation for devices and 43 + * supports no ioctls outside of VFIO_CHECK_EXTENSION. Use of VFIO's No-IOMMU 44 + * code will taint the host kernel and should be used with extreme caution. 45 + */ 46 + #define VFIO_NOIOMMU_IOMMU 8 47 + 48 + /* 42 49 * The IOCTL interface is designed for extensibility by embedding the 43 50 * structure length (argsz) and flags into structures passed between 44 51 * kernel and userspace. We therefore use the _IO() macro for these