Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/iommu: Add iommu_ops to report capabilities and allow blocking domains

Up until now PPC64 managed to avoid using iommu_ops. The VFIO driver
uses a SPAPR TCE sub-driver and all iommu_ops uses were kept in the
Type1 VFIO driver. Recent development added 2 uses of iommu_ops to the
generic VFIO which broke POWER:
- a coherency capability check;
- blocking IOMMU domain - iommu_group_dma_owner_claimed()/...

This adds a simple iommu_ops which reports support for cache coherency
and provides basic support for blocking domains. No other domain types
are implemented, so the default domain is NULL.

Since iommu_ops now controls group ownership, this takes it out of
VFIO.

This adds an IOMMU device into a pci_controller (=PHB) and registers it
in the IOMMU subsystem, iommu_ops is registered at this point. This
setup is done in postcore_initcall_sync.

This replaces iommu_group_add_device() with iommu_probe_device() as the
former misses necessary steps in connecting PCI devices to IOMMU
devices. This adds a comment about why explicit iommu_probe_device() is
still needed.

The previous discussion is here:
https://lore.kernel.org/r/20220707135552.3688927-1-aik@ozlabs.ru/
https://lore.kernel.org/r/20220701061751.1955857-1-aik@ozlabs.ru/

Fixes: e8ae0e140c05 ("vfio: Require that devices support DMA cache coherence")
Fixes: 70693f470848 ("vfio: Set DMA ownership for VFIO devices")
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Timothy Pearson <tpearson@raptorengineering.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
[mpe: Fix CONFIG_IOMMU_API=n build]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/2000135730.16998523.1678123860135.JavaMail.zimbra@raptorengineeringinc.com

Authored by Alexey Kardashevskiy and committed by Michael Ellerman
a9409044 76f35109

+218 -10
+7
arch/powerpc/include/asm/pci-bridge.h
··· 8 8 #include <linux/list.h> 9 9 #include <linux/ioport.h> 10 10 #include <linux/numa.h> 11 + #include <linux/iommu.h> 11 12 12 13 struct device_node; 13 14 ··· 45 44 #endif 46 45 47 46 void (*shutdown)(struct pci_controller *hose); 47 + 48 + struct iommu_group *(*device_group)(struct pci_controller *hose, 49 + struct pci_dev *pdev); 48 50 }; 49 51 50 52 /* ··· 135 131 struct irq_domain *dev_domain; 136 132 struct irq_domain *msi_domain; 137 133 struct fwnode_handle *fwnode; 134 + 135 + /* iommu_ops support */ 136 + struct iommu_device iommu; 138 137 }; 139 138 140 139 /* These are used for config access before all the PCI probing
+146 -2
arch/powerpc/kernel/iommu.c
··· 35 35 #include <asm/vio.h> 36 36 #include <asm/tce.h> 37 37 #include <asm/mmu_context.h> 38 + #include <asm/ppc-pci.h> 38 39 39 40 #define DBG(...) 40 41 ··· 1157 1156 1158 1157 pr_debug("%s: Adding %s to iommu group %d\n", 1159 1158 __func__, dev_name(dev), iommu_group_id(table_group->group)); 1160 - 1161 - return iommu_group_add_device(table_group->group, dev); 1159 + /* 1160 + * This is still not adding devices via the IOMMU bus notifier because 1161 + * of pcibios_init() from arch/powerpc/kernel/pci_64.c which calls 1162 + * pcibios_scan_phb() first (and this guy adds devices and triggers 1163 + * the notifier) and only then it calls pci_bus_add_devices() which 1164 + * configures DMA for buses which also creates PEs and IOMMU groups. 1165 + */ 1166 + return iommu_probe_device(dev); 1162 1167 } 1163 1168 EXPORT_SYMBOL_GPL(iommu_add_device); 1164 1169 ··· 1244 1237 rc = iommu_take_ownership(tbl); 1245 1238 if (!rc) 1246 1239 continue; 1240 + 1247 1241 for (j = 0; j < i; ++j) 1248 1242 iommu_release_ownership(table_group->tables[j]); 1249 1243 return rc; ··· 1276 1268 .take_ownership = spapr_tce_take_ownership, 1277 1269 .release_ownership = spapr_tce_release_ownership, 1278 1270 }; 1271 + 1272 + /* 1273 + * A simple iommu_ops to allow less cruft in generic VFIO code. 
1274 + */ 1275 + static int spapr_tce_blocking_iommu_attach_dev(struct iommu_domain *dom, 1276 + struct device *dev) 1277 + { 1278 + struct iommu_group *grp = iommu_group_get(dev); 1279 + struct iommu_table_group *table_group; 1280 + int ret = -EINVAL; 1281 + 1282 + if (!grp) 1283 + return -ENODEV; 1284 + 1285 + table_group = iommu_group_get_iommudata(grp); 1286 + ret = table_group->ops->take_ownership(table_group); 1287 + iommu_group_put(grp); 1288 + 1289 + return ret; 1290 + } 1291 + 1292 + static void spapr_tce_blocking_iommu_set_platform_dma(struct device *dev) 1293 + { 1294 + struct iommu_group *grp = iommu_group_get(dev); 1295 + struct iommu_table_group *table_group; 1296 + 1297 + table_group = iommu_group_get_iommudata(grp); 1298 + table_group->ops->release_ownership(table_group); 1299 + } 1300 + 1301 + static const struct iommu_domain_ops spapr_tce_blocking_domain_ops = { 1302 + .attach_dev = spapr_tce_blocking_iommu_attach_dev, 1303 + }; 1304 + 1305 + static bool spapr_tce_iommu_capable(struct device *dev, enum iommu_cap cap) 1306 + { 1307 + switch (cap) { 1308 + case IOMMU_CAP_CACHE_COHERENCY: 1309 + return true; 1310 + default: 1311 + break; 1312 + } 1313 + 1314 + return false; 1315 + } 1316 + 1317 + static struct iommu_domain *spapr_tce_iommu_domain_alloc(unsigned int type) 1318 + { 1319 + struct iommu_domain *dom; 1320 + 1321 + if (type != IOMMU_DOMAIN_BLOCKED) 1322 + return NULL; 1323 + 1324 + dom = kzalloc(sizeof(*dom), GFP_KERNEL); 1325 + if (!dom) 1326 + return NULL; 1327 + 1328 + dom->ops = &spapr_tce_blocking_domain_ops; 1329 + 1330 + return dom; 1331 + } 1332 + 1333 + static struct iommu_device *spapr_tce_iommu_probe_device(struct device *dev) 1334 + { 1335 + struct pci_dev *pdev; 1336 + struct pci_controller *hose; 1337 + 1338 + if (!dev_is_pci(dev)) 1339 + return ERR_PTR(-EPERM); 1340 + 1341 + pdev = to_pci_dev(dev); 1342 + hose = pdev->bus->sysdata; 1343 + 1344 + return &hose->iommu; 1345 + } 1346 + 1347 + static void 
spapr_tce_iommu_release_device(struct device *dev) 1348 + { 1349 + } 1350 + 1351 + static struct iommu_group *spapr_tce_iommu_device_group(struct device *dev) 1352 + { 1353 + struct pci_controller *hose; 1354 + struct pci_dev *pdev; 1355 + 1356 + pdev = to_pci_dev(dev); 1357 + hose = pdev->bus->sysdata; 1358 + 1359 + if (!hose->controller_ops.device_group) 1360 + return ERR_PTR(-ENOENT); 1361 + 1362 + return hose->controller_ops.device_group(hose, pdev); 1363 + } 1364 + 1365 + static const struct iommu_ops spapr_tce_iommu_ops = { 1366 + .capable = spapr_tce_iommu_capable, 1367 + .domain_alloc = spapr_tce_iommu_domain_alloc, 1368 + .probe_device = spapr_tce_iommu_probe_device, 1369 + .release_device = spapr_tce_iommu_release_device, 1370 + .device_group = spapr_tce_iommu_device_group, 1371 + .set_platform_dma_ops = spapr_tce_blocking_iommu_set_platform_dma, 1372 + }; 1373 + 1374 + static struct attribute *spapr_tce_iommu_attrs[] = { 1375 + NULL, 1376 + }; 1377 + 1378 + static struct attribute_group spapr_tce_iommu_group = { 1379 + .name = "spapr-tce-iommu", 1380 + .attrs = spapr_tce_iommu_attrs, 1381 + }; 1382 + 1383 + static const struct attribute_group *spapr_tce_iommu_groups[] = { 1384 + &spapr_tce_iommu_group, 1385 + NULL, 1386 + }; 1387 + 1388 + /* 1389 + * This registers IOMMU devices of PHBs. This needs to happen 1390 + * after core_initcall(iommu_init) + postcore_initcall(pci_driver_init) and 1391 + * before subsys_initcall(iommu_subsys_init). 
1392 + */ 1393 + static int __init spapr_tce_setup_phb_iommus_initcall(void) 1394 + { 1395 + struct pci_controller *hose; 1396 + 1397 + list_for_each_entry(hose, &hose_list, list_node) { 1398 + iommu_device_sysfs_add(&hose->iommu, hose->parent, 1399 + spapr_tce_iommu_groups, "iommu-phb%04x", 1400 + hose->global_number); 1401 + iommu_device_register(&hose->iommu, &spapr_tce_iommu_ops, 1402 + hose->parent); 1403 + } 1404 + return 0; 1405 + } 1406 + postcore_initcall_sync(spapr_tce_setup_phb_iommus_initcall); 1279 1407 1280 1408 #endif /* CONFIG_IOMMU_API */
+34
arch/powerpc/platforms/powernv/pci-ioda.c
··· 1899 1899 /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */ 1900 1900 struct iommu_table *tbl = pe->table_group.tables[0]; 1901 1901 1902 + /* 1903 + * iommu_ops transfers the ownership per a device and we mode 1904 + * the group ownership with the first device in the group. 1905 + */ 1906 + if (!tbl) 1907 + return 0; 1908 + 1902 1909 pnv_pci_ioda2_set_bypass(pe, false); 1903 1910 pnv_pci_ioda2_unset_window(&pe->table_group, 0); 1904 1911 if (pe->pbus) ··· 1922 1915 struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, 1923 1916 table_group); 1924 1917 1918 + /* See the comment about iommu_ops above */ 1919 + if (pe->table_group.tables[0]) 1920 + return; 1925 1921 pnv_pci_ioda2_setup_default_config(pe); 1926 1922 if (pe->pbus) 1927 1923 pnv_ioda_setup_bus_dma(pe, pe->pbus); ··· 2931 2921 } 2932 2922 } 2933 2923 2924 + #ifdef CONFIG_IOMMU_API 2925 + static struct iommu_group *pnv_pci_device_group(struct pci_controller *hose, 2926 + struct pci_dev *pdev) 2927 + { 2928 + struct pnv_phb *phb = hose->private_data; 2929 + struct pnv_ioda_pe *pe; 2930 + 2931 + if (WARN_ON(!phb)) 2932 + return ERR_PTR(-ENODEV); 2933 + 2934 + pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8)); 2935 + if (!pe) 2936 + return ERR_PTR(-ENODEV); 2937 + 2938 + if (!pe->table_group.group) 2939 + return ERR_PTR(-ENODEV); 2940 + 2941 + return iommu_group_ref_get(pe->table_group.group); 2942 + } 2943 + #endif 2944 + 2934 2945 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { 2935 2946 .dma_dev_setup = pnv_pci_ioda_dma_dev_setup, 2936 2947 .dma_bus_setup = pnv_pci_ioda_dma_bus_setup, ··· 2962 2931 .setup_bridge = pnv_pci_fixup_bridge_resources, 2963 2932 .reset_secondary_bus = pnv_pci_reset_secondary_bus, 2964 2933 .shutdown = pnv_pci_ioda_shutdown, 2934 + #ifdef CONFIG_IOMMU_API 2935 + .device_group = pnv_pci_device_group, 2936 + #endif 2965 2937 }; 2966 2938 2967 2939 static const struct pci_controller_ops 
pnv_npu_ocapi_ioda_controller_ops = {
+24
arch/powerpc/platforms/pseries/iommu.c
··· 1729 1729 return 0; 1730 1730 } 1731 1731 machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init); 1732 + 1733 + #ifdef CONFIG_SPAPR_TCE_IOMMU 1734 + struct iommu_group *pSeries_pci_device_group(struct pci_controller *hose, 1735 + struct pci_dev *pdev) 1736 + { 1737 + struct device_node *pdn, *dn = pdev->dev.of_node; 1738 + struct iommu_group *grp; 1739 + struct pci_dn *pci; 1740 + 1741 + pdn = pci_dma_find(dn, NULL); 1742 + if (!pdn || !PCI_DN(pdn)) 1743 + return ERR_PTR(-ENODEV); 1744 + 1745 + pci = PCI_DN(pdn); 1746 + if (!pci->table_group) 1747 + return ERR_PTR(-ENODEV); 1748 + 1749 + grp = pci->table_group->group; 1750 + if (!grp) 1751 + return ERR_PTR(-ENODEV); 1752 + 1753 + return iommu_group_ref_get(grp); 1754 + } 1755 + #endif
+4
arch/powerpc/platforms/pseries/pseries.h
··· 123 123 #endif 124 124 125 125 void pseries_rng_init(void); 126 + #ifdef CONFIG_SPAPR_TCE_IOMMU 127 + struct iommu_group *pSeries_pci_device_group(struct pci_controller *hose, 128 + struct pci_dev *pdev); 129 + #endif 126 130 127 131 #endif /* _PSERIES_PSERIES_H */
+3
arch/powerpc/platforms/pseries/setup.c
··· 1118 1118 1119 1119 struct pci_controller_ops pseries_pci_controller_ops = { 1120 1120 .probe_mode = pSeries_pci_probe_mode, 1121 + #ifdef CONFIG_SPAPR_TCE_IOMMU 1122 + .device_group = pSeries_pci_device_group, 1123 + #endif 1121 1124 }; 1122 1125 1123 1126 define_machine(pseries) {
-8
drivers/vfio/vfio_iommu_spapr_tce.c
··· 1200 1200 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) 1201 1201 if (container->tables[i]) 1202 1202 table_group->ops->unset_window(table_group, i); 1203 - 1204 - table_group->ops->release_ownership(table_group); 1205 1203 } 1206 1204 1207 1205 static long tce_iommu_take_ownership(struct tce_container *container, 1208 1206 struct iommu_table_group *table_group) 1209 1207 { 1210 1208 long i, ret = 0; 1211 - 1212 - ret = table_group->ops->take_ownership(table_group); 1213 - if (ret) 1214 - return ret; 1215 1209 1216 1210 /* Set all windows to the new group */ 1217 1211 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { ··· 1224 1230 release_exit: 1225 1231 for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) 1226 1232 table_group->ops->unset_window(table_group, i); 1227 - 1228 - table_group->ops->release_ownership(table_group); 1229 1233 1230 1234 return ret; 1231 1235 }