Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc/iommu/powernv: Release replaced TCE

At the moment writing new TCE value to the IOMMU table fails with EBUSY
if there is a valid entry already. However PAPR specification allows
the guest to write new TCE value without clearing it first.

Another problem this patch is addressing is the use of pool locks for
external IOMMU users such as VFIO. The pool locks are to protect
DMA page allocator rather than entries and since the host kernel does
not control what pages are in use, there is no point in pool locks and
exchange()+put_page(oldtce) is sufficient to avoid possible races.

This adds an exchange() callback to iommu_table_ops which does the same
thing as set() plus it returns replaced TCE and DMA direction so
the caller can release the pages afterwards. The exchange() receives
a physical address unlike set() which receives linear mapping address;
and returns a physical address as the clear() does.

This implements exchange() for P5IOC2/IODA/IODA2. This adds a requirement
for a platform to have exchange() implemented in order to support VFIO.

This replaces iommu_tce_build() and iommu_clear_tce() with
a single iommu_tce_xchg().

This makes sure that TCE permission bits are not set in TCE passed to
IOMMU API as those are to be calculated by platform code from
DMA direction.

This moves SetPageDirty() to the IOMMU code to make it work for both
VFIO ioctl interface and in-kernel TCE acceleration (when it becomes
available later).

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

authored by

Alexey Kardashevskiy and committed by
Michael Ellerman
05c6cfb9 c5bb44ed

+132 -69
+18 -4
arch/powerpc/include/asm/iommu.h
··· 45 45 extern int iommu_force_on; 46 46 47 47 struct iommu_table_ops { 48 + /* 49 + * When called with direction==DMA_NONE, it is equal to clear(). 50 + * uaddr is a linear map address. 51 + */ 48 52 int (*set)(struct iommu_table *tbl, 49 53 long index, long npages, 50 54 unsigned long uaddr, 51 55 enum dma_data_direction direction, 52 56 struct dma_attrs *attrs); 57 + #ifdef CONFIG_IOMMU_API 58 + /* 59 + * Exchanges existing TCE with new TCE plus direction bits; 60 + * returns old TCE and DMA direction mask. 61 + * @tce is a physical address. 62 + */ 63 + int (*exchange)(struct iommu_table *tbl, 64 + long index, 65 + unsigned long *hpa, 66 + enum dma_data_direction *direction); 67 + #endif 53 68 void (*clear)(struct iommu_table *tbl, 54 69 long index, long npages); 70 + /* get() returns a physical address */ 55 71 unsigned long (*get)(struct iommu_table *tbl, long index); 56 72 void (*flush)(struct iommu_table *tbl); 57 73 }; ··· 169 153 extern int iommu_add_device(struct device *dev); 170 154 extern void iommu_del_device(struct device *dev); 171 155 extern int __init tce_iommu_bus_notifier_init(void); 156 + extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, 157 + unsigned long *hpa, enum dma_data_direction *direction); 172 158 #else 173 159 static inline void iommu_register_group(struct iommu_table_group *table_group, 174 160 int pci_domain_number, ··· 243 225 unsigned long npages); 244 226 extern int iommu_tce_put_param_check(struct iommu_table *tbl, 245 227 unsigned long ioba, unsigned long tce); 246 - extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, 247 - unsigned long hwaddr, enum dma_data_direction direction); 248 - extern unsigned long iommu_clear_tce(struct iommu_table *tbl, 249 - unsigned long entry); 250 228 251 229 extern void iommu_flush_tce(struct iommu_table *tbl); 252 230 extern int iommu_take_ownership(struct iommu_table *tbl);
+19 -40
arch/powerpc/kernel/iommu.c
··· 965 965 int iommu_tce_put_param_check(struct iommu_table *tbl, 966 966 unsigned long ioba, unsigned long tce) 967 967 { 968 - if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ))) 969 - return -EINVAL; 970 - 971 - if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ)) 968 + if (tce & ~IOMMU_PAGE_MASK(tbl)) 972 969 return -EINVAL; 973 970 974 971 if (ioba & ~IOMMU_PAGE_MASK(tbl)) ··· 982 985 } 983 986 EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); 984 987 985 - unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) 988 + long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, 989 + unsigned long *hpa, enum dma_data_direction *direction) 986 990 { 987 - unsigned long oldtce; 988 - struct iommu_pool *pool = get_pool(tbl, entry); 991 + long ret; 989 992 990 - spin_lock(&(pool->lock)); 993 + ret = tbl->it_ops->exchange(tbl, entry, hpa, direction); 991 994 992 - oldtce = tbl->it_ops->get(tbl, entry); 993 - if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) 994 - tbl->it_ops->clear(tbl, entry, 1); 995 - else 996 - oldtce = 0; 997 - 998 - spin_unlock(&(pool->lock)); 999 - 1000 - return oldtce; 1001 - } 1002 - EXPORT_SYMBOL_GPL(iommu_clear_tce); 1003 - 1004 - /* 1005 - * hwaddr is a kernel virtual address here (0xc... bazillion), 1006 - * tce_build converts it to a physical address. 
1007 - */ 1008 - int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, 1009 - unsigned long hwaddr, enum dma_data_direction direction) 1010 - { 1011 - int ret = -EBUSY; 1012 - unsigned long oldtce; 1013 - struct iommu_pool *pool = get_pool(tbl, entry); 1014 - 1015 - spin_lock(&(pool->lock)); 1016 - 1017 - oldtce = tbl->it_ops->get(tbl, entry); 1018 - /* Add new entry if it is not busy */ 1019 - if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))) 1020 - ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL); 1021 - 1022 - spin_unlock(&(pool->lock)); 995 + if (!ret && ((*direction == DMA_FROM_DEVICE) || 996 + (*direction == DMA_BIDIRECTIONAL))) 997 + SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT)); 1023 998 1024 999 /* if (unlikely(ret)) 1025 1000 pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n", ··· 1000 1031 1001 1032 return ret; 1002 1033 } 1003 - EXPORT_SYMBOL_GPL(iommu_tce_build); 1034 + EXPORT_SYMBOL_GPL(iommu_tce_xchg); 1004 1035 1005 1036 int iommu_take_ownership(struct iommu_table *tbl) 1006 1037 { 1007 1038 unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; 1008 1039 int ret = 0; 1040 + 1041 + /* 1042 + * VFIO does not control TCE entries allocation and the guest 1043 + * can write new TCEs on top of existing ones so iommu_tce_build() 1044 + * must be able to release old pages. This functionality 1045 + * requires exchange() callback defined so if it is not 1046 + * implemented, we disallow taking ownership over the table. 1047 + */ 1048 + if (!tbl->it_ops->exchange) 1049 + return -EINVAL; 1009 1050 1010 1051 spin_lock_irqsave(&tbl->large_pool.lock, flags); 1011 1052 for (i = 0; i < tbl->nr_pools; i++)
+34
arch/powerpc/platforms/powernv/pci-ioda.c
··· 1738 1738 return ret; 1739 1739 } 1740 1740 1741 + #ifdef CONFIG_IOMMU_API 1742 + static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, 1743 + unsigned long *hpa, enum dma_data_direction *direction) 1744 + { 1745 + long ret = pnv_tce_xchg(tbl, index, hpa, direction); 1746 + 1747 + if (!ret && (tbl->it_type & 1748 + (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE))) 1749 + pnv_pci_ioda1_tce_invalidate(tbl, index, 1, false); 1750 + 1751 + return ret; 1752 + } 1753 + #endif 1754 + 1741 1755 static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, 1742 1756 long npages) 1743 1757 { ··· 1763 1749 1764 1750 static struct iommu_table_ops pnv_ioda1_iommu_ops = { 1765 1751 .set = pnv_ioda1_tce_build, 1752 + #ifdef CONFIG_IOMMU_API 1753 + .exchange = pnv_ioda1_tce_xchg, 1754 + #endif 1766 1755 .clear = pnv_ioda1_tce_free, 1767 1756 .get = pnv_tce_get, 1768 1757 }; ··· 1841 1824 return ret; 1842 1825 } 1843 1826 1827 + #ifdef CONFIG_IOMMU_API 1828 + static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, 1829 + unsigned long *hpa, enum dma_data_direction *direction) 1830 + { 1831 + long ret = pnv_tce_xchg(tbl, index, hpa, direction); 1832 + 1833 + if (!ret && (tbl->it_type & 1834 + (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE))) 1835 + pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false); 1836 + 1837 + return ret; 1838 + } 1839 + #endif 1840 + 1844 1841 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, 1845 1842 long npages) 1846 1843 { ··· 1866 1835 1867 1836 static struct iommu_table_ops pnv_ioda2_iommu_ops = { 1868 1837 .set = pnv_ioda2_tce_build, 1838 + #ifdef CONFIG_IOMMU_API 1839 + .exchange = pnv_ioda2_tce_xchg, 1840 + #endif 1869 1841 .clear = pnv_ioda2_tce_free, 1870 1842 .get = pnv_tce_get, 1871 1843 };
+3
arch/powerpc/platforms/powernv/pci-p5ioc2.c
··· 85 85 86 86 static struct iommu_table_ops pnv_p5ioc2_iommu_ops = { 87 87 .set = pnv_tce_build, 88 + #ifdef CONFIG_IOMMU_API 89 + .exchange = pnv_tce_xchg, 90 + #endif 88 91 .clear = pnv_tce_free, 89 92 .get = pnv_tce_get, 90 93 };
+18
arch/powerpc/platforms/powernv/pci.c
··· 598 598 return 0; 599 599 } 600 600 601 + #ifdef CONFIG_IOMMU_API 602 + int pnv_tce_xchg(struct iommu_table *tbl, long index, 603 + unsigned long *hpa, enum dma_data_direction *direction) 604 + { 605 + u64 proto_tce = iommu_direction_to_tce_perm(*direction); 606 + unsigned long newtce = *hpa | proto_tce, oldtce; 607 + unsigned long idx = index - tbl->it_offset; 608 + 609 + BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl)); 610 + 611 + oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce)); 612 + *hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE); 613 + *direction = iommu_tce_direction(oldtce); 614 + 615 + return 0; 616 + } 617 + #endif 618 + 601 619 void pnv_tce_free(struct iommu_table *tbl, long index, long npages) 602 620 { 603 621 long i;
+2
arch/powerpc/platforms/powernv/pci.h
··· 207 207 unsigned long uaddr, enum dma_data_direction direction, 208 208 struct dma_attrs *attrs); 209 209 extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); 210 + extern int pnv_tce_xchg(struct iommu_table *tbl, long index, 211 + unsigned long *hpa, enum dma_data_direction *direction); 210 212 extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); 211 213 212 214 void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
+38 -25
drivers/vfio/vfio_iommu_spapr_tce.c
··· 236 236 } 237 237 238 238 static void tce_iommu_unuse_page(struct tce_container *container, 239 - unsigned long oldtce) 239 + unsigned long hpa) 240 240 { 241 241 struct page *page; 242 242 243 - if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE))) 244 - return; 245 - 246 - page = pfn_to_page(oldtce >> PAGE_SHIFT); 247 - 248 - if (oldtce & TCE_PCI_WRITE) 249 - SetPageDirty(page); 250 - 243 + page = pfn_to_page(hpa >> PAGE_SHIFT); 251 244 put_page(page); 252 245 } 253 246 ··· 248 255 struct iommu_table *tbl, 249 256 unsigned long entry, unsigned long pages) 250 257 { 251 - unsigned long oldtce; 258 + unsigned long oldhpa; 259 + long ret; 260 + enum dma_data_direction direction; 252 261 253 262 for ( ; pages; --pages, ++entry) { 254 - oldtce = iommu_clear_tce(tbl, entry); 255 - if (!oldtce) 263 + direction = DMA_NONE; 264 + oldhpa = 0; 265 + ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction); 266 + if (ret) 256 267 continue; 257 268 258 - tce_iommu_unuse_page(container, oldtce); 269 + if (direction == DMA_NONE) 270 + continue; 271 + 272 + tce_iommu_unuse_page(container, oldhpa); 259 273 } 260 274 261 275 return 0; ··· 284 284 285 285 static long tce_iommu_build(struct tce_container *container, 286 286 struct iommu_table *tbl, 287 - unsigned long entry, unsigned long tce, unsigned long pages) 287 + unsigned long entry, unsigned long tce, unsigned long pages, 288 + enum dma_data_direction direction) 288 289 { 289 290 long i, ret = 0; 290 291 struct page *page; 291 292 unsigned long hpa; 292 - enum dma_data_direction direction = iommu_tce_direction(tce); 293 + enum dma_data_direction dirtmp; 293 294 294 295 for (i = 0; i < pages; ++i) { 295 296 unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; ··· 306 305 } 307 306 308 307 hpa |= offset; 309 - ret = iommu_tce_build(tbl, entry + i, (unsigned long) __va(hpa), 310 - direction); 308 + dirtmp = direction; 309 + ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp); 311 310 if (ret) { 312 311 
tce_iommu_unuse_page(container, hpa); 313 312 pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", ··· 315 314 tce, ret); 316 315 break; 317 316 } 317 + 318 + if (dirtmp != DMA_NONE) 319 + tce_iommu_unuse_page(container, hpa); 320 + 318 321 tce += IOMMU_PAGE_SIZE(tbl); 319 322 } 320 323 ··· 383 378 case VFIO_IOMMU_MAP_DMA: { 384 379 struct vfio_iommu_type1_dma_map param; 385 380 struct iommu_table *tbl = NULL; 386 - unsigned long tce; 387 381 long num; 382 + enum dma_data_direction direction; 388 383 389 384 if (!container->enabled) 390 385 return -EPERM; ··· 410 405 return -EINVAL; 411 406 412 407 /* iova is checked by the IOMMU API */ 413 - tce = param.vaddr; 414 - if (param.flags & VFIO_DMA_MAP_FLAG_READ) 415 - tce |= TCE_PCI_READ; 416 - if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) 417 - tce |= TCE_PCI_WRITE; 408 + if (param.flags & VFIO_DMA_MAP_FLAG_READ) { 409 + if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) 410 + direction = DMA_BIDIRECTIONAL; 411 + else 412 + direction = DMA_TO_DEVICE; 413 + } else { 414 + if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) 415 + direction = DMA_FROM_DEVICE; 416 + else 417 + return -EINVAL; 418 + } 418 419 419 - ret = iommu_tce_put_param_check(tbl, param.iova, tce); 420 + ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr); 420 421 if (ret) 421 422 return ret; 422 423 423 424 ret = tce_iommu_build(container, tbl, 424 425 param.iova >> tbl->it_page_shift, 425 - tce, param.size >> tbl->it_page_shift); 426 + param.vaddr, 427 + param.size >> tbl->it_page_shift, 428 + direction); 426 429 427 430 iommu_flush_tce(tbl); 428 431