Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: PPC: Optimize clearing TCEs for sparse tables

The powernv platform maintains 2 TCE tables for VFIO - a hardware TCE
table and a table with userspace addresses. These tables are radix trees,
we allocate indirect levels when they are written to. Since
the memory allocation is problematic in real mode, we have 2 accessors
to the entries:
- for virtual mode: it allocates the memory and it is always expected
to return non-NULL;
- for real mode: it does not allocate and can return NULL.

Also, DMA windows can span up to 55 bits of the address space and since
we never have this much RAM, such windows are sparse. However currently
the SPAPR TCE IOMMU driver walks through all TCEs to unpin DMA memory.

Since we maintain a userspace addresses table for VFIO which is a mirror
of the hardware table, we can use it to know which parts of the DMA
window have not been mapped and skip these; that is what this patch does.

The bare metal systems do not have this problem as they use a bypass mode
of a PHB which maps RAM directly.

This helps a lot with sparse DMA windows, reducing the shutdown time from
about 3 minutes per 1 billion TCEs to a few seconds for a 32GB sparse guest.
Just skipping the last level seems to be good enough.

As the non-allocating accessor is now used in virtual mode as well, rename it
from IOMMU_TABLE_USERSPACE_ENTRY_RM (real mode) to _RO (read only).

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>

authored by

Alexey Kardashevskiy and committed by
Paul Mackerras
6e301a8e 8d9fcacf

+27 -9
+1 -1
arch/powerpc/include/asm/iommu.h
··· 126 126 int it_nid; 127 127 }; 128 128 129 - #define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \ 129 + #define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \ 130 130 ((tbl)->it_ops->useraddrptr((tbl), (entry), false)) 131 131 #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ 132 132 ((tbl)->it_ops->useraddrptr((tbl), (entry), true))
+2 -3
arch/powerpc/kvm/book3s_64_vio.c
··· 410 410 { 411 411 struct mm_iommu_table_group_mem_t *mem = NULL; 412 412 const unsigned long pgsize = 1ULL << tbl->it_page_shift; 413 - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); 413 + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); 414 414 415 415 if (!pua) 416 - /* it_userspace allocation might be delayed */ 417 - return H_TOO_HARD; 416 + return H_SUCCESS; 418 417 419 418 mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize); 420 419 if (!mem)
+3 -3
arch/powerpc/kvm/book3s_64_vio_hv.c
··· 214 214 215 215 if (!ret && ((*direction == DMA_FROM_DEVICE) || 216 216 (*direction == DMA_BIDIRECTIONAL))) { 217 - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); 217 + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); 218 218 /* 219 219 * kvmppc_rm_tce_iommu_do_map() updates the UA cache after 220 220 * calling this so we still get here a valid UA. ··· 240 240 { 241 241 struct mm_iommu_table_group_mem_t *mem = NULL; 242 242 const unsigned long pgsize = 1ULL << tbl->it_page_shift; 243 - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); 243 + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); 244 244 245 245 if (!pua) 246 246 /* it_userspace allocation might be delayed */ ··· 304 304 { 305 305 long ret; 306 306 unsigned long hpa = 0; 307 - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); 307 + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); 308 308 struct mm_iommu_table_group_mem_t *mem; 309 309 310 310 if (!pua)
+21 -2
drivers/vfio/vfio_iommu_spapr_tce.c
··· 444 444 struct mm_iommu_table_group_mem_t *mem = NULL; 445 445 int ret; 446 446 unsigned long hpa = 0; 447 - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); 447 + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); 448 448 449 449 if (!pua) 450 450 return; ··· 467 467 unsigned long oldhpa; 468 468 long ret; 469 469 enum dma_data_direction direction; 470 + unsigned long lastentry = entry + pages; 470 471 471 - for ( ; pages; --pages, ++entry) { 472 + for ( ; entry < lastentry; ++entry) { 473 + if (tbl->it_indirect_levels && tbl->it_userspace) { 474 + /* 475 + * For multilevel tables, we can take a shortcut here 476 + * and skip some TCEs as we know that the userspace 477 + * addresses cache is a mirror of the real TCE table 478 + * and if it is missing some indirect levels, then 479 + * the hardware table does not have them allocated 480 + * either and therefore does not require updating. 481 + */ 482 + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, 483 + entry); 484 + if (!pua) { 485 + /* align to level_size which is power of two */ 486 + entry |= tbl->it_level_size - 1; 487 + continue; 488 + } 489 + } 490 + 472 491 cond_resched(); 473 492 474 493 direction = DMA_NONE;