Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

iommu/sva: invalidate stale IOTLB entries for kernel address space

Introduce a new IOMMU interface to flush IOTLB paging cache entries for
the CPU kernel address space. This interface is invoked from the x86
architecture code that manages combined user and kernel page tables,
specifically before any kernel page table page is freed and reused.

This addresses the main issue with vfree(), which is a common occurrence
and can be triggered by unprivileged users. While this resolves the
primary problem, it does not address an extremely rare case related to
unplugging memory that was present as reserved memory at boot, which
cannot be triggered by unprivileged users. The discussion can be found at
the link below.

Enable SVA on x86 architecture since the IOMMU can now receive
notification to flush the paging cache before freeing the CPU kernel page
table pages.

Link: https://lkml.kernel.org/r/20251022082635.2462433-9-baolu.lu@linux.intel.com
Link: https://lore.kernel.org/linux-iommu/04983c62-3b1d-40d4-93ae-34ca04b827e5@intel.com/
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Suggested-by: Jann Horn <jannh@google.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport (Microsoft) <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vinicius Costa Gomes <vinicius.gomes@intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yi Lai <yi1.lai@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Lu Baolu and committed by
Andrew Morton
e37d5a2d 5ba2f0a1

+35 -4
+1
arch/x86/Kconfig
··· 279 279 select HAVE_PCI 280 280 select HAVE_PERF_REGS 281 281 select HAVE_PERF_USER_STACK_DUMP 282 + select ASYNC_KERNEL_PGTABLE_FREE if IOMMU_SVA 282 283 select MMU_GATHER_RCU_TABLE_FREE 283 284 select MMU_GATHER_MERGE_VMAS 284 285 select HAVE_POSIX_CPU_TIMERS_TASK_WORK
+28 -4
drivers/iommu/iommu-sva.c
··· 10 10 #include "iommu-priv.h" 11 11 12 12 static DEFINE_MUTEX(iommu_sva_lock); 13 + static bool iommu_sva_present; 14 + static LIST_HEAD(iommu_sva_mms); 13 15 static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, 14 16 struct mm_struct *mm); 15 17 ··· 44 42 return ERR_PTR(-ENOSPC); 45 43 } 46 44 iommu_mm->pasid = pasid; 45 + iommu_mm->mm = mm; 47 46 INIT_LIST_HEAD(&iommu_mm->sva_domains); 48 47 /* 49 48 * Make sure the write to mm->iommu_mm is not reordered in front of ··· 79 76 80 77 if (!group) 81 78 return ERR_PTR(-ENODEV); 82 - 83 - if (IS_ENABLED(CONFIG_X86)) 84 - return ERR_PTR(-EOPNOTSUPP); 85 79 86 80 mutex_lock(&iommu_sva_lock); 87 81 ··· 135 135 if (ret) 136 136 goto out_free_domain; 137 137 domain->users = 1; 138 - list_add(&domain->next, &mm->iommu_mm->sva_domains); 139 138 139 + if (list_empty(&iommu_mm->sva_domains)) { 140 + if (list_empty(&iommu_sva_mms)) 141 + iommu_sva_present = true; 142 + list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms); 143 + } 144 + list_add(&domain->next, &iommu_mm->sva_domains); 140 145 out: 141 146 refcount_set(&handle->users, 1); 142 147 mutex_unlock(&iommu_sva_lock); ··· 183 178 list_del(&domain->next); 184 179 iommu_domain_free(domain); 185 180 } 181 + 182 + if (list_empty(&iommu_mm->sva_domains)) { 183 + list_del(&iommu_mm->mm_list_elm); 184 + if (list_empty(&iommu_sva_mms)) 185 + iommu_sva_present = false; 186 + } 187 + 186 188 mutex_unlock(&iommu_sva_lock); 187 189 kfree(handle); 188 190 } ··· 326 314 domain->iopf_handler = iommu_sva_iopf_handler; 327 315 328 316 return domain; 317 + } 318 + 319 + void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) 320 + { 321 + struct iommu_mm_data *iommu_mm; 322 + 323 + guard(mutex)(&iommu_sva_lock); 324 + if (!iommu_sva_present) 325 + return; 326 + 327 + list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm) 328 + mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end); 329 329 }
+4
include/linux/iommu.h
··· 1134 1134 1135 1135 struct iommu_mm_data { 1136 1136 u32 pasid; 1137 + struct mm_struct *mm; 1137 1138 struct list_head sva_domains; 1139 + struct list_head mm_list_elm; 1138 1140 }; 1139 1141 1140 1142 int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); ··· 1617 1615 struct mm_struct *mm); 1618 1616 void iommu_sva_unbind_device(struct iommu_sva *handle); 1619 1617 u32 iommu_sva_get_pasid(struct iommu_sva *handle); 1618 + void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end); 1620 1619 #else 1621 1620 static inline struct iommu_sva * 1622 1621 iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) ··· 1642 1639 } 1643 1640 1644 1641 static inline void mm_pasid_drop(struct mm_struct *mm) {} 1642 + static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {} 1645 1643 #endif /* CONFIG_IOMMU_SVA */ 1646 1644 1647 1645 #ifdef CONFIG_IOMMU_IOPF
+2
mm/pgtable-generic.c
··· 13 13 #include <linux/swap.h> 14 14 #include <linux/swapops.h> 15 15 #include <linux/mm_inline.h> 16 + #include <linux/iommu.h> 16 17 #include <asm/pgalloc.h> 17 18 #include <asm/tlb.h> 18 19 ··· 431 430 list_splice_tail_init(&kernel_pgtable_work.list, &page_list); 432 431 spin_unlock(&kernel_pgtable_work.lock); 433 432 433 + iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL); 434 434 list_for_each_entry_safe(pt, next, &page_list, pt_list) 435 435 __pagetable_free(pt); 436 436 }