Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mmu_gather: Remove per arch tlb_{start,end}_vma()

Scattered across the archs are 3 basic forms of tlb_{start,end}_vma().
Provide two new MMU_GATHER_* knobs to enumerate them and remove the per
arch tlb_{start,end}_vma() implementations.

- MMU_GATHER_NO_FLUSH_CACHE indicates the arch has flush_cache_range()
but does *NOT* want to call it for each VMA.

- MMU_GATHER_MERGE_VMAS indicates the arch wants to merge the
invalidate across multiple VMAs if possible.

With these it is possible to capture the three forms:

1) empty stubs;
select MMU_GATHER_NO_FLUSH_CACHE and MMU_GATHER_MERGE_VMAS

2) start: flush_cache_range(), end: empty;
select MMU_GATHER_MERGE_VMAS

3) start: flush_cache_range(), end: flush_tlb_range();
default

Obviously, if the architecture does not have flush_cache_range() then
it also doesn't need to select MMU_GATHER_NO_FLUSH_CACHE.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Will Deacon <will@kernel.org>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Peter Zijlstra and committed by
Linus Torvalds
1e9fdf21 23a67619

+32 -35
+7
arch/Kconfig
··· 438 438 439 439 config MMU_GATHER_NO_RANGE 440 440 bool 441 + select MMU_GATHER_MERGE_VMAS 442 + 443 + config MMU_GATHER_NO_FLUSH_CACHE 444 + bool 445 + 446 + config MMU_GATHER_MERGE_VMAS 447 + bool 441 448 442 449 config MMU_GATHER_NO_GATHER 443 450 bool
-13
arch/csky/include/asm/tlb.h
··· 4 4 #define __ASM_CSKY_TLB_H 5 5 6 6 #include <asm/cacheflush.h> 7 - 8 - #define tlb_start_vma(tlb, vma) \ 9 - do { \ 10 - if (!(tlb)->fullmm) \ 11 - flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \ 12 - } while (0) 13 - 14 - #define tlb_end_vma(tlb, vma) \ 15 - do { \ 16 - if (!(tlb)->fullmm) \ 17 - flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \ 18 - } while (0) 19 - 20 7 #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) 21 8 22 9 #include <asm-generic/tlb.h>
+1
arch/loongarch/Kconfig
··· 108 108 select TRACE_IRQFLAGS_SUPPORT 109 109 select USE_PERCPU_NUMA_NODE_ID 110 110 select ZONE_DMA32 111 + select MMU_GATHER_MERGE_VMAS if MMU 111 112 112 113 config 32BIT 113 114 bool
-10
arch/loongarch/include/asm/tlb.h
··· 137 137 ); 138 138 } 139 139 140 - /* 141 - * LoongArch doesn't need any special per-pte or per-vma handling, except 142 - * we need to flush cache for area to be unmapped. 143 - */ 144 - #define tlb_start_vma(tlb, vma) \ 145 - do { \ 146 - if (!(tlb)->fullmm) \ 147 - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ 148 - } while (0) 149 - #define tlb_end_vma(tlb, vma) do { } while (0) 150 140 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) 151 141 152 142 static void tlb_flush(struct mmu_gather *tlb);
+1
arch/powerpc/Kconfig
··· 256 256 select IRQ_FORCED_THREADING 257 257 select MMU_GATHER_PAGE_SIZE 258 258 select MMU_GATHER_RCU_TABLE_FREE 259 + select MMU_GATHER_MERGE_VMAS 259 260 select MODULES_USE_ELF_RELA 260 261 select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE 261 262 select NEED_PER_CPU_EMBED_FIRST_CHUNK if PPC64
-2
arch/powerpc/include/asm/tlb.h
··· 19 19 20 20 #include <linux/pagemap.h> 21 21 22 - #define tlb_start_vma(tlb, vma) do { } while (0) 23 - #define tlb_end_vma(tlb, vma) do { } while (0) 24 22 #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry 25 23 26 24 #define tlb_flush tlb_flush
+1
arch/s390/Kconfig
··· 204 204 select IOMMU_SUPPORT if PCI 205 205 select MMU_GATHER_NO_GATHER 206 206 select MMU_GATHER_RCU_TABLE_FREE 207 + select MMU_GATHER_MERGE_VMAS 207 208 select MODULES_USE_ELF_RELA 208 209 select NEED_DMA_MAP_STATE if PCI 209 210 select NEED_SG_DMA_LENGTH if PCI
-3
arch/s390/include/asm/tlb.h
··· 27 27 static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, 28 28 struct page *page, int page_size); 29 29 30 - #define tlb_start_vma(tlb, vma) do { } while (0) 31 - #define tlb_end_vma(tlb, vma) do { } while (0) 32 - 33 30 #define tlb_flush tlb_flush 34 31 #define pte_free_tlb pte_free_tlb 35 32 #define pmd_free_tlb pmd_free_tlb
+2
arch/sparc/Kconfig
··· 67 67 select HAVE_KRETPROBES 68 68 select HAVE_KPROBES 69 69 select MMU_GATHER_RCU_TABLE_FREE if SMP 70 + select MMU_GATHER_MERGE_VMAS 71 + select MMU_GATHER_NO_FLUSH_CACHE 70 72 select HAVE_ARCH_TRANSPARENT_HUGEPAGE 71 73 select HAVE_DYNAMIC_FTRACE 72 74 select HAVE_FTRACE_MCOUNT_RECORD
-2
arch/sparc/include/asm/tlb_64.h
··· 22 22 void __flush_tlb_pending(unsigned long, unsigned long, unsigned long *); 23 23 void flush_tlb_pending(void); 24 24 25 - #define tlb_start_vma(tlb, vma) do { } while (0) 26 - #define tlb_end_vma(tlb, vma) do { } while (0) 27 25 #define tlb_flush(tlb) flush_tlb_pending() 28 26 29 27 /*
+1
arch/x86/Kconfig
··· 245 245 select HAVE_PERF_REGS 246 246 select HAVE_PERF_USER_STACK_DUMP 247 247 select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT 248 + select MMU_GATHER_MERGE_VMAS 248 249 select HAVE_POSIX_CPU_TIMERS_TASK_WORK 249 250 select HAVE_REGS_AND_STACK_ACCESS_API 250 251 select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC || STACK_VALIDATION
-3
arch/x86/include/asm/tlb.h
··· 2 2 #ifndef _ASM_X86_TLB_H 3 3 #define _ASM_X86_TLB_H 4 4 5 - #define tlb_start_vma(tlb, vma) do { } while (0) 6 - #define tlb_end_vma(tlb, vma) do { } while (0) 7 - 8 5 #define tlb_flush tlb_flush 9 6 static inline void tlb_flush(struct mmu_gather *tlb); 10 7
+19 -2
include/asm-generic/tlb.h
··· 158 158 * Useful if your architecture doesn't use IPIs for remote TLB invalidates 159 159 * and therefore doesn't naturally serialize with software page-table walkers. 160 160 * 161 + * MMU_GATHER_NO_FLUSH_CACHE 162 + * 163 + * Indicates the architecture has flush_cache_range() but it needs *NOT* be called 164 + * before unmapping a VMA. 165 + * 166 + * NOTE: strictly speaking we shouldn't have this knob and instead rely on 167 + * flush_cache_range() being a NOP, except Sparc64 seems to be 168 + * different here. 169 + * 170 + * MMU_GATHER_MERGE_VMAS 171 + * 172 + * Indicates the architecture wants to merge ranges over VMAs; typical when 173 + * multiple range invalidates are more expensive than a full invalidate. 174 + * 161 175 * MMU_GATHER_NO_RANGE 162 176 * 163 - * Use this if your architecture lacks an efficient flush_tlb_range(). 177 + * Use this if your architecture lacks an efficient flush_tlb_range(). This 178 + * option implies MMU_GATHER_MERGE_VMAS above. 164 179 * 165 180 * MMU_GATHER_NO_GATHER 166 181 * ··· 508 493 return; 509 494 510 495 tlb_update_vma_flags(tlb, vma); 496 + #ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE 511 497 flush_cache_range(vma, vma->vm_start, vma->vm_end); 498 + #endif 512 499 } 513 500 #endif 514 501 515 502 #ifndef tlb_end_vma 516 503 static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) 517 504 { 518 - if (tlb->fullmm) 505 + if (tlb->fullmm || IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS)) 519 506 return; 520 507 521 508 /*