mm: send one IPI per CPU to TLB flush all entries after unmapping pages

+1

arch/x86/Kconfig

··· 41 41 select ARCH_USE_CMPXCHG_LOCKREF if X86_64 42 42 select ARCH_USE_QUEUED_RWLOCKS 43 43 select ARCH_USE_QUEUED_SPINLOCKS 44 + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP 44 45 select ARCH_WANTS_DYNAMIC_TASK_STRUCT 45 46 select ARCH_WANT_FRAME_POINTERS 46 47 select ARCH_WANT_IPC_PARSE_VERSION if X86_32

+6

arch/x86/include/asm/tlbflush.h

··· 261 261 262 262 #endif /* SMP */ 263 263 264 + /* Not inlined due to inc_irq_stat not being defined yet */ 265 + #define flush_tlb_local() { \ 266 + inc_irq_stat(irq_tlb_count); \ 267 + local_flush_tlb(); \ 268 + } 269 + 264 270 #ifndef CONFIG_PARAVIRT 265 271 #define flush_tlb_others(mask, mm, start, end) \ 266 272 native_flush_tlb_others(mask, mm, start, end)

+3

include/linux/rmap.h

··· 89 89 TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ 90 90 TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ 91 91 TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ 92 + TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible 93 + * and caller guarantees they will 94 + * do a final flush if necessary */ 92 95 }; 93 96 94 97 #ifdef CONFIG_MMU

+16

include/linux/sched.h

··· 1344 1344 perf_nr_task_contexts, 1345 1345 }; 1346 1346 1347 + /* Track pages that require TLB flushes */ 1348 + struct tlbflush_unmap_batch { 1349 + /* 1350 + * Each bit set is a CPU that potentially has a TLB entry for one of 1351 + * the PFNs being flushed. See set_tlb_ubc_flush_pending(). 1352 + */ 1353 + struct cpumask cpumask; 1354 + 1355 + /* True if any bit in cpumask is set */ 1356 + bool flush_required; 1357 + }; 1358 + 1347 1359 struct task_struct { 1348 1360 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 1349 1361 void *stack; ··· 1711 1699 1712 1700 unsigned long numa_pages_migrated; 1713 1701 #endif /* CONFIG_NUMA_BALANCING */ 1702 + 1703 + #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 1704 + struct tlbflush_unmap_batch tlb_ubc; 1705 + #endif 1714 1706 1715 1707 struct rcu_head rcu; 1716 1708

+10

init/Kconfig

··· 883 883 bool 884 884 885 885 # 886 + # For architectures that prefer to flush all TLBs after a number of pages 887 + # are unmapped instead of sending one IPI per page to flush. The architecture 888 + # must provide guarantees on what happens if a clean TLB cache entry is 889 + # written after the unmap. Details are in mm/rmap.c near the check for 890 + # should_defer_flush. The architecture should also consider if the full flush 891 + # and the refill costs are offset by the savings of sending fewer IPIs. 892 + config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 893 + bool 894 + 895 + # 886 896 # For architectures that know their GCC __int128 support is sound 887 897 # 888 898 config ARCH_SUPPORTS_INT128

+11

mm/internal.h

··· 426 426 #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ 427 427 #define ALLOC_FAIR 0x100 /* fair zone allocation */ 428 428 429 + enum ttu_flags; 430 + struct tlbflush_unmap_batch; 431 + 432 + #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 433 + void try_to_unmap_flush(void); 434 + #else 435 + static inline void try_to_unmap_flush(void) 436 + { 437 + } 438 + 439 + #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ 429 440 #endif /* __MM_INTERNAL_H */

+103 -1

mm/rmap.c

··· 62 62 63 63 #include <asm/tlbflush.h> 64 64 65 + #include <trace/events/tlb.h> 66 + 65 67 #include "internal.h" 66 68 67 69 static struct kmem_cache *anon_vma_cachep; ··· 584 582 585 583 return address; 586 584 } 585 + 586 + #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 587 + static void percpu_flush_tlb_batch_pages(void *data) 588 + { 589 + /* 590 + * All TLB entries are flushed on the assumption that it is 591 + * cheaper to flush all TLBs and let them be refilled than 592 + * flushing individual PFNs. Note that we do not track mm's 593 + * to flush as that might simply be multiple full TLB flushes 594 + * for no gain. 595 + */ 596 + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 597 + flush_tlb_local(); 598 + } 599 + 600 + /* 601 + * Flush TLB entries for recently unmapped pages from remote CPUs. It is 602 + * important if a PTE was dirty when it was unmapped that it's flushed 603 + * before any IO is initiated on the page to prevent lost writes. Similarly, 604 + * it must be flushed before freeing to prevent data leakage. 
605 + */ 606 + void try_to_unmap_flush(void) 607 + { 608 + struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; 609 + int cpu; 610 + 611 + if (!tlb_ubc->flush_required) 612 + return; 613 + 614 + cpu = get_cpu(); 615 + 616 + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL); 617 + 618 + if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) 619 + percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask); 620 + 621 + if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) { 622 + smp_call_function_many(&tlb_ubc->cpumask, 623 + percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true); 624 + } 625 + cpumask_clear(&tlb_ubc->cpumask); 626 + tlb_ubc->flush_required = false; 627 + put_cpu(); 628 + } 629 + 630 + static void set_tlb_ubc_flush_pending(struct mm_struct *mm, 631 + struct page *page) 632 + { 633 + struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; 634 + 635 + cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); 636 + tlb_ubc->flush_required = true; 637 + } 638 + 639 + /* 640 + * Returns true if the TLB flush should be deferred to the end of a batch of 641 + * unmap operations to reduce IPIs. 642 + */ 643 + static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) 644 + { 645 + bool should_defer = false; 646 + 647 + if (!(flags & TTU_BATCH_FLUSH)) 648 + return false; 649 + 650 + /* If remote CPUs need to be flushed then defer and batch the flush */ 651 + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) 652 + should_defer = true; 653 + put_cpu(); 654 + 655 + return should_defer; 656 + } 657 + #else 658 + static void set_tlb_ubc_flush_pending(struct mm_struct *mm, 659 + struct page *page) 660 + { 661 + } 662 + 663 + static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) 664 + { 665 + return false; 666 + } 667 + #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ 587 668 588 669 /* 589 670 * At what user virtual address is page expected in vma? ··· 1305 1220 1306 1221 /* Nuke the page table entry. 
*/ 1307 1222 flush_cache_page(vma, address, page_to_pfn(page)); 1308 - pteval = ptep_clear_flush(vma, address, pte); 1223 + if (should_defer_flush(mm, flags)) { 1224 + /* 1225 + * We clear the PTE but do not flush so potentially a remote 1226 + * CPU could still be writing to the page. If the entry was 1227 + * previously clean then the architecture must guarantee that 1228 + * a clear->dirty transition on a cached TLB entry is written 1229 + * through and traps if the PTE is unmapped. 1230 + */ 1231 + pteval = ptep_get_and_clear(mm, address, pte); 1232 + 1233 + /* Potentially writable TLBs must be flushed before IO */ 1234 + if (pte_dirty(pteval)) 1235 + flush_tlb_page(vma, address); 1236 + else 1237 + set_tlb_ubc_flush_pending(mm, page); 1238 + } else { 1239 + pteval = ptep_clear_flush(vma, address, pte); 1240 + } 1309 1241 1310 1242 /* Move the dirty bit to the physical page now the pte is gone. */ 1311 1243 if (pte_dirty(pteval))

+22 -1

mm/vmscan.c

··· 1057 1057 * processes. Try to unmap it here. 1058 1058 */ 1059 1059 if (page_mapped(page) && mapping) { 1060 - switch (try_to_unmap(page, ttu_flags)) { 1060 + switch (try_to_unmap(page, 1061 + ttu_flags|TTU_BATCH_FLUSH)) { 1061 1062 case SWAP_FAIL: 1062 1063 goto activate_locked; 1063 1064 case SWAP_AGAIN: ··· 1209 1208 } 1210 1209 1211 1210 mem_cgroup_uncharge_list(&free_pages); 1211 + try_to_unmap_flush(); 1212 1212 free_hot_cold_page_list(&free_pages, true); 1213 1213 1214 1214 list_splice(&ret_pages, page_list); ··· 2153 2151 } 2154 2152 } 2155 2153 2154 + #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH 2155 + static void init_tlb_ubc(void) 2156 + { 2157 + /* 2158 + * This deliberately does not clear the cpumask as it's expensive 2159 + * and unnecessary. If there happens to be data in there then the 2160 + * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and 2161 + * then the cpumask will be cleared. 2162 + */ 2163 + current->tlb_ubc.flush_required = false; 2164 + } 2165 + #else 2166 + static inline void init_tlb_ubc(void) 2167 + { 2168 + } 2169 + #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ 2170 + 2156 2171 /* 2157 2172 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 2158 2173 */ ··· 2203 2184 */ 2204 2185 scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && 2205 2186 sc->priority == DEF_PRIORITY); 2187 + 2188 + init_tlb_ubc(); 2206 2189 2207 2190 blk_start_plug(&plug); 2208 2191 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||