Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Fix TLB gather virtual address range invalidation corner cases

Ben Tebulin reported:

"Since v3.7.2 on two independent machines a very specific Git
repository fails in 9/10 cases on git-fsck due to an SHA1/memory
failures. This only occurs on a very specific repository and can be
reproduced stably on two independent laptops. Git mailing list ran
out of ideas and for me this looks like some very exotic kernel issue"

and bisected the failure to the backport of commit 53a59fc67f97 ("mm:
limit mmu_gather batching to fix soft lockups on !CONFIG_PREEMPT").

That commit itself is not actually buggy, but what it does is to make it
much more likely to hit the partial TLB invalidation case, since it
introduces a new case in tlb_next_batch() that previously only ever
happened when running out of memory.

The real bug is that the TLB gather virtual memory range setup is subtly
buggered. It was introduced in commit 597e1c3580b7 ("mm/mmu_gather:
enable tlb flush range in generic mmu_gather"), and the range handling
was already fixed at least once in commit e6c495a96ce0 ("mm: fix the TLB
range flushed when __tlb_remove_page() runs out of slots"), but that fix
was not complete.

The problem with the TLB gather virtual address range is that it isn't
set up by the initial tlb_gather_mmu() initialization (which didn't get
the TLB range information), but it is set up ad-hoc later by the
functions that actually flush the TLB. And so any such case that forgot
to update the TLB range entries would potentially miss TLB invalidates.

Rather than try to figure out exactly which particular ad-hoc range
setup was missing (I personally suspect it's the hugetlb case in
zap_huge_pmd(), which didn't have the same logic as zap_pte_range()
did), this patch just gets rid of the problem at the source: make the
TLB range information available to tlb_gather_mmu(), and initialize it
when initializing all the other tlb gather fields.

This makes the patch larger, but conceptually much simpler. And the end
result is much more understandable; even if you want to play games with
partial ranges when invalidating the TLB contents in chunks, now the
range information is always there, and anybody who doesn't want to
bother with it won't introduce subtle bugs.

Ben verified that this fixes his problem.

Reported-bisected-and-tested-by: Ben Tebulin <tebulin@googlemail.com>
Build-testing-by: Stephen Rothwell <sfr@canb.auug.org.au>
Build-testing-by: Richard Weinberger <richard.weinberger@gmail.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

+57 -34
+5 -2
arch/arm/include/asm/tlb.h
··· 43 43 struct mm_struct *mm; 44 44 unsigned int fullmm; 45 45 struct vm_area_struct *vma; 46 + unsigned long start, end; 46 47 unsigned long range_start; 47 48 unsigned long range_end; 48 49 unsigned int nr; ··· 108 107 } 109 108 110 109 static inline void 111 - tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int fullmm) 110 + tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) 112 111 { 113 112 tlb->mm = mm; 114 - tlb->fullmm = fullmm; 113 + tlb->fullmm = !(start | (end+1)); 114 + tlb->start = start; 115 + tlb->end = end; 115 116 tlb->vma = NULL; 116 117 tlb->max = ARRAY_SIZE(tlb->local); 117 118 tlb->pages = tlb->local;
+5 -2
arch/arm64/include/asm/tlb.h
··· 35 35 struct mm_struct *mm; 36 36 unsigned int fullmm; 37 37 struct vm_area_struct *vma; 38 + unsigned long start, end; 38 39 unsigned long range_start; 39 40 unsigned long range_end; 40 41 unsigned int nr; ··· 98 97 } 99 98 100 99 static inline void 101 - tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int fullmm) 100 + tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) 102 101 { 103 102 tlb->mm = mm; 104 - tlb->fullmm = fullmm; 103 + tlb->fullmm = !(start | (end+1)); 104 + tlb->start = start; 105 + tlb->end = end; 105 106 tlb->vma = NULL; 106 107 tlb->max = ARRAY_SIZE(tlb->local); 107 108 tlb->pages = tlb->local;
+6 -3
arch/ia64/include/asm/tlb.h
··· 22 22 * unmapping a portion of the virtual address space, these hooks are called according to 23 23 * the following template: 24 24 * 25 - * tlb <- tlb_gather_mmu(mm, full_mm_flush); // start unmap for address space MM 25 + * tlb <- tlb_gather_mmu(mm, start, end); // start unmap for address space MM 26 26 * { 27 27 * for each vma that needs a shootdown do { 28 28 * tlb_start_vma(tlb, vma); ··· 58 58 unsigned int max; 59 59 unsigned char fullmm; /* non-zero means full mm flush */ 60 60 unsigned char need_flush; /* really unmapped some PTEs? */ 61 + unsigned long start, end; 61 62 unsigned long start_addr; 62 63 unsigned long end_addr; 63 64 struct page **pages; ··· 156 155 157 156 158 157 static inline void 159 - tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush) 158 + tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) 160 159 { 161 160 tlb->mm = mm; 162 161 tlb->max = ARRAY_SIZE(tlb->local); 163 162 tlb->pages = tlb->local; 164 163 tlb->nr = 0; 165 - tlb->fullmm = full_mm_flush; 164 + tlb->fullmm = !(start | (end+1)); 165 + tlb->start = start; 166 + tlb->end = end; 166 167 tlb->start_addr = ~0UL; 167 168 } 168 169
+6 -2
arch/s390/include/asm/tlb.h
··· 32 32 struct mm_struct *mm; 33 33 struct mmu_table_batch *batch; 34 34 unsigned int fullmm; 35 + unsigned long start, end; 35 36 }; 36 37 37 38 struct mmu_table_batch { ··· 49 48 50 49 static inline void tlb_gather_mmu(struct mmu_gather *tlb, 51 50 struct mm_struct *mm, 52 - unsigned int full_mm_flush) 51 + unsigned long start, 52 + unsigned long end) 53 53 { 54 54 tlb->mm = mm; 55 - tlb->fullmm = full_mm_flush; 55 + tlb->start = start; 56 + tlb->end = end; 57 + tlb->fullmm = !(start | (end+1)); 56 58 tlb->batch = NULL; 57 59 if (tlb->fullmm) 58 60 __tlb_flush_mm(mm);
+4 -2
arch/sh/include/asm/tlb.h
··· 36 36 } 37 37 38 38 static inline void 39 - tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush) 39 + tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) 40 40 { 41 41 tlb->mm = mm; 42 - tlb->fullmm = full_mm_flush; 42 + tlb->start = start; 43 + tlb->end = end; 44 + tlb->fullmm = !(start | (end+1)); 43 45 44 46 init_tlb_gather(tlb); 45 47 }
+4 -2
arch/um/include/asm/tlb.h
··· 45 45 } 46 46 47 47 static inline void 48 - tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned int full_mm_flush) 48 + tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) 49 49 { 50 50 tlb->mm = mm; 51 - tlb->fullmm = full_mm_flush; 51 + tlb->start = start; 52 + tlb->end = end; 53 + tlb->fullmm = !(start | (end+1)); 52 54 53 55 init_tlb_gather(tlb); 54 56 }
+2 -2
fs/exec.c
··· 608 608 return -ENOMEM; 609 609 610 610 lru_add_drain(); 611 - tlb_gather_mmu(&tlb, mm, 0); 611 + tlb_gather_mmu(&tlb, mm, old_start, old_end); 612 612 if (new_end > old_start) { 613 613 /* 614 614 * when the old and new regions overlap clear from new_end. ··· 625 625 free_pgd_range(&tlb, old_start, old_end, new_end, 626 626 vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); 627 627 } 628 - tlb_finish_mmu(&tlb, new_end, old_end); 628 + tlb_finish_mmu(&tlb, old_start, old_end); 629 629 630 630 /* 631 631 * Shrink the vma to just the new range. Always succeeds.
+1 -1
include/asm-generic/tlb.h
··· 112 112 113 113 #define HAVE_GENERIC_MMU_GATHER 114 114 115 - void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm); 115 + void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end); 116 116 void tlb_flush_mmu(struct mmu_gather *tlb); 117 117 void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, 118 118 unsigned long end);
+1 -1
mm/hugetlb.c
··· 2490 2490 2491 2491 mm = vma->vm_mm; 2492 2492 2493 - tlb_gather_mmu(&tlb, mm, 0); 2493 + tlb_gather_mmu(&tlb, mm, start, end); 2494 2494 __unmap_hugepage_range(&tlb, vma, start, end, ref_page); 2495 2495 tlb_finish_mmu(&tlb, start, end); 2496 2496 }
+21 -15
mm/memory.c
··· 209 209 * tear-down from @mm. The @fullmm argument is used when @mm is without 210 210 * users and we're going to destroy the full address space (exit/execve). 211 211 */ 212 - void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) 212 + void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) 213 213 { 214 214 tlb->mm = mm; 215 215 216 - tlb->fullmm = fullmm; 216 + /* Is it from 0 to ~0? */ 217 + tlb->fullmm = !(start | (end+1)); 217 218 tlb->need_flush_all = 0; 218 - tlb->start = -1UL; 219 - tlb->end = 0; 219 + tlb->start = start; 220 + tlb->end = end; 220 221 tlb->need_flush = 0; 221 222 tlb->local.next = NULL; 222 223 tlb->local.nr = 0; ··· 257 256 { 258 257 struct mmu_gather_batch *batch, *next; 259 258 260 - tlb->start = start; 261 - tlb->end = end; 262 259 tlb_flush_mmu(tlb); 263 260 264 261 /* keep the page table cache within bounds */ ··· 1098 1099 spinlock_t *ptl; 1099 1100 pte_t *start_pte; 1100 1101 pte_t *pte; 1101 - unsigned long range_start = addr; 1102 1102 1103 1103 again: 1104 1104 init_rss_vec(rss); ··· 1203 1205 * and page-free while holding it. 1204 1206 */ 1205 1207 if (force_flush) { 1208 + unsigned long old_end; 1209 + 1206 1210 force_flush = 0; 1207 1211 1208 - #ifdef HAVE_GENERIC_MMU_GATHER 1209 - tlb->start = range_start; 1212 + /* 1213 + * Flush the TLB just for the previous segment, 1214 + * then update the range to be the remaining 1215 + * TLB range. 
1216 + */ 1217 + old_end = tlb->end; 1210 1218 tlb->end = addr; 1211 - #endif 1219 + 1212 1220 tlb_flush_mmu(tlb); 1213 - if (addr != end) { 1214 - range_start = addr; 1221 + 1222 + tlb->start = addr; 1223 + tlb->end = old_end; 1224 + 1225 + if (addr != end) 1215 1226 goto again; 1216 - } 1217 1227 } 1218 1228 1219 1229 return addr; ··· 1406 1400 unsigned long end = start + size; 1407 1401 1408 1402 lru_add_drain(); 1409 - tlb_gather_mmu(&tlb, mm, 0); 1403 + tlb_gather_mmu(&tlb, mm, start, end); 1410 1404 update_hiwater_rss(mm); 1411 1405 mmu_notifier_invalidate_range_start(mm, start, end); 1412 1406 for ( ; vma && vma->vm_start < end; vma = vma->vm_next) ··· 1432 1426 unsigned long end = address + size; 1433 1427 1434 1428 lru_add_drain(); 1435 - tlb_gather_mmu(&tlb, mm, 0); 1429 + tlb_gather_mmu(&tlb, mm, address, end); 1436 1430 update_hiwater_rss(mm); 1437 1431 mmu_notifier_invalidate_range_start(mm, address, end); 1438 1432 unmap_single_vma(&tlb, vma, address, end, details);
+2 -2
mm/mmap.c
··· 2336 2336 struct mmu_gather tlb; 2337 2337 2338 2338 lru_add_drain(); 2339 - tlb_gather_mmu(&tlb, mm, 0); 2339 + tlb_gather_mmu(&tlb, mm, start, end); 2340 2340 update_hiwater_rss(mm); 2341 2341 unmap_vmas(&tlb, vma, start, end); 2342 2342 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, ··· 2709 2709 2710 2710 lru_add_drain(); 2711 2711 flush_cache_mm(mm); 2712 - tlb_gather_mmu(&tlb, mm, 1); 2712 + tlb_gather_mmu(&tlb, mm, 0, -1); 2713 2713 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2714 2714 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2715 2715 unmap_vmas(&tlb, vma, 0, -1);