Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: defer flush of writable TLB entries

If a PTE is unmapped and it's dirty then it was writable recently. Due to
deferred TLB flushing, it's best to assume a writable TLB cache entry
exists. With that assumption, the TLB must be flushed before any IO can
start or the page is freed to avoid lost writes or data corruption. This
patch defers flushing of potentially writable TLBs as long as possible.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Mel Gorman and committed by Linus Torvalds.
Commit: d950c947 · parent: 72b252ae

+38 -8
+7
include/linux/sched.h
···
1354 1354
1355 1355      /* True if any bit in cpumask is set */
1356 1356      bool flush_required;
     1357 +
     1358 +    /*
     1359 +     * If true then the PTE was dirty when unmapped. The entry must be
     1360 +     * flushed before IO is initiated or a stale TLB entry potentially
     1361 +     * allows an update without redirtying the page.
     1362 +     */
     1363 +    bool writable;
1357 1364  };
1358 1365
1359 1366  struct task_struct {
+4
mm/internal.h
···
431 431
432 432  #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
433 433  void try_to_unmap_flush(void);
    434 + void try_to_unmap_flush_dirty(void);
434 435  #else
435 436  static inline void try_to_unmap_flush(void)
    437 + {
    438 + }
    439 + static inline void try_to_unmap_flush_dirty(void)
436 440  {
437 441  }
438 442
+21 -7
mm/rmap.c
···
626 626  }
627 627      cpumask_clear(&tlb_ubc->cpumask);
628 628      tlb_ubc->flush_required = false;
    629 +    tlb_ubc->writable = false;
629 630      put_cpu();
630 631  }
631 632
    633 + /* Flush iff there are potentially writable TLB entries that can race with IO */
    634 + void try_to_unmap_flush_dirty(void)
    635 + {
    636 +    struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
    637 +
    638 +    if (tlb_ubc->writable)
    639 +        try_to_unmap_flush();
    640 + }
    641 +
632 642  static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
633     -                struct page *page)
    643 +                struct page *page, bool writable)
634 644  {
635 645      struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
636 646
637 647      cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
638 648      tlb_ubc->flush_required = true;
    649 +
    650 +    /*
    651 +     * If the PTE was dirty then it's best to assume it's writable. The
    652 +     * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
    653 +     * before the page is queued for IO.
    654 +     */
    655 +    if (writable)
    656 +        tlb_ubc->writable = true;
639 657  }
640 658
641 659  /*
···
676 658  }
677 659  #else
678 660  static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
679     -                struct page *page)
    661 +                struct page *page, bool writable)
680 662  {
681 663  }
682 664
···
1333 1315      */
1334 1316      pteval = ptep_get_and_clear(mm, address, pte);
1335 1317
1336     -    /* Potentially writable TLBs must be flushed before IO */
1337     -    if (pte_dirty(pteval))
1338     -        flush_tlb_page(vma, address);
1339     -    else
1340     -        set_tlb_ubc_flush_pending(mm, page);
    1318 +    set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval));
1341 1319  } else {
1342 1320      pteval = ptep_clear_flush(vma, address, pte);
1343 1321  }
+6 -1
mm/vmscan.c
···
1098 1098      if (!sc->may_writepage)
1099 1099          goto keep_locked;
1100 1100
1101     -    /* Page is dirty, try to write it out here */
    1101 +    /*
    1102 +     * Page is dirty. Flush the TLB if a writable entry
    1103 +     * potentially exists to avoid CPU writes after IO
    1104 +     * starts and then write it out here.
    1105 +     */
    1106 +    try_to_unmap_flush_dirty();
1102 1107      switch (pageout(page, mapping, sc)) {
1103 1108      case PAGE_KEEP:
1104 1109          goto keep_locked;