Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: save soft-dirty bits on file pages

Andy reported that if a file page gets reclaimed we lose the soft-dirty bit
if it was there, so save the _PAGE_BIT_SOFT_DIRTY bit when the page address gets
encoded into the pte entry. Thus when a #pf happens on such a non-present pte
we can restore it back.

Reported-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Cyrill Gorcunov and committed by
Linus Torvalds
41bb3476 179ef71c

+108 -11
+47 -1
arch/x86/include/asm/pgtable-2level.h
··· 55 55 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) 56 56 #endif 57 57 58 + #ifdef CONFIG_MEM_SOFT_DIRTY 59 + 60 + /* 61 + * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and 62 + * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset 63 + * into this range. 64 + */ 65 + #define PTE_FILE_MAX_BITS 28 66 + #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) 67 + #define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) 68 + #define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) 69 + #define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1) 70 + #define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) 71 + #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) 72 + #define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) 73 + 74 + #define pte_to_pgoff(pte) \ 75 + ((((pte).pte_low >> (PTE_FILE_SHIFT1)) \ 76 + & ((1U << PTE_FILE_BITS1) - 1))) \ 77 + + ((((pte).pte_low >> (PTE_FILE_SHIFT2)) \ 78 + & ((1U << PTE_FILE_BITS2) - 1)) \ 79 + << (PTE_FILE_BITS1)) \ 80 + + ((((pte).pte_low >> (PTE_FILE_SHIFT3)) \ 81 + & ((1U << PTE_FILE_BITS3) - 1)) \ 82 + << (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ 83 + + ((((pte).pte_low >> (PTE_FILE_SHIFT4))) \ 84 + << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)) 85 + 86 + #define pgoff_to_pte(off) \ 87 + ((pte_t) { .pte_low = \ 88 + ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1) \ 89 + + ((((off) >> PTE_FILE_BITS1) \ 90 + & ((1U << PTE_FILE_BITS2) - 1)) \ 91 + << PTE_FILE_SHIFT2) \ 92 + + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ 93 + & ((1U << PTE_FILE_BITS3) - 1)) \ 94 + << PTE_FILE_SHIFT3) \ 95 + + ((((off) >> \ 96 + (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))) \ 97 + << PTE_FILE_SHIFT4) \ 98 + + _PAGE_FILE }) 99 + 100 + #else /* CONFIG_MEM_SOFT_DIRTY */ 101 + 58 102 /* 59 103 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, 60 - * split up the 29 bits of offset into this range: 104 + * split up the 29 bits of offset into this range. 
61 105 */ 62 106 #define PTE_FILE_MAX_BITS 29 63 107 #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) ··· 131 87 + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2)) \ 132 88 << PTE_FILE_SHIFT3) \ 133 89 + _PAGE_FILE }) 90 + 91 + #endif /* CONFIG_MEM_SOFT_DIRTY */ 134 92 135 93 /* Encode and de-code a swap entry */ 136 94 #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
+3
arch/x86/include/asm/pgtable-3level.h
··· 179 179 /* 180 180 * Bits 0, 6 and 7 are taken in the low part of the pte, 181 181 * put the 32 bits of offset into the high part. 182 + * 183 + * For soft-dirty tracking 11 bit is taken from 184 + * the low part of pte as well. 182 185 */ 183 186 #define pte_to_pgoff(pte) ((pte).pte_high) 184 187 #define pgoff_to_pte(off) \
+15
arch/x86/include/asm/pgtable.h
··· 329 329 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); 330 330 } 331 331 332 + static inline pte_t pte_file_clear_soft_dirty(pte_t pte) 333 + { 334 + return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); 335 + } 336 + 337 + static inline pte_t pte_file_mksoft_dirty(pte_t pte) 338 + { 339 + return pte_set_flags(pte, _PAGE_SOFT_DIRTY); 340 + } 341 + 342 + static inline int pte_file_soft_dirty(pte_t pte) 343 + { 344 + return pte_flags(pte) & _PAGE_SOFT_DIRTY; 345 + } 346 + 332 347 /* 333 348 * Mask out unsupported bits in a present pgprot. Non-present pgprots 334 349 * can use those bits for other purposes, so leave them be.
+3 -1
arch/x86/include/asm/pgtable_types.h
··· 61 61 * they do not conflict with each other. 62 62 */ 63 63 64 + #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN 65 + 64 66 #ifdef CONFIG_MEM_SOFT_DIRTY 65 - #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) 67 + #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) 66 68 #else 67 69 #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) 68 70 #endif
+2
fs/proc/task_mmu.c
··· 736 736 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); 737 737 } else if (is_swap_pte(ptent)) { 738 738 ptent = pte_swp_clear_soft_dirty(ptent); 739 + } else if (pte_file(ptent)) { 740 + ptent = pte_file_clear_soft_dirty(ptent); 739 741 } 740 742 741 743 set_pte_at(vma->vm_mm, addr, pte, ptent);
+15
include/asm-generic/pgtable.h
··· 432 432 { 433 433 return pte; 434 434 } 435 + 436 + static inline pte_t pte_file_clear_soft_dirty(pte_t pte) 437 + { 438 + return pte; 439 + } 440 + 441 + static inline pte_t pte_file_mksoft_dirty(pte_t pte) 442 + { 443 + return pte; 444 + } 445 + 446 + static inline int pte_file_soft_dirty(pte_t pte) 447 + { 448 + return 0; 449 + } 435 450 #endif 436 451 437 452 #ifndef __HAVE_PFNMAP_TRACKING
+9 -4
mm/fremap.c
··· 57 57 unsigned long addr, unsigned long pgoff, pgprot_t prot) 58 58 { 59 59 int err = -ENOMEM; 60 - pte_t *pte; 60 + pte_t *pte, ptfile; 61 61 spinlock_t *ptl; 62 62 63 63 pte = get_locked_pte(mm, addr, &ptl); 64 64 if (!pte) 65 65 goto out; 66 66 67 - if (!pte_none(*pte)) 68 - zap_pte(mm, vma, addr, pte); 67 + ptfile = pgoff_to_pte(pgoff); 69 68 70 - set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); 69 + if (!pte_none(*pte)) { 70 + if (pte_present(*pte) && pte_soft_dirty(*pte)) 71 + pte_file_mksoft_dirty(ptfile); 72 + zap_pte(mm, vma, addr, pte); 73 + } 74 + 75 + set_pte_at(mm, addr, pte, ptfile); 71 76 /* 72 77 * We don't need to run update_mmu_cache() here because the "file pte" 73 78 * being installed by install_file_pte() is not a real pte - it's a
+8 -3
mm/memory.c
··· 1141 1141 continue; 1142 1142 if (unlikely(details) && details->nonlinear_vma 1143 1143 && linear_page_index(details->nonlinear_vma, 1144 - addr) != page->index) 1145 - set_pte_at(mm, addr, pte, 1146 - pgoff_to_pte(page->index)); 1144 + addr) != page->index) { 1145 + pte_t ptfile = pgoff_to_pte(page->index); 1146 + if (pte_soft_dirty(ptent)) 1147 + pte_file_mksoft_dirty(ptfile); 1148 + set_pte_at(mm, addr, pte, ptfile); 1149 + } 1147 1150 if (PageAnon(page)) 1148 1151 rss[MM_ANONPAGES]--; 1149 1152 else { ··· 3413 3410 entry = mk_pte(page, vma->vm_page_prot); 3414 3411 if (flags & FAULT_FLAG_WRITE) 3415 3412 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 3413 + else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) 3414 + pte_mksoft_dirty(entry); 3416 3415 if (anon) { 3417 3416 inc_mm_counter_fast(mm, MM_ANONPAGES); 3418 3417 page_add_new_anon_rmap(page, vma, address);
+6 -2
mm/rmap.c
··· 1405 1405 pteval = ptep_clear_flush(vma, address, pte); 1406 1406 1407 1407 /* If nonlinear, store the file page offset in the pte. */ 1408 - if (page->index != linear_page_index(vma, address)) 1409 - set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); 1408 + if (page->index != linear_page_index(vma, address)) { 1409 + pte_t ptfile = pgoff_to_pte(page->index); 1410 + if (pte_soft_dirty(pteval)) 1411 + pte_file_mksoft_dirty(ptfile); 1412 + set_pte_at(mm, address, pte, ptfile); 1413 + } 1410 1414 1411 1415 /* Move the dirty bit to the physical page now the pte is gone. */ 1412 1416 if (pte_dirty(pteval))