
mm: track vma changes with VM_SOFTDIRTY bit

Pavel reported that if a vma area gets unmapped and then mapped (or
expanded) in place, the soft dirty tracker won't be able to recognize this
situation, since it works at the pte level and the ptes get zapped on
unmap, losing the soft dirty bit of course.

So to resolve this situation we need to track such actions at the vma
level, which is where the VM_SOFTDIRTY flag comes in. When a new vma area
is created (or an old one is expanded) we set this bit, and it stays set
until the application asks for the soft dirty bits to be cleared.

Thus a user space application tracking memory changes can now detect that
a vma area has been renewed.
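[Editor's note: for illustration, a minimal user-space sketch of what this
enables, assuming CONFIG_MEM_SOFT_DIRTY and the bit layout documented in
Documentation/vm/pagemap.txt (error handling mostly omitted). It maps a
page, clears soft-dirty state, unmaps and remaps the same address, then
checks that the kernel reports the renewed area as soft-dirty even though
its ptes were zapped.]

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static int page_soft_dirty(void *addr)
{
	uint64_t ent = 0;
	int fd = open("/proc/self/pagemap", O_RDONLY);

	pread(fd, &ent, sizeof(ent),
	      ((uintptr_t)addr / getpagesize()) * sizeof(ent));
	close(fd);
	return (ent >> 55) & 1;		/* bit 55: soft-dirty */
}

static void clear_soft_dirty(void)
{
	int fd = open("/proc/self/clear_refs", O_WRONLY);

	write(fd, "4", 1);		/* "4" drops all soft-dirty bits */
	close(fd);
}

int main(void)
{
	size_t len = getpagesize();
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	p[0] = 1;			/* fault the page in */
	clear_soft_dirty();
	printf("after clear: %d\n", page_soft_dirty(p));	/* 0 */

	munmap(p, len);			/* ptes are zapped here... */
	mmap(p, len, PROT_READ | PROT_WRITE,
	     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
	/* ...but the new vma carries VM_SOFTDIRTY, so: */
	printf("after remap: %d\n", page_soft_dirty(p));	/* 1 */
	return 0;
}

Without the vma-level flag the second read would report 0, since the
freshly mapped page has no pte yet.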

Reported-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Rob Landley <rob@landley.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Cyrill Gorcunov, committed by Linus Torvalds
d9104d1c 3b11f0aa

5 files changed: +61 -12

Documentation/vm/soft-dirty.txt (+7)

@@ -28,6 +28,13 @@
 the kernel does is finds this fact out and puts both writable and soft-dirty
 bits on the PTE.
 
+While in most cases tracking memory changes by #PF-s is more than enough
+there is still a scenario when we can lose soft dirty bits -- a task
+unmaps a previously mapped memory region and then maps a new one at exactly
+the same place. When unmap is called, the kernel internally clears PTE values
+including soft dirty bits. To notify user space application about such
+memory region renewal the kernel always marks new memory regions (and
+expanded regions) as soft dirty.
 
 This feature is actively used by the checkpoint-restore project. You
 can find more details about it on http://criu.org
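[Editor's note: the tracking loop this documentation alludes to looks
roughly like the following in user space; a sketch only, where
count_soft_dirty and the fixed scan range are illustrative, while the
/proc paths and bit layout come from the proc documentation.]

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Count pages in [start, end) of task `pid` that were written since the
 * last "echo 4 > /proc/<pid>/clear_refs" -- or that live in a renewed
 * (VM_SOFTDIRTY) vma, which after this patch also reads as soft-dirty.
 */
static long count_soft_dirty(pid_t pid, uintptr_t start, uintptr_t end)
{
	char path[64];
	long psz = sysconf(_SC_PAGESIZE), n = 0;
	int fd;

	snprintf(path, sizeof(path), "/proc/%d/pagemap", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	for (uintptr_t a = start; a < end; a += psz) {
		uint64_t ent = 0;

		if (pread(fd, &ent, sizeof(ent),
			  (a / psz) * sizeof(ent)) == sizeof(ent) &&
		    ((ent >> 55) & 1))
			n++;
	}
	close(fd);
	return n;
}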
fs/exec.c (+1 -1)

@@ -266,7 +266,7 @@
 	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 	vma->vm_end = STACK_TOP_MAX;
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
-	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
+	vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 
fs/proc/task_mmu.c (+36 -10)

@@ -740,6 +740,9 @@
 		ptent = pte_file_clear_soft_dirty(ptent);
 	}
 
+	if (vma->vm_flags & VM_SOFTDIRTY)
+		vma->vm_flags &= ~VM_SOFTDIRTY;
+
 	set_pte_at(vma->vm_mm, addr, pte, ptent);
 #endif
 }
@@ -952,13 +949,15 @@
 		if (is_migration_entry(entry))
 			page = migration_entry_to_page(entry);
 	} else {
-		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
+		if (vma->vm_flags & VM_SOFTDIRTY)
+			flags2 |= __PM_SOFT_DIRTY;
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
 		return;
 	}
 
 	if (page && !PageAnon(page))
 		flags |= PM_FILE;
-	if (pte_soft_dirty(pte))
+	if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte))
 		flags2 |= __PM_SOFT_DIRTY;
 
 	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
@@ -979,7 +974,7 @@
 		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
 				| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
 	else
-		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
 }
 #else
 static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
@@ -1002,6 +997,10 @@
 	if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
 		int pmd_flags2;
 
-		pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
+		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
+			pmd_flags2 = __PM_SOFT_DIRTY;
+		else
+			pmd_flags2 = 0;
+
 		for (; addr != end; addr += PAGE_SIZE) {
 			unsigned long offset;
@@ -1024,12 +1015,17 @@
 	if (pmd_trans_unstable(pmd))
 		return 0;
 	for (; addr != end; addr += PAGE_SIZE) {
+		int flags2;
 
 		/* check to see if we've left 'vma' behind
 		 * and need a new, higher one */
 		if (vma && (addr >= vma->vm_end)) {
 			vma = find_vma(walk->mm, addr);
-			pme = make_pme(PM_NOT_PRESENT(pm->v2));
+			if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+				flags2 = __PM_SOFT_DIRTY;
+			else
+				flags2 = 0;
+			pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
 		}
 
 		/* check that 'vma' actually covers this address,
@@ -1058,13 +1044,15 @@
 
 #ifdef CONFIG_HUGETLB_PAGE
 static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
-					pte_t pte, int offset)
+					pte_t pte, int offset, int flags2)
 {
 	if (pte_present(pte))
-		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
-				| PM_STATUS2(pm->v2, 0) | PM_PRESENT);
+		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
+				PM_STATUS2(pm->v2, flags2) |
+				PM_PRESENT);
 	else
-		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2) |
+				PM_STATUS2(pm->v2, flags2));
 }
 
 /* This function walks within one hugetlb entry in the single call */
@@ -1075,12 +1059,22 @@
 		struct mm_walk *walk)
 {
 	struct pagemapread *pm = walk->private;
+	struct vm_area_struct *vma;
 	int err = 0;
+	int flags2;
 	pagemap_entry_t pme;
+
+	vma = find_vma(walk->mm, addr);
+	WARN_ON_ONCE(!vma);
+
+	if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+		flags2 = __PM_SOFT_DIRTY;
+	else
+		flags2 = 0;
 
 	for (; addr != end; addr += PAGE_SIZE) {
 		int offset = (addr & ~hmask) >> PAGE_SHIFT;
-		huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
+		huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
 		err = add_to_pagemap(addr, &pme, pm);
 		if (err)
 			return err;
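[Editor's note: for readers decoding these entries from user space, the
layout that PM_STATUS2()/PM_PRESENT produce is documented in
Documentation/vm/pagemap.txt. A hedged summary as C constants; the
PM_ENTRY_* names are illustrative, not kernel identifiers.]

#include <stdint.h>

#define PM_ENTRY_PRESENT	(1ULL << 63)	/* page is mapped */
#define PM_ENTRY_SWAPPED	(1ULL << 62)	/* page is swapped out */
#define PM_ENTRY_SOFT_DIRTY	(1ULL << 55)	/* pte soft-dirty, or vma has VM_SOFTDIRTY */
#define PM_ENTRY_PFN_MASK	((1ULL << 55) - 1)	/* bits 0-54: pfn if present */

/* Soft-dirty is now reported even for not-present entries, which is
 * exactly what the PM_NOT_PRESENT | PM_STATUS2 changes above add. */
static inline int entry_soft_dirty(uint64_t ent)
{
	return !!(ent & PM_ENTRY_SOFT_DIRTY);
}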
include/linux/mm.h (+6)

@@ -115,6 +115,12 @@
 #define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
 #define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */
 
+#ifdef CONFIG_MEM_SOFT_DIRTY
+# define VM_SOFTDIRTY	0x08000000	/* Not soft dirty clean area */
+#else
+# define VM_SOFTDIRTY	0
+#endif
+
 #define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
 #define VM_HUGEPAGE	0x20000000	/* MADV_HUGEPAGE marked this vma */
 #define VM_NOHUGEPAGE	0x40000000	/* MADV_NOHUGEPAGE marked this vma */
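[Editor's note: the #else branch defining VM_SOFTDIRTY to 0 is what lets
the call sites below set the flag unconditionally -- with
CONFIG_MEM_SOFT_DIRTY disabled the OR and the tests fold away at compile
time. A standalone illustration of that pattern:]

#include <stdio.h>

/* #define CONFIG_MEM_SOFT_DIRTY */
#ifdef CONFIG_MEM_SOFT_DIRTY
# define VM_SOFTDIRTY 0x08000000UL
#else
# define VM_SOFTDIRTY 0UL
#endif

int main(void)
{
	unsigned long vm_flags = 0;

	vm_flags |= VM_SOFTDIRTY;	/* a no-op when compiled out */
	printf("soft-dirty tracked: %s\n",
	       (vm_flags & VM_SOFTDIRTY) ? "yes" : "no");
	return 0;
}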
mm/mmap.c (+11 -1)

@@ -1609,6 +1609,15 @@
 	if (file)
 		uprobe_mmap(vma);
 
+	/*
+	 * New (or expanded) vma always gets soft dirty status.
+	 * Otherwise user-space soft-dirty page tracker won't
+	 * be able to distinguish the situation when a vma area
+	 * is unmapped and then mapped again in place (which must
+	 * be treated as a completely new data area).
+	 */
+	vma->vm_flags |= VM_SOFTDIRTY;
+
 	return addr;
 
 unmap_and_free_vma:
@@ -2661,6 +2652,7 @@
 	mm->total_vm += len >> PAGE_SHIFT;
 	if (flags & VM_LOCKED)
 		mm->locked_vm += (len >> PAGE_SHIFT);
+	vma->vm_flags |= VM_SOFTDIRTY;
 	return addr;
 }
 
@@ -2926,7 +2916,7 @@
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 
-	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
+	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 
 	vma->vm_ops = &special_mapping_vmops;
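[Editor's note: the do_brk() hunk means heap growth is reported the same
way. A quick hedged check from user space -- no error handling, and it
assumes the same pagemap conventions as the sketches above:]

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = sbrk(psz);		/* expands the heap via do_brk() */
	uint64_t ent = 0;
	int fd = open("/proc/self/pagemap", O_RDONLY);

	pread(fd, &ent, sizeof(ent), ((uintptr_t)p / psz) * sizeof(ent));
	close(fd);
	/* Even before the new page is touched, bit 55 reflects VM_SOFTDIRTY. */
	printf("heap page soft-dirty: %d\n", (int)((ent >> 55) & 1));
	return 0;
}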