Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: prevent KSM from breaking VMA merging for new VMAs

If a user wishes to enable KSM mergeability for an entire process and all
fork/exec'd processes that come after it, they use the prctl()
PR_SET_MEMORY_MERGE operation.

This defaults all newly mapped VMAs to have the VM_MERGEABLE VMA flag set
(in order to indicate they are KSM mergeable), as well as setting this
flag for all existing VMAs and propagating this across fork/exec.

However it also breaks VMA merging for new VMAs, both in the process and
all forked (and fork/exec'd) child processes.

This is because when a new mapping is proposed, the flags specified will
never have VM_MERGEABLE set. However all adjacent VMAs will already have
VM_MERGEABLE set, rendering VMAs unmergeable by default.

To work around this, we try to set the VM_MERGEABLE flag prior to
attempting a merge. In the case of brk() this can always be done.

However on mmap() things are more complicated - while KSM is not supported
for MAP_SHARED file-backed mappings, it is supported for MAP_PRIVATE
file-backed mappings.

These mappings may have deprecated .mmap() callbacks specified which
could, in theory, adjust flags and thus KSM eligibility.

So we check to determine whether this is possible. If not, we set
VM_MERGEABLE prior to the merge attempt on mmap(), otherwise we retain the
previous behaviour.

This fixes VMA merging for all new anonymous mappings, which covers the
majority of real-world cases, so we should see a significant improvement
in VMA mergeability.

For MAP_PRIVATE file-backed mappings, those which implement the
.mmap_prepare() hook and shmem are both known to be safe, so we allow
these, disallowing all other cases.

Also add stubs for newly introduced function invocations to VMA userland
testing.

[lorenzo.stoakes@oracle.com: correctly invoke late KSM check after mmap hook]
Link: https://lkml.kernel.org/r/5861f8f6-cf5a-4d82-a062-139fb3f9cddb@lucifer.local
Link: https://lkml.kernel.org/r/3ba660af716d87a18ca5b4e635f2101edeb56340.1748537921.git.lorenzo.stoakes@oracle.com
Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process") # please no backport!
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Xu Xin <xu.xin16@zte.com.cn>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Stefan Roesch <shr@devkernel.io>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Authored by Lorenzo Stoakes; committed by Andrew Morton.
cf7e7a35 b914c47d

+77 -11
+5 -3
include/linux/ksm.h
··· 17 17 #ifdef CONFIG_KSM 18 18 int ksm_madvise(struct vm_area_struct *vma, unsigned long start, 19 19 unsigned long end, int advice, unsigned long *vm_flags); 20 - 21 - void ksm_add_vma(struct vm_area_struct *vma); 20 + vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, 21 + vm_flags_t vm_flags); 22 22 int ksm_enable_merge_any(struct mm_struct *mm); 23 23 int ksm_disable_merge_any(struct mm_struct *mm); 24 24 int ksm_disable(struct mm_struct *mm); ··· 97 97 98 98 #else /* !CONFIG_KSM */ 99 99 100 - static inline void ksm_add_vma(struct vm_area_struct *vma) 100 + static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, 101 + const struct file *file, vm_flags_t vm_flags) 101 102 { 103 + return vm_flags; 102 104 } 103 105 104 106 static inline int ksm_disable(struct mm_struct *mm)
+12 -6
mm/ksm.c
··· 2731 2731 return 0; 2732 2732 } 2733 2733 /** 2734 - * ksm_add_vma - Mark vma as mergeable if compatible 2734 + * ksm_vma_flags - Update VMA flags to mark as mergeable if compatible 2735 2735 * 2736 - * @vma: Pointer to vma 2736 + * @mm: Proposed VMA's mm_struct 2737 + * @file: Proposed VMA's file-backed mapping, if any. 2738 + * @vm_flags: Proposed VMA's flags. 2739 + * 2740 + * Returns: @vm_flags possibly updated to mark mergeable. 2737 2741 */ 2738 - void ksm_add_vma(struct vm_area_struct *vma) 2742 + vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, 2743 + vm_flags_t vm_flags) 2739 2744 { 2740 - struct mm_struct *mm = vma->vm_mm; 2745 + if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) && 2746 + __ksm_should_add_vma(file, vm_flags)) 2747 + vm_flags |= VM_MERGEABLE; 2741 2748 2742 - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) 2743 - __ksm_add_vma(vma); 2749 + return vm_flags; 2744 2750 } 2745 2751 2746 2752 static void ksm_add_vmas(struct mm_struct *mm)
+49 -2
mm/vma.c
··· 32 32 struct vma_munmap_struct vms; 33 33 struct ma_state mas_detach; 34 34 struct maple_tree mt_detach; 35 + 36 + /* Determine if we can check KSM flags early in mmap() logic. */ 37 + bool check_ksm_early; 35 38 }; 36 39 37 40 #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \ ··· 2323 2320 vms_complete_munmap_vmas(vms, mas_detach); 2324 2321 } 2325 2322 2323 + static void update_ksm_flags(struct mmap_state *map) 2324 + { 2325 + map->flags = ksm_vma_flags(map->mm, map->file, map->flags); 2326 + } 2327 + 2326 2328 /* 2327 2329 * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be 2328 2330 * unmapped once the map operation is completed, check limits, account mapping ··· 2432 2424 !(map->flags & VM_MAYWRITE) && 2433 2425 (vma->vm_flags & VM_MAYWRITE)); 2434 2426 2427 + map->file = vma->vm_file; 2435 2428 map->flags = vma->vm_flags; 2436 2429 2437 2430 return 0; ··· 2482 2473 if (error) 2483 2474 goto free_iter_vma; 2484 2475 2476 + if (!map->check_ksm_early) { 2477 + update_ksm_flags(map); 2478 + vm_flags_init(vma, map->flags); 2479 + } 2480 + 2485 2481 #ifdef CONFIG_SPARC64 2486 2482 /* TODO: Fix SPARC ADI! */ 2487 2483 WARN_ON_ONCE(!arch_validate_flags(map->flags)); ··· 2504 2490 */ 2505 2491 if (!vma_is_anonymous(vma)) 2506 2492 khugepaged_enter_vma(vma, map->flags); 2507 - ksm_add_vma(vma); 2508 2493 *vmap = vma; 2509 2494 return 0; 2510 2495 ··· 2606 2593 vma->vm_private_data = map->vm_private_data; 2607 2594 } 2608 2595 2596 + /* 2597 + * Are we guaranteed no driver can change state such as to preclude KSM merging? 2598 + * If so, let's set the KSM mergeable flag early so we don't break VMA merging. 2599 + */ 2600 + static bool can_set_ksm_flags_early(struct mmap_state *map) 2601 + { 2602 + struct file *file = map->file; 2603 + 2604 + /* Anonymous mappings have no driver which can change them. 
*/ 2605 + if (!file) 2606 + return true; 2607 + 2608 + /* 2609 + * If .mmap_prepare() is specified, then the driver will have already 2610 + * manipulated state prior to updating KSM flags. So no need to worry 2611 + * about mmap callbacks modifying VMA flags after the KSM flag has been 2612 + * updated here, which could otherwise affect KSM eligibility. 2613 + */ 2614 + if (file->f_op->mmap_prepare) 2615 + return true; 2616 + 2617 + /* shmem is safe. */ 2618 + if (shmem_file(file)) 2619 + return true; 2620 + 2621 + /* Any other .mmap callback is not safe. */ 2622 + return false; 2623 + } 2624 + 2609 2625 static unsigned long __mmap_region(struct file *file, unsigned long addr, 2610 2626 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, 2611 2627 struct list_head *uf) ··· 2646 2604 VMA_ITERATOR(vmi, mm, addr); 2647 2605 MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); 2648 2606 2607 + map.check_ksm_early = can_set_ksm_flags_early(&map); 2608 + 2649 2609 error = __mmap_prepare(&map, uf); 2650 2610 if (!error && have_mmap_prepare) 2651 2611 error = call_mmap_prepare(&map); 2652 2612 if (error) 2653 2613 goto abort_munmap; 2614 + 2615 + if (map.check_ksm_early) 2616 + update_ksm_flags(&map); 2654 2617 2655 2618 /* Attempt to merge with adjacent VMAs... */ 2656 2619 if (map.prev || map.next) { ··· 2768 2721 * Note: This happens *after* clearing old mappings in some code paths. 2769 2722 */ 2770 2723 flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2724 + flags = ksm_vma_flags(mm, NULL, flags); 2771 2725 if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) 2772 2726 return -ENOMEM; 2773 2727 ··· 2812 2764 2813 2765 mm->map_count++; 2814 2766 validate_mm(mm); 2815 - ksm_add_vma(vma); 2816 2767 out: 2817 2768 perf_event_mmap(vma); 2818 2769 mm->total_vm += len >> PAGE_SHIFT;
+11
tools/testing/vma/vma_internal.h
··· 1484 1484 fput(file); 1485 1485 } 1486 1486 1487 + static inline bool shmem_file(struct file *) 1488 + { 1489 + return false; 1490 + } 1491 + 1492 + static inline vm_flags_t ksm_vma_flags(const struct mm_struct *, const struct file *, 1493 + vm_flags_t vm_flags) 1494 + { 1495 + return vm_flags; 1496 + } 1497 + 1487 1498 #endif /* __MM_VMA_INTERNAL_H */