Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

Avoid allocating a new VMA when a VMA modification can occur. When a
brk() can expand or contract a VMA, then the single store operation will
only modify one index of the maple tree instead of causing a node to split
or coalesce. This avoids unnecessary allocations/frees of maple tree
nodes and VMAs.

Move some limit & flag verifications out of the do_brk_flags() function to
use only relevant checks in the code path of brk() and vm_brk_flags().

Set the vma to check if it can expand in vm_brk_flags() if extra criteria
are met.

Drop userfaultfd from do_brk_flags() path and only use it in
vm_brk_flags() path since that is the only place a munmap will happen.

Add a wrapper for munmap for the brk case called do_brk_munmap().

Link: https://lkml.kernel.org/r/20220906194824.2110408-23-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Tested-by: Yu Zhao <yuzhao@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Liam R. Howlett and committed by
Andrew Morton
2e7ce7d3 94d815b2

+177 -60
+177 -60
mm/mmap.c
··· 147 147 return next; 148 148 } 149 149 150 - static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, 151 - struct list_head *uf); 150 + /* 151 + * check_brk_limits() - Use platform specific check of range & verify mlock 152 + * limits. 153 + * @addr: The address to check 154 + * @len: The size of increase. 155 + * 156 + * Return: 0 on success. 157 + */ 158 + static int check_brk_limits(unsigned long addr, unsigned long len) 159 + { 160 + unsigned long mapped_addr; 161 + 162 + mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 163 + if (IS_ERR_VALUE(mapped_addr)) 164 + return mapped_addr; 165 + 166 + return mlock_future_check(current->mm, current->mm->def_flags, len); 167 + } 168 + static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, 169 + unsigned long newbrk, unsigned long oldbrk, 170 + struct list_head *uf); 171 + static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, 172 + unsigned long addr, unsigned long request, 173 + unsigned long flags); 152 174 SYSCALL_DEFINE1(brk, unsigned long, brk) 153 175 { 154 176 unsigned long newbrk, oldbrk, origbrk; 155 177 struct mm_struct *mm = current->mm; 156 - struct vm_area_struct *next; 178 + struct vm_area_struct *brkvma, *next = NULL; 157 179 unsigned long min_brk; 158 180 bool populate; 159 181 bool downgraded = false; 160 182 LIST_HEAD(uf); 183 + MA_STATE(mas, &mm->mm_mt, 0, 0); 161 184 162 185 if (mmap_write_lock_killable(mm)) 163 186 return -EINTR; ··· 222 199 223 200 /* 224 201 * Always allow shrinking brk. 225 - * __do_munmap() may downgrade mmap_lock to read. 202 + * do_brk_munmap() may downgrade mmap_lock to read. 226 203 */ 227 204 if (brk <= mm->brk) { 228 205 int ret; 229 206 207 + /* Search one past newbrk */ 208 + mas_set(&mas, newbrk); 209 + brkvma = mas_find(&mas, oldbrk); 210 + BUG_ON(brkvma == NULL); 211 + if (brkvma->vm_start >= oldbrk) 212 + goto out; /* mapping intersects with an existing non-brk vma. 
*/ 230 213 /* 231 - * mm->brk must to be protected by write mmap_lock so update it 232 - * before downgrading mmap_lock. When __do_munmap() fails, 233 - * mm->brk will be restored from origbrk. 214 + * mm->brk must be protected by write mmap_lock. 215 + * do_brk_munmap() may downgrade the lock, so update it 216 + * before calling do_brk_munmap(). 234 217 */ 235 218 mm->brk = brk; 236 - ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); 237 - if (ret < 0) { 238 - mm->brk = origbrk; 239 - goto out; 240 - } else if (ret == 1) { 219 + mas.last = oldbrk - 1; 220 + ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); 221 + if (ret == 1) { 241 222 downgraded = true; 242 - } 243 - goto success; 223 + goto success; 224 + } else if (!ret) 225 + goto success; 226 + 227 + mm->brk = origbrk; 228 + goto out; 244 229 } 245 230 246 - /* Check against existing mmap mappings. */ 247 - next = find_vma(mm, oldbrk); 231 + if (check_brk_limits(oldbrk, newbrk - oldbrk)) 232 + goto out; 233 + 234 + /* 235 + * Only check if the next VMA is within the stack_guard_gap of the 236 + * expansion area 237 + */ 238 + mas_set(&mas, oldbrk); 239 + next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap); 248 240 if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) 249 241 goto out; 250 242 243 + brkvma = mas_prev(&mas, mm->start_brk); 251 244 /* Ok, looks good - let it rip. */ 252 - if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) 245 + if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) 253 246 goto out; 247 + 254 248 mm->brk = brk; 255 249 256 250 success: ··· 2802 2762 } 2803 2763 2804 2764 /* 2805 - * this is really a simplified "do_mmap". it only handles 2806 - * anonymous maps. eventually we may be able to do some 2807 - * brk-specific accounting here. 2765 + * brk_munmap() - Unmap a parital vma. 2766 + * @mas: The maple tree state. 
2767 + * @vma: The vma to be modified 2768 + * @newbrk: the start of the address to unmap 2769 + * @oldbrk: The end of the address to unmap 2770 + * @uf: The userfaultfd list_head 2771 + * 2772 + * Returns: 1 on success. 2773 + * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if 2774 + * possible. 2808 2775 */ 2809 - static int do_brk_flags(unsigned long addr, unsigned long len, 2810 - unsigned long flags, struct list_head *uf) 2776 + static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, 2777 + unsigned long newbrk, unsigned long oldbrk, 2778 + struct list_head *uf) 2779 + { 2780 + struct mm_struct *mm = vma->vm_mm; 2781 + int ret; 2782 + 2783 + arch_unmap(mm, newbrk, oldbrk); 2784 + ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true); 2785 + validate_mm_mt(mm); 2786 + return ret; 2787 + } 2788 + 2789 + /* 2790 + * do_brk_flags() - Increase the brk vma if the flags match. 2791 + * @mas: The maple tree state. 2792 + * @addr: The start address 2793 + * @len: The length of the increase 2794 + * @vma: The vma, 2795 + * @flags: The VMA Flags 2796 + * 2797 + * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags 2798 + * do not match then create a new anonymous VMA. Eventually we may be able to 2799 + * do some brk-specific accounting here. 2800 + */ 2801 + static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, 2802 + unsigned long addr, unsigned long len, 2803 + unsigned long flags) 2811 2804 { 2812 2805 struct mm_struct *mm = current->mm; 2813 - struct vm_area_struct *vma, *prev; 2814 - pgoff_t pgoff = addr >> PAGE_SHIFT; 2815 - int error; 2816 - unsigned long mapped_addr; 2806 + struct vm_area_struct *prev = NULL; 2807 + 2817 2808 validate_mm_mt(mm); 2818 - 2819 - /* Until we need other flags, refuse anything except VM_EXEC. 
*/ 2820 - if ((flags & (~VM_EXEC)) != 0) 2821 - return -EINVAL; 2809 + /* 2810 + * Check against address space limits by the changed size 2811 + * Note: This happens *after* clearing old mappings in some code paths. 2812 + */ 2822 2813 flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2823 - 2824 - mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 2825 - if (IS_ERR_VALUE(mapped_addr)) 2826 - return mapped_addr; 2827 - 2828 - error = mlock_future_check(mm, mm->def_flags, len); 2829 - if (error) 2830 - return error; 2831 - 2832 - /* Clear old maps, set up prev and uf */ 2833 - if (munmap_vma_range(mm, addr, len, &prev, uf)) 2834 - return -ENOMEM; 2835 - 2836 - /* Check against address space limits *after* clearing old maps... */ 2837 2814 if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) 2838 2815 return -ENOMEM; 2839 2816 ··· 2860 2803 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) 2861 2804 return -ENOMEM; 2862 2805 2863 - /* Can we just expand an old private anonymous mapping? */ 2864 - vma = vma_merge(mm, prev, addr, addr + len, flags, 2865 - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); 2866 - if (vma) 2867 - goto out; 2868 - 2869 2806 /* 2870 - * create a vma struct for an anonymous mapping 2807 + * Expand the existing vma if possible; Note that singular lists do not 2808 + * occur after forking, so the expand will only happen on new VMAs. 
2871 2809 */ 2872 - vma = vm_area_alloc(mm); 2873 - if (!vma) { 2874 - vm_unacct_memory(len >> PAGE_SHIFT); 2875 - return -ENOMEM; 2810 + if (vma && 2811 + (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) && 2812 + ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) { 2813 + mas->index = vma->vm_start; 2814 + mas->last = addr + len - 1; 2815 + vma_adjust_trans_huge(vma, addr, addr + len, 0); 2816 + if (vma->anon_vma) { 2817 + anon_vma_lock_write(vma->anon_vma); 2818 + anon_vma_interval_tree_pre_update_vma(vma); 2819 + } 2820 + vma->vm_end = addr + len; 2821 + vma->vm_flags |= VM_SOFTDIRTY; 2822 + if (mas_store_gfp(mas, vma, GFP_KERNEL)) 2823 + goto mas_expand_failed; 2824 + 2825 + if (vma->anon_vma) { 2826 + anon_vma_interval_tree_post_update_vma(vma); 2827 + anon_vma_unlock_write(vma->anon_vma); 2828 + } 2829 + khugepaged_enter_vma(vma, flags); 2830 + goto out; 2876 2831 } 2832 + prev = vma; 2833 + 2834 + /* create a vma struct for an anonymous mapping */ 2835 + vma = vm_area_alloc(mm); 2836 + if (!vma) 2837 + goto vma_alloc_fail; 2877 2838 2878 2839 vma_set_anonymous(vma); 2879 2840 vma->vm_start = addr; 2880 2841 vma->vm_end = addr + len; 2881 - vma->vm_pgoff = pgoff; 2842 + vma->vm_pgoff = addr >> PAGE_SHIFT; 2882 2843 vma->vm_flags = flags; 2883 2844 vma->vm_page_prot = vm_get_page_prot(flags); 2884 - if (vma_link(mm, vma, prev)) 2885 - goto no_vma_link; 2845 + mas_set_range(mas, vma->vm_start, addr + len - 1); 2846 + if (mas_store_gfp(mas, vma, GFP_KERNEL)) 2847 + goto mas_store_fail; 2886 2848 2849 + if (!prev) 2850 + prev = mas_prev(mas, 0); 2851 + 2852 + __vma_link_list(mm, vma, prev); 2853 + mm->map_count++; 2887 2854 out: 2888 2855 perf_event_mmap(vma); 2889 2856 mm->total_vm += len >> PAGE_SHIFT; ··· 2918 2837 validate_mm_mt(mm); 2919 2838 return 0; 2920 2839 2921 - no_vma_link: 2840 + mas_store_fail: 2922 2841 vm_area_free(vma); 2842 + vma_alloc_fail: 2843 + vm_unacct_memory(len >> PAGE_SHIFT); 2844 + return -ENOMEM; 2845 + 2846 + mas_expand_failed: 
2847 + if (vma->anon_vma) { 2848 + anon_vma_interval_tree_post_update_vma(vma); 2849 + anon_vma_unlock_write(vma->anon_vma); 2850 + } 2923 2851 return -ENOMEM; 2924 2852 } 2925 2853 2926 2854 int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) 2927 2855 { 2928 2856 struct mm_struct *mm = current->mm; 2857 + struct vm_area_struct *vma = NULL; 2929 2858 unsigned long len; 2930 2859 int ret; 2931 2860 bool populate; 2932 2861 LIST_HEAD(uf); 2862 + MA_STATE(mas, &mm->mm_mt, addr, addr); 2933 2863 2934 2864 len = PAGE_ALIGN(request); 2935 2865 if (len < request) ··· 2951 2859 if (mmap_write_lock_killable(mm)) 2952 2860 return -EINTR; 2953 2861 2954 - ret = do_brk_flags(addr, len, flags, &uf); 2862 + /* Until we need other flags, refuse anything except VM_EXEC. */ 2863 + if ((flags & (~VM_EXEC)) != 0) 2864 + return -EINVAL; 2865 + 2866 + ret = check_brk_limits(addr, len); 2867 + if (ret) 2868 + goto limits_failed; 2869 + 2870 + if (find_vma_intersection(mm, addr, addr + len)) 2871 + ret = do_munmap(mm, addr, len, &uf); 2872 + 2873 + if (ret) 2874 + goto munmap_failed; 2875 + 2876 + vma = mas_prev(&mas, 0); 2877 + if (!vma || vma->vm_end != addr || vma_policy(vma) || 2878 + !can_vma_merge_after(vma, flags, NULL, NULL, 2879 + addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) 2880 + vma = NULL; 2881 + 2882 + ret = do_brk_flags(&mas, vma, addr, len, flags); 2955 2883 populate = ((mm->def_flags & VM_LOCKED) != 0); 2956 2884 mmap_write_unlock(mm); 2957 2885 userfaultfd_unmap_complete(mm, &uf); 2958 2886 if (populate && !ret) 2959 2887 mm_populate(addr, len); 2888 + return ret; 2889 + 2890 + munmap_failed: 2891 + limits_failed: 2892 + mmap_write_unlock(mm); 2960 2893 return ret; 2961 2894 } 2962 2895 EXPORT_SYMBOL(vm_brk_flags);