Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: add MAP_DROPPABLE for designating always lazily freeable mappings

The vDSO getrandom() implementation works with a buffer allocated with a
new system call that has certain requirements:

- It shouldn't be written to core dumps.
* Easy: VM_DONTDUMP.
- It should be zeroed on fork.
* Easy: VM_WIPEONFORK.

- It shouldn't be written to swap.
* Uh-oh: mlock is rlimited.
* Uh-oh: mlock isn't inherited by forks.

- It shouldn't reserve actual memory, but it also shouldn't crash when
page faulting in memory if none is available
* Uh-oh: VM_NORESERVE means segfaults.

It turns out that the vDSO getrandom() function has three really nice
characteristics that we can exploit to solve this problem:

1) Due to being wiped during fork(), the vDSO code is already robust to
having the contents of the pages it reads zeroed out midway through
the function's execution.

2) In the absolute worst case of whatever contingency we're coding for,
we have the option to fallback to the getrandom() syscall, and
everything is fine.

3) The buffers the function uses are only ever useful for a maximum of
60 seconds -- a sort of cache, rather than a long term allocation.

These characteristics mean that we can introduce VM_DROPPABLE, which
has the following semantics:

a) It never is written out to swap.
b) Under memory pressure, mm can just drop the pages (so that they're
zero when read back again).
c) It is inherited by fork.
d) It doesn't count against the mlock budget, since nothing is locked.
e) If there's not enough memory to service a page fault, it's not fatal,
and no signal is sent.

This way, allocations used by vDSO getrandom() can use:

VM_DROPPABLE | VM_DONTDUMP | VM_WIPEONFORK | VM_NORESERVE

And there will be no problem with OOMing, crashing on overcommitment,
using memory when not in use, not wiping on fork(), coredumps, or
writing out to swap.

In order to let vDSO getrandom() use this, expose these via mmap(2) as
MAP_DROPPABLE.

Note that this involves removing the MADV_FREE special case from
sort_folio(), which according to Yu Zhao is unnecessary and will simply
result in an extra call to shrink_folio_list() in the worst case. The
chunk removed re-enables the swapbacked flag, which we don't want for
VM_DROPPABLE, and we can't conditionalize it here because there isn't a
vma reference available.

Finally, the provided self test ensures that this is working as desired.

Cc: linux-mm@kvack.org
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>

+146 -15
+1
fs/proc/task_mmu.c
··· 708 708 [ilog2(VM_SHADOW_STACK)] = "ss", 709 709 #endif 710 710 #ifdef CONFIG_64BIT 711 + [ilog2(VM_DROPPABLE)] = "dp", 711 712 [ilog2(VM_SEALED)] = "sl", 712 713 #endif 713 714 };
+7
include/linux/mm.h
··· 407 407 #endif 408 408 409 409 #ifdef CONFIG_64BIT 410 + #define VM_DROPPABLE_BIT 40 411 + #define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) 412 + #else 413 + #define VM_DROPPABLE VM_NONE 414 + #endif 415 + 416 + #ifdef CONFIG_64BIT 410 417 /* VM is sealed, in vm_flags */ 411 418 #define VM_SEALED _BITUL(63) 412 419 #endif
+3
include/linux/userfaultfd_k.h
··· 218 218 { 219 219 vm_flags &= __VM_UFFD_FLAGS; 220 220 221 + if (vm_flags & VM_DROPPABLE) 222 + return false; 223 + 221 224 if ((vm_flags & VM_UFFD_MINOR) && 222 225 (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) 223 226 return false;
+7
include/trace/events/mmflags.h
··· 165 165 # define IF_HAVE_UFFD_MINOR(flag, name) 166 166 #endif 167 167 168 + #ifdef CONFIG_64BIT 169 + # define IF_HAVE_VM_DROPPABLE(flag, name) {flag, name}, 170 + #else 171 + # define IF_HAVE_VM_DROPPABLE(flag, name) 172 + #endif 173 + 168 174 #define __def_vmaflag_names \ 169 175 {VM_READ, "read" }, \ 170 176 {VM_WRITE, "write" }, \ ··· 203 197 {VM_MIXEDMAP, "mixedmap" }, \ 204 198 {VM_HUGEPAGE, "hugepage" }, \ 205 199 {VM_NOHUGEPAGE, "nohugepage" }, \ 200 + IF_HAVE_VM_DROPPABLE(VM_DROPPABLE, "droppable" ) \ 206 201 {VM_MERGEABLE, "mergeable" } \ 207 202 208 203 #define show_vma_flags(flags) \
+1
include/uapi/linux/mman.h
··· 17 17 #define MAP_SHARED 0x01 /* Share changes */ 18 18 #define MAP_PRIVATE 0x02 /* Changes are private */ 19 19 #define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ 20 + #define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */ 20 21 21 22 /* 22 23 * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
+1 -1
mm/ksm.c
··· 717 717 { 718 718 if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | 719 719 VM_IO | VM_DONTEXPAND | VM_HUGETLB | 720 - VM_MIXEDMAP)) 720 + VM_MIXEDMAP | VM_DROPPABLE)) 721 721 return false; /* just ignore the advice */ 722 722 723 723 if (vma_is_dax(vma))
+4 -1
mm/madvise.c
··· 1068 1068 new_flags |= VM_WIPEONFORK; 1069 1069 break; 1070 1070 case MADV_KEEPONFORK: 1071 + if (vma->vm_flags & VM_DROPPABLE) 1072 + return -EINVAL; 1071 1073 new_flags &= ~VM_WIPEONFORK; 1072 1074 break; 1073 1075 case MADV_DONTDUMP: 1074 1076 new_flags |= VM_DONTDUMP; 1075 1077 break; 1076 1078 case MADV_DODUMP: 1077 - if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) 1079 + if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) || 1080 + (vma->vm_flags & VM_DROPPABLE)) 1078 1081 return -EINVAL; 1079 1082 new_flags &= ~VM_DONTDUMP; 1080 1083 break;
+13
mm/memory.c
··· 5660 5660 /* If the fault handler drops the mmap_lock, vma may be freed */ 5661 5661 struct mm_struct *mm = vma->vm_mm; 5662 5662 vm_fault_t ret; 5663 + bool is_droppable; 5663 5664 5664 5665 __set_current_state(TASK_RUNNING); 5665 5666 ··· 5674 5673 ret = VM_FAULT_SIGSEGV; 5675 5674 goto out; 5676 5675 } 5676 + 5677 + is_droppable = !!(vma->vm_flags & VM_DROPPABLE); 5677 5678 5678 5679 /* 5679 5680 * Enable the memcg OOM handling for faults triggered in user ··· 5691 5688 else 5692 5689 ret = __handle_mm_fault(vma, address, flags); 5693 5690 5691 + /* 5692 + * Warning: It is no longer safe to dereference vma-> after this point, 5693 + * because mmap_lock might have been dropped by __handle_mm_fault(), so 5694 + * vma might be destroyed from underneath us. 5695 + */ 5696 + 5694 5697 lru_gen_exit_fault(); 5698 + 5699 + /* If the mapping is droppable, then errors due to OOM aren't fatal. */ 5700 + if (is_droppable) 5701 + ret &= ~VM_FAULT_OOM; 5695 5702 5696 5703 if (flags & FAULT_FLAG_USER) { 5697 5704 mem_cgroup_exit_user_fault();
+3
mm/mempolicy.c
··· 2300 2300 pgoff_t ilx; 2301 2301 struct page *page; 2302 2302 2303 + if (vma->vm_flags & VM_DROPPABLE) 2304 + gfp |= __GFP_NOWARN; 2305 + 2303 2306 pol = get_vma_policy(vma, addr, order, &ilx); 2304 2307 page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order, 2305 2308 pol, ilx, numa_node_id());
+1 -1
mm/mlock.c
··· 485 485 486 486 if (newflags == oldflags || (oldflags & VM_SPECIAL) || 487 487 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || 488 - vma_is_dax(vma) || vma_is_secretmem(vma)) 488 + vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE)) 489 489 /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ 490 490 goto out; 491 491
+30
mm/mmap.c
··· 1369 1369 pgoff = 0; 1370 1370 vm_flags |= VM_SHARED | VM_MAYSHARE; 1371 1371 break; 1372 + case MAP_DROPPABLE: 1373 + if (VM_DROPPABLE == VM_NONE) 1374 + return -ENOTSUPP; 1375 + /* 1376 + * A locked or stack area makes no sense to be droppable. 1377 + * 1378 + * Also, since droppable pages can just go away at any time 1379 + * it makes no sense to copy them on fork or dump them. 1380 + * 1381 + * And don't attempt to combine with hugetlb for now. 1382 + */ 1383 + if (flags & (MAP_LOCKED | MAP_HUGETLB)) 1384 + return -EINVAL; 1385 + if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP)) 1386 + return -EINVAL; 1387 + 1388 + vm_flags |= VM_DROPPABLE; 1389 + 1390 + /* 1391 + * If the pages can be dropped, then it doesn't make 1392 + * sense to reserve them. 1393 + */ 1394 + vm_flags |= VM_NORESERVE; 1395 + 1396 + /* 1397 + * Likewise, they're volatile enough that they 1398 + * shouldn't survive forks or coredumps. 1399 + */ 1400 + vm_flags |= VM_WIPEONFORK | VM_DONTDUMP; 1401 + fallthrough; 1372 1402 case MAP_PRIVATE: 1373 1403 /* 1374 1404 * Set pgoff according to addr for anon_vma.
+19 -3
mm/rmap.c
··· 1397 1397 VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); 1398 1398 VM_BUG_ON_VMA(address < vma->vm_start || 1399 1399 address + (nr << PAGE_SHIFT) > vma->vm_end, vma); 1400 - __folio_set_swapbacked(folio); 1400 + /* 1401 + * VM_DROPPABLE mappings don't swap; instead they're just dropped when 1402 + * under memory pressure. 1403 + */ 1404 + if (!(vma->vm_flags & VM_DROPPABLE)) 1405 + __folio_set_swapbacked(folio); 1401 1406 __folio_set_anon(folio, vma, address, true); 1402 1407 1403 1408 if (likely(!folio_test_large(folio))) { ··· 1846 1841 * plus the rmap(s) (dropped by discard:). 1847 1842 */ 1848 1843 if (ref_count == 1 + map_count && 1849 - !folio_test_dirty(folio)) { 1844 + (!folio_test_dirty(folio) || 1845 + /* 1846 + * Unlike MADV_FREE mappings, VM_DROPPABLE 1847 + * ones can be dropped even if they've 1848 + * been dirtied. 1849 + */ 1850 + (vma->vm_flags & VM_DROPPABLE))) { 1850 1851 dec_mm_counter(mm, MM_ANONPAGES); 1851 1852 goto discard; 1852 1853 } ··· 1862 1851 * discarded. Remap the page to page table. 1863 1852 */ 1864 1853 set_pte_at(mm, address, pvmw.pte, pteval); 1865 - folio_set_swapbacked(folio); 1854 + /* 1855 + * Unlike MADV_FREE mappings, VM_DROPPABLE ones 1856 + * never get swap backed on failure to drop. 1857 + */ 1858 + if (!(vma->vm_flags & VM_DROPPABLE)) 1859 + folio_set_swapbacked(folio); 1866 1860 ret = false; 1867 1861 page_vma_mapped_walk_done(&pvmw); 1868 1862 break;
-9
mm/vmscan.c
··· 4265 4265 return true; 4266 4266 } 4267 4267 4268 - /* dirty lazyfree */ 4269 - if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { 4270 - success = lru_gen_del_folio(lruvec, folio, true); 4271 - VM_WARN_ON_ONCE_FOLIO(!success, folio); 4272 - folio_set_swapbacked(folio); 4273 - lruvec_add_folio_tail(lruvec, folio); 4274 - return true; 4275 - } 4276 - 4277 4268 /* promoted */ 4278 4269 if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { 4279 4270 list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
+1
tools/include/uapi/linux/mman.h
··· 17 17 #define MAP_SHARED 0x01 /* Share changes */ 18 18 #define MAP_PRIVATE 0x02 /* Changes are private */ 19 19 #define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ 20 + #define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */ 20 21 21 22 /* 22 23 * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
+1
tools/testing/selftests/mm/.gitignore
··· 49 49 hugetlb_madv_vs_map 50 50 mseal_test 51 51 seal_elf 52 + droppable
+1
tools/testing/selftests/mm/Makefile
··· 73 73 TEST_GEN_FILES += mdwe_test 74 74 TEST_GEN_FILES += hugetlb_fault_after_madv 75 75 TEST_GEN_FILES += hugetlb_madv_vs_map 76 + TEST_GEN_FILES += droppable 76 77 77 78 ifneq ($(ARCH),arm64) 78 79 TEST_GEN_FILES += soft-dirty
+53
tools/testing/selftests/mm/droppable.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 4 + */ 5 + 6 + #include <assert.h> 7 + #include <stdbool.h> 8 + #include <stdint.h> 9 + #include <stdio.h> 10 + #include <stdlib.h> 11 + #include <unistd.h> 12 + #include <signal.h> 13 + #include <sys/mman.h> 14 + #include <linux/mman.h> 15 + 16 + #include "../kselftest.h" 17 + 18 + int main(int argc, char *argv[]) 19 + { 20 + size_t alloc_size = 134217728; 21 + size_t page_size = getpagesize(); 22 + void *alloc; 23 + pid_t child; 24 + 25 + ksft_print_header(); 26 + ksft_set_plan(1); 27 + 28 + alloc = mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0); 29 + assert(alloc != MAP_FAILED); 30 + memset(alloc, 'A', alloc_size); 31 + for (size_t i = 0; i < alloc_size; i += page_size) 32 + assert(*(uint8_t *)(alloc + i)); 33 + 34 + child = fork(); 35 + assert(child >= 0); 36 + if (!child) { 37 + for (;;) 38 + *(char *)malloc(page_size) = 'B'; 39 + } 40 + 41 + for (bool done = false; !done;) { 42 + for (size_t i = 0; i < alloc_size; i += page_size) { 43 + if (!*(uint8_t *)(alloc + i)) { 44 + done = true; 45 + break; 46 + } 47 + } 48 + } 49 + kill(child, SIGTERM); 50 + 51 + ksft_test_result_pass("MAP_DROPPABLE: PASS\n"); 52 + exit(KSFT_PASS); 53 + }