Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm: perform VMA allocation, freeing, duplication in mm

Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.

There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU is not set, and neither is vma.c.

To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.

This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.

Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.

[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Lorenzo Stoakes and committed by
Andrew Morton
3e43e260 26a8f577

+250 -126
+1
MAINTAINERS
··· 15683 15683 F: mm/vma.c 15684 15684 F: mm/vma.h 15685 15685 F: mm/vma_exec.c 15686 + F: mm/vma_init.c 15686 15687 F: mm/vma_internal.h 15687 15688 F: tools/testing/selftests/mm/merge.c 15688 15689 F: tools/testing/vma/
-88
kernel/fork.c
··· 431 431 /* SLAB cache for fs_struct structures (tsk->fs) */ 432 432 struct kmem_cache *fs_cachep; 433 433 434 - /* SLAB cache for vm_area_struct structures */ 435 - static struct kmem_cache *vm_area_cachep; 436 - 437 434 /* SLAB cache for mm_struct structures (tsk->mm) */ 438 435 static struct kmem_cache *mm_cachep; 439 - 440 - struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) 441 - { 442 - struct vm_area_struct *vma; 443 - 444 - vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 445 - if (!vma) 446 - return NULL; 447 - 448 - vma_init(vma, mm); 449 - 450 - return vma; 451 - } 452 - 453 - static void vm_area_init_from(const struct vm_area_struct *src, 454 - struct vm_area_struct *dest) 455 - { 456 - dest->vm_mm = src->vm_mm; 457 - dest->vm_ops = src->vm_ops; 458 - dest->vm_start = src->vm_start; 459 - dest->vm_end = src->vm_end; 460 - dest->anon_vma = src->anon_vma; 461 - dest->vm_pgoff = src->vm_pgoff; 462 - dest->vm_file = src->vm_file; 463 - dest->vm_private_data = src->vm_private_data; 464 - vm_flags_init(dest, src->vm_flags); 465 - memcpy(&dest->vm_page_prot, &src->vm_page_prot, 466 - sizeof(dest->vm_page_prot)); 467 - /* 468 - * src->shared.rb may be modified concurrently when called from 469 - * dup_mmap(), but the clone will reinitialize it. 
470 - */ 471 - data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); 472 - memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, 473 - sizeof(dest->vm_userfaultfd_ctx)); 474 - #ifdef CONFIG_ANON_VMA_NAME 475 - dest->anon_name = src->anon_name; 476 - #endif 477 - #ifdef CONFIG_SWAP 478 - memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, 479 - sizeof(dest->swap_readahead_info)); 480 - #endif 481 - #ifndef CONFIG_MMU 482 - dest->vm_region = src->vm_region; 483 - #endif 484 - #ifdef CONFIG_NUMA 485 - dest->vm_policy = src->vm_policy; 486 - #endif 487 - } 488 - 489 - struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) 490 - { 491 - struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 492 - 493 - if (!new) 494 - return NULL; 495 - 496 - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); 497 - ASSERT_EXCLUSIVE_WRITER(orig->vm_file); 498 - vm_area_init_from(orig, new); 499 - vma_lock_init(new, true); 500 - INIT_LIST_HEAD(&new->anon_vma_chain); 501 - vma_numab_state_init(new); 502 - dup_anon_vma_name(orig, new); 503 - 504 - return new; 505 - } 506 - 507 - void vm_area_free(struct vm_area_struct *vma) 508 - { 509 - /* The vma should be detached while being destroyed. 
*/ 510 - vma_assert_detached(vma); 511 - vma_numab_state_free(vma); 512 - free_anon_vma_name(vma); 513 - kmem_cache_free(vm_area_cachep, vma); 514 - } 515 436 516 437 static void account_kernel_stack(struct task_struct *tsk, int account) 517 438 { ··· 2954 3033 2955 3034 void __init proc_caches_init(void) 2956 3035 { 2957 - struct kmem_cache_args args = { 2958 - .use_freeptr_offset = true, 2959 - .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), 2960 - }; 2961 - 2962 3036 sighand_cachep = kmem_cache_create("sighand_cache", 2963 3037 sizeof(struct sighand_struct), 0, 2964 3038 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| ··· 2970 3054 sizeof(struct fs_struct), 0, 2971 3055 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, 2972 3056 NULL); 2973 - vm_area_cachep = kmem_cache_create("vm_area_struct", 2974 - sizeof(struct vm_area_struct), &args, 2975 - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| 2976 - SLAB_ACCOUNT); 2977 3057 mmap_init(); 2978 3058 nsproxy_cache_init(); 2979 3059 }
+1 -1
mm/Makefile
··· 55 55 mm_init.o percpu.o slab_common.o \ 56 56 compaction.o show_mem.o \ 57 57 interval_tree.o list_lru.o workingset.o \ 58 - debug.o gup.o mmap_lock.o $(mmu-y) 58 + debug.o gup.o mmap_lock.o vma_init.o $(mmu-y) 59 59 60 60 # Give 'page_alloc' its own module-parameter namespace 61 61 page-alloc-y := page_alloc.o
+2 -1
mm/mmap.c
··· 1554 1554 #endif /* CONFIG_SYSCTL */ 1555 1555 1556 1556 /* 1557 - * initialise the percpu counter for VM 1557 + * initialise the percpu counter for VM, initialise VMA state. 1558 1558 */ 1559 1559 void __init mmap_init(void) 1560 1560 { ··· 1565 1565 #ifdef CONFIG_SYSCTL 1566 1566 register_sysctl_init("vm", mmap_table); 1567 1567 #endif 1568 + vma_state_init(); 1568 1569 } 1569 1570 1570 1571 /*
+3 -1
mm/nommu.c
··· 399 399 }; 400 400 401 401 /* 402 - * initialise the percpu counter for VM and region record slabs 402 + * initialise the percpu counter for VM and region record slabs, initialise VMA 403 + * state. 403 404 */ 404 405 void __init mmap_init(void) 405 406 { ··· 410 409 VM_BUG_ON(ret); 411 410 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); 412 411 register_sysctl_init("vm", nommu_table); 412 + vma_state_init(); 413 413 } 414 414 415 415 /*
+6
mm/vma.h
··· 550 550 551 551 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma); 552 552 553 + /* vma_init.h, shared between CONFIG_MMU and nommu. */ 554 + void __init vma_state_init(void); 555 + struct vm_area_struct *vm_area_alloc(struct mm_struct *mm); 556 + struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig); 557 + void vm_area_free(struct vm_area_struct *vma); 558 + 553 559 /* vma_exec.c */ 554 560 #ifdef CONFIG_MMU 555 561 int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+101
mm/vma_init.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + 3 + /* 4 + * Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared 5 + * between CONFIG_MMU and non-CONFIG_MMU kernel configurations. 6 + */ 7 + 8 + #include "vma_internal.h" 9 + #include "vma.h" 10 + 11 + /* SLAB cache for vm_area_struct structures */ 12 + static struct kmem_cache *vm_area_cachep; 13 + 14 + void __init vma_state_init(void) 15 + { 16 + struct kmem_cache_args args = { 17 + .use_freeptr_offset = true, 18 + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), 19 + }; 20 + 21 + vm_area_cachep = kmem_cache_create("vm_area_struct", 22 + sizeof(struct vm_area_struct), &args, 23 + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| 24 + SLAB_ACCOUNT); 25 + } 26 + 27 + struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) 28 + { 29 + struct vm_area_struct *vma; 30 + 31 + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 32 + if (!vma) 33 + return NULL; 34 + 35 + vma_init(vma, mm); 36 + 37 + return vma; 38 + } 39 + 40 + static void vm_area_init_from(const struct vm_area_struct *src, 41 + struct vm_area_struct *dest) 42 + { 43 + dest->vm_mm = src->vm_mm; 44 + dest->vm_ops = src->vm_ops; 45 + dest->vm_start = src->vm_start; 46 + dest->vm_end = src->vm_end; 47 + dest->anon_vma = src->anon_vma; 48 + dest->vm_pgoff = src->vm_pgoff; 49 + dest->vm_file = src->vm_file; 50 + dest->vm_private_data = src->vm_private_data; 51 + vm_flags_init(dest, src->vm_flags); 52 + memcpy(&dest->vm_page_prot, &src->vm_page_prot, 53 + sizeof(dest->vm_page_prot)); 54 + /* 55 + * src->shared.rb may be modified concurrently when called from 56 + * dup_mmap(), but the clone will reinitialize it. 
57 + */ 58 + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); 59 + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, 60 + sizeof(dest->vm_userfaultfd_ctx)); 61 + #ifdef CONFIG_ANON_VMA_NAME 62 + dest->anon_name = src->anon_name; 63 + #endif 64 + #ifdef CONFIG_SWAP 65 + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, 66 + sizeof(dest->swap_readahead_info)); 67 + #endif 68 + #ifndef CONFIG_MMU 69 + dest->vm_region = src->vm_region; 70 + #endif 71 + #ifdef CONFIG_NUMA 72 + dest->vm_policy = src->vm_policy; 73 + #endif 74 + } 75 + 76 + struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) 77 + { 78 + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 79 + 80 + if (!new) 81 + return NULL; 82 + 83 + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); 84 + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); 85 + vm_area_init_from(orig, new); 86 + vma_lock_init(new, true); 87 + INIT_LIST_HEAD(&new->anon_vma_chain); 88 + vma_numab_state_init(new); 89 + dup_anon_vma_name(orig, new); 90 + 91 + return new; 92 + } 93 + 94 + void vm_area_free(struct vm_area_struct *vma) 95 + { 96 + /* The vma should be detached while being destroyed. */ 97 + vma_assert_detached(vma); 98 + vma_numab_state_free(vma); 99 + free_anon_vma_name(vma); 100 + kmem_cache_free(vm_area_cachep, vma); 101 + }
+1 -1
tools/testing/vma/Makefile
··· 9 9 OFILES = $(SHARED_OFILES) vma.o maple-shim.o 10 10 TARGETS = vma 11 11 12 - vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h 12 + vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h 13 13 14 14 vma: $(OFILES) 15 15 $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
+17 -9
tools/testing/vma/vma.c
··· 28 28 * Directly import the VMA implementation here. Our vma_internal.h wrapper 29 29 * provides userland-equivalent functionality for everything vma.c uses. 30 30 */ 31 + #include "../../../mm/vma_init.c" 31 32 #include "../../../mm/vma_exec.c" 32 33 #include "../../../mm/vma.c" 33 34 ··· 92 91 return res; 93 92 } 94 93 94 + static void detach_free_vma(struct vm_area_struct *vma) 95 + { 96 + vma_mark_detached(vma); 97 + vm_area_free(vma); 98 + } 99 + 95 100 /* Helper function to allocate a VMA and link it to the tree. */ 96 101 static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, 97 102 unsigned long start, ··· 111 104 return NULL; 112 105 113 106 if (attach_vma(mm, vma)) { 114 - vm_area_free(vma); 107 + detach_free_vma(vma); 115 108 return NULL; 116 109 } 117 110 ··· 256 249 257 250 vma_iter_set(vmi, 0); 258 251 for_each_vma(*vmi, vma) { 259 - vm_area_free(vma); 252 + detach_free_vma(vma); 260 253 count++; 261 254 } 262 255 ··· 326 319 ASSERT_EQ(vma->vm_pgoff, 0); 327 320 ASSERT_EQ(vma->vm_flags, flags); 328 321 329 - vm_area_free(vma); 322 + detach_free_vma(vma); 330 323 mtree_destroy(&mm.mm_mt); 331 324 332 325 return true; ··· 368 361 ASSERT_EQ(vma->vm_end, 0x1000); 369 362 ASSERT_EQ(vma->vm_pgoff, 0); 370 363 371 - vm_area_free(vma); 364 + detach_free_vma(vma); 372 365 vma_iter_clear(&vmi); 373 366 374 367 vma = vma_next(&vmi); ··· 377 370 ASSERT_EQ(vma->vm_end, 0x2000); 378 371 ASSERT_EQ(vma->vm_pgoff, 1); 379 372 380 - vm_area_free(vma); 373 + detach_free_vma(vma); 381 374 vma_iter_clear(&vmi); 382 375 383 376 vma = vma_next(&vmi); ··· 386 379 ASSERT_EQ(vma->vm_end, 0x3000); 387 380 ASSERT_EQ(vma->vm_pgoff, 2); 388 381 389 - vm_area_free(vma); 382 + detach_free_vma(vma); 390 383 mtree_destroy(&mm.mm_mt); 391 384 392 385 return true; ··· 414 407 ASSERT_EQ(vma->vm_end, 0x3000); 415 408 ASSERT_EQ(vma->vm_pgoff, 0); 416 409 417 - vm_area_free(vma); 410 + detach_free_vma(vma); 418 411 mtree_destroy(&mm.mm_mt); 419 412 420 413 return true; 
··· 435 428 ASSERT_EQ(vma->vm_end, 0x1000); 436 429 ASSERT_EQ(vma->vm_pgoff, 0); 437 430 438 - vm_area_free(vma); 431 + detach_free_vma(vma); 439 432 mtree_destroy(&mm.mm_mt); 440 433 441 434 return true; ··· 626 619 ASSERT_EQ(vma->vm_pgoff, 0); 627 620 ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); 628 621 629 - vm_area_free(vma); 622 + detach_free_vma(vma); 630 623 count++; 631 624 } 632 625 ··· 1675 1668 int num_tests = 0, num_fail = 0; 1676 1669 1677 1670 maple_tree_init(); 1671 + vma_state_init(); 1678 1672 1679 1673 #define TEST(name) \ 1680 1674 do { \
+118 -25
tools/testing/vma/vma_internal.h
··· 155 155 */ 156 156 #define pr_warn_once pr_err 157 157 158 + #define data_race(expr) expr 159 + 160 + #define ASSERT_EXCLUSIVE_WRITER(x) 161 + 158 162 struct kref { 159 163 refcount_t refcount; 160 164 }; ··· 259 255 260 256 #define VMA_LOCK_OFFSET 0x40000000 261 257 258 + typedef struct { unsigned long v; } freeptr_t; 259 + 262 260 struct vm_area_struct { 263 261 /* The first cache line has the info for VMA tree walking. */ 264 262 ··· 270 264 unsigned long vm_start; 271 265 unsigned long vm_end; 272 266 }; 273 - #ifdef CONFIG_PER_VMA_LOCK 274 - struct rcu_head vm_rcu; /* Used for deferred freeing. */ 275 - #endif 267 + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ 276 268 }; 277 269 278 270 struct mm_struct *vm_mm; /* The address space we belong to. */ ··· 467 463 .len_in = len_, \ 468 464 } 469 465 466 + struct kmem_cache_args { 467 + /** 468 + * @align: The required alignment for the objects. 469 + * 470 + * %0 means no specific alignment is requested. 471 + */ 472 + unsigned int align; 473 + /** 474 + * @useroffset: Usercopy region offset. 475 + * 476 + * %0 is a valid offset, when @usersize is non-%0 477 + */ 478 + unsigned int useroffset; 479 + /** 480 + * @usersize: Usercopy region size. 481 + * 482 + * %0 means no usercopy region is specified. 483 + */ 484 + unsigned int usersize; 485 + /** 486 + * @freeptr_offset: Custom offset for the free pointer 487 + * in &SLAB_TYPESAFE_BY_RCU caches 488 + * 489 + * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer 490 + * outside of the object. This might cause the object to grow in size. 491 + * Cache creators that have a reason to avoid this can specify a custom 492 + * free pointer offset in their struct where the free pointer will be 493 + * placed. 
494 + * 495 + * Note that placing the free pointer inside the object requires the 496 + * caller to ensure that no fields are invalidated that are required to 497 + * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for 498 + * details). 499 + * 500 + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset 501 + * is specified, %use_freeptr_offset must be set %true. 502 + * 503 + * Note that @ctor currently isn't supported with custom free pointers 504 + * as a @ctor requires an external free pointer. 505 + */ 506 + unsigned int freeptr_offset; 507 + /** 508 + * @use_freeptr_offset: Whether a @freeptr_offset is used. 509 + */ 510 + bool use_freeptr_offset; 511 + /** 512 + * @ctor: A constructor for the objects. 513 + * 514 + * The constructor is invoked for each object in a newly allocated slab 515 + * page. It is the cache user's responsibility to free object in the 516 + * same state as after calling the constructor, or deal appropriately 517 + * with any differences between a freshly constructed and a reallocated 518 + * object. 519 + * 520 + * %NULL means no constructor. 
521 + */ 522 + void (*ctor)(void *); 523 + }; 524 + 470 525 static inline void vma_iter_invalidate(struct vma_iterator *vmi) 471 526 { 472 527 mas_pause(&vmi->mas); ··· 610 547 vma->vm_lock_seq = UINT_MAX; 611 548 } 612 549 613 - static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) 550 + struct kmem_cache { 551 + const char *name; 552 + size_t object_size; 553 + struct kmem_cache_args *args; 554 + }; 555 + 556 + static inline struct kmem_cache *__kmem_cache_create(const char *name, 557 + size_t object_size, 558 + struct kmem_cache_args *args) 614 559 { 615 - struct vm_area_struct *vma = calloc(1, sizeof(struct vm_area_struct)); 560 + struct kmem_cache *ret = malloc(sizeof(struct kmem_cache)); 616 561 617 - if (!vma) 618 - return NULL; 562 + ret->name = name; 563 + ret->object_size = object_size; 564 + ret->args = args; 619 565 620 - vma_init(vma, mm); 621 - 622 - return vma; 566 + return ret; 623 567 } 624 568 625 - static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) 569 + #define kmem_cache_create(__name, __object_size, __args, ...) 
\ 570 + __kmem_cache_create((__name), (__object_size), (__args)) 571 + 572 + static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 626 573 { 627 - struct vm_area_struct *new = calloc(1, sizeof(struct vm_area_struct)); 574 + (void)gfpflags; 628 575 629 - if (!new) 630 - return NULL; 576 + return calloc(s->object_size, 1); 577 + } 631 578 632 - memcpy(new, orig, sizeof(*new)); 633 - refcount_set(&new->vm_refcnt, 0); 634 - new->vm_lock_seq = UINT_MAX; 635 - INIT_LIST_HEAD(&new->anon_vma_chain); 636 - 637 - return new; 579 + static inline void kmem_cache_free(struct kmem_cache *s, void *x) 580 + { 581 + free(x); 638 582 } 639 583 640 584 /* ··· 806 736 807 737 static inline void mpol_put(struct mempolicy *) 808 738 { 809 - } 810 - 811 - static inline void vm_area_free(struct vm_area_struct *vma) 812 - { 813 - free(vma); 814 739 } 815 740 816 741 static inline void lru_add_drain(void) ··· 1375 1310 static inline void ksm_exit(struct mm_struct *mm) 1376 1311 { 1377 1312 (void)mm; 1313 + } 1314 + 1315 + static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) 1316 + { 1317 + (void)vma; 1318 + (void)reset_refcnt; 1319 + } 1320 + 1321 + static inline void vma_numab_state_init(struct vm_area_struct *vma) 1322 + { 1323 + (void)vma; 1324 + } 1325 + 1326 + static inline void vma_numab_state_free(struct vm_area_struct *vma) 1327 + { 1328 + (void)vma; 1329 + } 1330 + 1331 + static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, 1332 + struct vm_area_struct *new_vma) 1333 + { 1334 + (void)orig_vma; 1335 + (void)new_vma; 1336 + } 1337 + 1338 + static inline void free_anon_vma_name(struct vm_area_struct *vma) 1339 + { 1340 + (void)vma; 1378 1341 } 1379 1342 1380 1343 #endif /* __MM_VMA_INTERNAL_H */