Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'mm-stable-2025-08-03-12-35' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull more MM updates from Andrew Morton:
"Significant patch series in this pull request:

- "mseal cleanups" (Lorenzo Stoakes)

Some mseal cleaning with no intended functional change.

- "Optimizations for khugepaged" (David Hildenbrand)

Improve khugepaged throughput by batching PTE operations for large
folios. This gain is mainly for arm64.

- "x86: enable EXECMEM_ROX_CACHE for ftrace and kprobes" (Mike Rapoport)

A bugfix, additional debug code and cleanups to the execmem code.

- "mm/shmem, swap: bugfix and improvement of mTHP swap in" (Kairui Song)

Bugfixes, cleanups and performance improvements to the mTHP swapin
code"

* tag 'mm-stable-2025-08-03-12-35' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (38 commits)
mm: mempool: fix crash in mempool_free() for zero-minimum pools
mm: correct type for vmalloc vm_flags fields
mm/shmem, swap: fix major fault counting
mm/shmem, swap: rework swap entry and index calculation for large swapin
mm/shmem, swap: simplify swapin path and result handling
mm/shmem, swap: never use swap cache and readahead for SWP_SYNCHRONOUS_IO
mm/shmem, swap: tidy up swap entry splitting
mm/shmem, swap: tidy up THP swapin checks
mm/shmem, swap: avoid redundant Xarray lookup during swapin
x86/ftrace: enable EXECMEM_ROX_CACHE for ftrace allocations
x86/kprobes: enable EXECMEM_ROX_CACHE for kprobes allocations
execmem: drop writable parameter from execmem_fill_trapping_insns()
execmem: add fallback for failures in vmalloc(VM_ALLOW_HUGE_VMAP)
execmem: move execmem_force_rw() and execmem_restore_rox() before use
execmem: rework execmem_cache_free()
execmem: introduce execmem_alloc_rw()
execmem: drop unused execmem_update_copy()
mm: fix a UAF when vma->mm is freed after vma->vm_refcnt got dropped
mm/rmap: add anon_vma lifetime debug check
mm: remove mm/io-mapping.c
...

+1086 -517
-1
Documentation/core-api/mm-api.rst
··· 133 133 .. kernel-doc:: mm/mmu_notifier.c 134 134 .. kernel-doc:: mm/balloon_compaction.c 135 135 .. kernel-doc:: mm/huge_memory.c 136 - .. kernel-doc:: mm/io-mapping.c
+2 -2
arch/arm64/mm/mmu.c
··· 721 721 722 722 static void __init declare_vma(struct vm_struct *vma, 723 723 void *va_start, void *va_end, 724 - vm_flags_t vm_flags) 724 + unsigned long vm_flags) 725 725 { 726 726 phys_addr_t pa_start = __pa_symbol(va_start); 727 727 unsigned long size = va_end - va_start; ··· 1528 1528 pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, 1529 1529 pte_t *ptep, unsigned int nr) 1530 1530 { 1531 - pte_t pte = get_and_clear_full_ptes(vma->vm_mm, addr, ptep, nr, /* full = */ 0); 1531 + pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr); 1532 1532 1533 1533 if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { 1534 1534 /*
+1 -2
arch/x86/kernel/alternative.c
··· 120 120 121 121 static void *__its_alloc(struct its_array *pages) 122 122 { 123 - void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); 123 + void *page __free(execmem) = execmem_alloc_rw(EXECMEM_MODULE_TEXT, PAGE_SIZE); 124 124 if (!page) 125 125 return NULL; 126 126 ··· 237 237 if (!page) 238 238 return NULL; 239 239 240 - execmem_make_temp_rw(page, PAGE_SIZE); 241 240 if (pages == &its_pages) 242 241 set_memory_x((unsigned long)page, 1); 243 242
+1 -1
arch/x86/kernel/ftrace.c
··· 263 263 264 264 static inline void *alloc_tramp(unsigned long size) 265 265 { 266 - return execmem_alloc(EXECMEM_FTRACE, size); 266 + return execmem_alloc_rw(EXECMEM_FTRACE, size); 267 267 } 268 268 static inline void tramp_free(void *tramp) 269 269 {
-18
arch/x86/kernel/kprobes/core.c
··· 481 481 return len; 482 482 } 483 483 484 - /* Make page to RO mode when allocate it */ 485 - void *alloc_insn_page(void) 486 - { 487 - void *page; 488 - 489 - page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); 490 - if (!page) 491 - return NULL; 492 - 493 - /* 494 - * TODO: Once additional kernel code protection mechanisms are set, ensure 495 - * that the page was not maliciously altered and it is still zeroed. 496 - */ 497 - set_memory_rox((unsigned long)page, 1); 498 - 499 - return page; 500 - } 501 - 502 484 /* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */ 503 485 504 486 static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
+17 -7
arch/x86/mm/init.c
··· 1063 1063 static struct execmem_info execmem_info __ro_after_init; 1064 1064 1065 1065 #ifdef CONFIG_ARCH_HAS_EXECMEM_ROX 1066 - void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable) 1066 + void execmem_fill_trapping_insns(void *ptr, size_t size) 1067 1067 { 1068 - /* fill memory with INT3 instructions */ 1069 - if (writeable) 1070 - memset(ptr, INT3_INSN_OPCODE, size); 1071 - else 1072 - text_poke_set(ptr, INT3_INSN_OPCODE, size); 1068 + memset(ptr, INT3_INSN_OPCODE, size); 1073 1069 } 1074 1070 #endif 1075 1071 ··· 1098 1102 .pgprot = pgprot, 1099 1103 .alignment = MODULE_ALIGN, 1100 1104 }, 1101 - [EXECMEM_KPROBES ... EXECMEM_BPF] = { 1105 + [EXECMEM_KPROBES] = { 1106 + .flags = flags, 1107 + .start = start, 1108 + .end = MODULES_END, 1109 + .pgprot = PAGE_KERNEL_ROX, 1110 + .alignment = MODULE_ALIGN, 1111 + }, 1112 + [EXECMEM_FTRACE] = { 1113 + .flags = flags, 1114 + .start = start, 1115 + .end = MODULES_END, 1116 + .pgprot = pgprot, 1117 + .alignment = MODULE_ALIGN, 1118 + }, 1119 + [EXECMEM_BPF] = { 1102 1120 .flags = EXECMEM_KASAN_SHADOW, 1103 1121 .start = start, 1104 1122 .end = MODULES_END,
+23 -31
include/linux/execmem.h
··· 60 60 * will trap 61 61 * @ptr: pointer to memory to fill 62 62 * @size: size of the range to fill 63 - * @writable: is the memory poited by @ptr is writable or ROX 64 63 * 65 64 * A hook for architecures to fill execmem ranges with invalid instructions. 66 65 * Architectures that use EXECMEM_ROX_CACHE must implement this. 67 66 */ 68 - void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); 69 - 70 - /** 71 - * execmem_make_temp_rw - temporarily remap region with read-write 72 - * permissions 73 - * @ptr: address of the region to remap 74 - * @size: size of the region to remap 75 - * 76 - * Remaps a part of the cached large page in the ROX cache in the range 77 - * [@ptr, @ptr + @size) as writable and not executable. The caller must 78 - * have exclusive ownership of this range and ensure nothing will try to 79 - * execute code in this range. 80 - * 81 - * Return: 0 on success or negative error code on failure. 82 - */ 83 - int execmem_make_temp_rw(void *ptr, size_t size); 67 + void execmem_fill_trapping_insns(void *ptr, size_t size); 84 68 85 69 /** 86 70 * execmem_restore_rox - restore read-only-execute permissions ··· 79 95 */ 80 96 int execmem_restore_rox(void *ptr, size_t size); 81 97 #else 82 - static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } 83 98 static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } 84 99 #endif 85 100 ··· 149 166 void *execmem_alloc(enum execmem_type type, size_t size); 150 167 151 168 /** 169 + * execmem_alloc_rw - allocate writable executable memory 170 + * @type: type of the allocation 171 + * @size: how many bytes of memory are required 172 + * 173 + * Allocates memory that will contain executable code, either generated or 174 + * loaded from kernel modules. 175 + * 176 + * Allocates memory that will contain data coupled with executable code, 177 + * like data sections in kernel modules. 
178 + * 179 + * Forces writable permissions on the allocated memory and the caller is 180 + * responsible to manage the permissions afterwards. 181 + * 182 + * For architectures that use ROX cache the permissions will be set to R+W. 183 + * For architectures that don't use ROX cache the default permissions for @type 184 + * will be used as they must be writable. 185 + * 186 + * Return: a pointer to the allocated memory or %NULL 187 + */ 188 + void *execmem_alloc_rw(enum execmem_type type, size_t size); 189 + 190 + /** 152 191 * execmem_free - free executable memory 153 192 * @ptr: pointer to the memory that should be freed 154 193 */ ··· 189 184 */ 190 185 struct vm_struct *execmem_vmap(size_t size); 191 186 #endif 192 - 193 - /** 194 - * execmem_update_copy - copy an update to executable memory 195 - * @dst: destination address to update 196 - * @src: source address containing the data 197 - * @size: how many bytes of memory shold be copied 198 - * 199 - * Copy @size bytes from @src to @dst using text poking if the memory at 200 - * @dst is read-only. 201 - * 202 - * Return: a pointer to @dst or NULL on error 203 - */ 204 - void *execmem_update_copy(void *dst, const void *src, size_t size); 205 187 206 188 /** 207 189 * execmem_is_rox - check if execmem is read-only
-3
include/linux/io-mapping.h
··· 225 225 kfree(iomap); 226 226 } 227 227 228 - int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, 229 - unsigned long addr, unsigned long pfn, unsigned long size); 230 - 231 228 #endif /* _LINUX_IO_MAPPING_H */
+4 -2
include/linux/mm.h
··· 414 414 #endif 415 415 416 416 #ifdef CONFIG_64BIT 417 - /* VM is sealed, in vm_flags */ 418 - #define VM_SEALED _BITUL(63) 417 + #define VM_SEALED_BIT 42 418 + #define VM_SEALED BIT(VM_SEALED_BIT) 419 + #else 420 + #define VM_SEALED VM_NONE 419 421 #endif 420 422 421 423 /* Bits set in the VMA until the stack is in its final location */
+30
include/linux/mmap_lock.h
··· 12 12 #include <linux/tracepoint-defs.h> 13 13 #include <linux/types.h> 14 14 #include <linux/cleanup.h> 15 + #include <linux/sched/mm.h> 15 16 16 17 #define MMAP_LOCK_INITIALIZER(name) \ 17 18 .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), ··· 155 154 * reused and attached to a different mm before we lock it. 156 155 * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got 157 156 * detached. 157 + * 158 + * WARNING! The vma passed to this function cannot be used if the function 159 + * fails to lock it because in certain cases RCU lock is dropped and then 160 + * reacquired. Once RCU lock is dropped the vma can be concurently freed. 158 161 */ 159 162 static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, 160 163 struct vm_area_struct *vma) ··· 188 183 } 189 184 190 185 rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); 186 + 187 + /* 188 + * If vma got attached to another mm from under us, that mm is not 189 + * stable and can be freed in the narrow window after vma->vm_refcnt 190 + * is dropped and before rcuwait_wake_up(mm) is called. Grab it before 191 + * releasing vma->vm_refcnt. 192 + */ 193 + if (unlikely(vma->vm_mm != mm)) { 194 + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ 195 + struct mm_struct *other_mm = vma->vm_mm; 196 + 197 + /* 198 + * __mmdrop() is a heavy operation and we don't need RCU 199 + * protection here. Release RCU lock during these operations. 200 + * We reinstate the RCU read lock as the caller expects it to 201 + * be held when this function returns even on error. 202 + */ 203 + rcu_read_unlock(); 204 + mmgrab(other_mm); 205 + vma_refcount_put(vma); 206 + mmdrop(other_mm); 207 + rcu_read_lock(); 208 + return NULL; 209 + } 210 + 191 211 /* 192 212 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. 193 213 * False unlocked result is impossible because we modify and check
-2
include/linux/page-flags.h
··· 837 837 838 838 #define folio_start_writeback(folio) \ 839 839 __folio_start_writeback(folio, false) 840 - #define folio_start_writeback_keepwrite(folio) \ 841 - __folio_start_writeback(folio, true) 842 840 843 841 static __always_inline bool folio_test_head(const struct folio *folio) 844 842 {
+45
include/linux/pgtable.h
··· 736 736 } 737 737 #endif 738 738 739 + /** 740 + * get_and_clear_ptes - Clear present PTEs that map consecutive pages of 741 + * the same folio, collecting dirty/accessed bits. 742 + * @mm: Address space the pages are mapped into. 743 + * @addr: Address the first page is mapped at. 744 + * @ptep: Page table pointer for the first entry. 745 + * @nr: Number of entries to clear. 746 + * 747 + * Use this instead of get_and_clear_full_ptes() if it is known that we don't 748 + * need to clear the full mm, which is mostly the case. 749 + * 750 + * Note that PTE bits in the PTE range besides the PFN can differ. For example, 751 + * some PTEs might be write-protected. 752 + * 753 + * Context: The caller holds the page table lock. The PTEs map consecutive 754 + * pages that belong to the same folio. The PTEs are all in the same PMD. 755 + */ 756 + static inline pte_t get_and_clear_ptes(struct mm_struct *mm, unsigned long addr, 757 + pte_t *ptep, unsigned int nr) 758 + { 759 + return get_and_clear_full_ptes(mm, addr, ptep, nr, 0); 760 + } 761 + 739 762 #ifndef clear_full_ptes 740 763 /** 741 764 * clear_full_ptes - Clear present PTEs that map consecutive pages of the same ··· 790 767 } 791 768 } 792 769 #endif 770 + 771 + /** 772 + * clear_ptes - Clear present PTEs that map consecutive pages of the same folio. 773 + * @mm: Address space the pages are mapped into. 774 + * @addr: Address the first page is mapped at. 775 + * @ptep: Page table pointer for the first entry. 776 + * @nr: Number of entries to clear. 777 + * 778 + * Use this instead of clear_full_ptes() if it is known that we don't need to 779 + * clear the full mm, which is mostly the case. 780 + * 781 + * Note that PTE bits in the PTE range besides the PFN can differ. For example, 782 + * some PTEs might be write-protected. 783 + * 784 + * Context: The caller holds the page table lock. The PTEs map consecutive 785 + * pages that belong to the same folio. The PTEs are all in the same PMD. 
786 + */ 787 + static inline void clear_ptes(struct mm_struct *mm, unsigned long addr, 788 + pte_t *ptep, unsigned int nr) 789 + { 790 + clear_full_ptes(mm, addr, ptep, nr, 0); 791 + } 793 792 794 793 /* 795 794 * If two threads concurrently fault at the same page, the thread that
+22
include/linux/rmap.h
··· 449 449 default: 450 450 VM_WARN_ON_ONCE(true); 451 451 } 452 + 453 + /* 454 + * Anon folios must have an associated live anon_vma as long as they're 455 + * mapped into userspace. 456 + * Note that the atomic_read() mainly does two things: 457 + * 458 + * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to 459 + * check that the associated anon_vma has not yet been freed (subject 460 + * to KASAN's usual limitations). This check will pass if the 461 + * anon_vma's refcount has already dropped to 0 but an RCU grace 462 + * period hasn't passed since then. 463 + * 2. If the anon_vma has not yet been freed, it checks that the 464 + * anon_vma still has a nonzero refcount (as opposed to being in the 465 + * middle of an RCU delay for getting freed). 466 + */ 467 + if (folio_test_anon(folio) && !folio_test_ksm(folio)) { 468 + unsigned long mapping = (unsigned long)folio->mapping; 469 + struct anon_vma *anon_vma; 470 + 471 + anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON); 472 + VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio); 473 + } 452 474 } 453 475 454 476 /*
+6 -3
kernel/fork.c
··· 585 585 for (i = 0; i < NR_MM_COUNTERS; i++) { 586 586 long x = percpu_counter_sum(&mm->rss_stat[i]); 587 587 588 - if (unlikely(x)) 589 - pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", 590 - mm, resident_page_types[i], x); 588 + if (unlikely(x)) { 589 + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n", 590 + mm, resident_page_types[i], x, 591 + current->comm, 592 + task_pid_nr(current)); 593 + } 591 594 } 592 595 593 596 if (mm_pgtables_bytes(mm))
+2 -11
kernel/module/main.c
··· 1322 1322 else 1323 1323 execmem_type = EXECMEM_MODULE_TEXT; 1324 1324 1325 - ptr = execmem_alloc(execmem_type, size); 1325 + ptr = execmem_alloc_rw(execmem_type, size); 1326 1326 if (!ptr) 1327 1327 return -ENOMEM; 1328 1328 1329 - if (execmem_is_rox(execmem_type)) { 1330 - int err = execmem_make_temp_rw(ptr, size); 1331 - 1332 - if (err) { 1333 - execmem_free(ptr); 1334 - return -ENOMEM; 1335 - } 1336 - 1337 - mod->mem[type].is_rox = true; 1338 - } 1329 + mod->mem[type].is_rox = execmem_is_rox(execmem_type); 1339 1330 1340 1331 /* 1341 1332 * The pointer to these blocks of memory are stored on the module
-4
mm/Kconfig
··· 1242 1242 config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY 1243 1243 bool 1244 1244 1245 - # struct io_mapping based helper. Selected by drivers that need them 1246 - config IO_MAPPING 1247 - bool 1248 - 1249 1245 config MEMFD_CREATE 1250 1246 bool "Enable memfd_create() system call" if EXPERT 1251 1247
-1
mm/Makefile
··· 141 141 obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o 142 142 obj-$(CONFIG_PTDUMP) += ptdump.o 143 143 obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o 144 - obj-$(CONFIG_IO_MAPPING) += io-mapping.o 145 144 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o 146 145 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o 147 146 obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
+4
mm/damon/vaddr.c
··· 711 711 target -= dests->weight_arr[i]; 712 712 } 713 713 714 + /* If the folio is already in the right node, don't do anything */ 715 + if (folio_nid(folio) == dests->node_id_arr[i]) 716 + return; 717 + 714 718 isolate: 715 719 if (!folio_isolate_lru(folio)) 716 720 return;
+157 -55
mm/execmem.c
··· 26 26 27 27 #ifdef CONFIG_MMU 28 28 static void *execmem_vmalloc(struct execmem_range *range, size_t size, 29 - pgprot_t pgprot, vm_flags_t vm_flags) 29 + pgprot_t pgprot, unsigned long vm_flags) 30 30 { 31 31 bool kasan = range->flags & EXECMEM_KASAN_SHADOW; 32 32 gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; ··· 82 82 } 83 83 #else 84 84 static void *execmem_vmalloc(struct execmem_range *range, size_t size, 85 - pgprot_t pgprot, vm_flags_t vm_flags) 85 + pgprot_t pgprot, unsigned long vm_flags) 86 86 { 87 87 return vmalloc(size); 88 88 } ··· 93 93 struct mutex mutex; 94 94 struct maple_tree busy_areas; 95 95 struct maple_tree free_areas; 96 + unsigned int pending_free_cnt; /* protected by mutex */ 96 97 }; 98 + 99 + /* delay to schedule asynchronous free if fast path free fails */ 100 + #define FREE_DELAY (msecs_to_jiffies(10)) 101 + 102 + /* mark entries in busy_areas that should be freed asynchronously */ 103 + #define PENDING_FREE_MASK (1 << (PAGE_SHIFT - 1)) 97 104 98 105 static struct execmem_cache execmem_cache = { 99 106 .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex), ··· 137 130 return err; 138 131 } 139 132 133 + static int execmem_force_rw(void *ptr, size_t size) 134 + { 135 + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; 136 + unsigned long addr = (unsigned long)ptr; 137 + int ret; 138 + 139 + ret = set_memory_nx(addr, nr); 140 + if (ret) 141 + return ret; 142 + 143 + return set_memory_rw(addr, nr); 144 + } 145 + 146 + int execmem_restore_rox(void *ptr, size_t size) 147 + { 148 + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; 149 + unsigned long addr = (unsigned long)ptr; 150 + 151 + return set_memory_rox(addr, nr); 152 + } 153 + 140 154 static void execmem_cache_clean(struct work_struct *work) 141 155 { 142 156 struct maple_tree *free_areas = &execmem_cache.free_areas; ··· 183 155 184 156 static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean); 185 157 186 - static int execmem_cache_add(void *ptr, size_t size) 158 + static int 
execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask) 187 159 { 188 160 struct maple_tree *free_areas = &execmem_cache.free_areas; 189 - struct mutex *mutex = &execmem_cache.mutex; 190 161 unsigned long addr = (unsigned long)ptr; 191 162 MA_STATE(mas, free_areas, addr - 1, addr + 1); 192 163 unsigned long lower, upper; 193 164 void *area = NULL; 194 - int err; 195 165 196 166 lower = addr; 197 167 upper = addr + size - 1; 198 168 199 - mutex_lock(mutex); 200 169 area = mas_walk(&mas); 201 170 if (area && mas.last == addr - 1) 202 171 lower = mas.index; ··· 203 178 upper = mas.last; 204 179 205 180 mas_set_range(&mas, lower, upper); 206 - err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL); 207 - mutex_unlock(mutex); 208 - if (err) 209 - return err; 181 + return mas_store_gfp(&mas, (void *)lower, gfp_mask); 182 + } 210 183 211 - return 0; 184 + static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask) 185 + { 186 + guard(mutex)(&execmem_cache.mutex); 187 + 188 + return execmem_cache_add_locked(ptr, size, gfp_mask); 212 189 } 213 190 214 191 static bool within_range(struct execmem_range *range, struct ma_state *mas, ··· 283 256 284 257 static int execmem_cache_populate(struct execmem_range *range, size_t size) 285 258 { 286 - vm_flags_t vm_flags = VM_ALLOW_HUGE_VMAP; 259 + unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; 287 260 struct vm_struct *vm; 288 261 size_t alloc_size; 289 262 int err = -ENOMEM; ··· 291 264 292 265 alloc_size = round_up(size, PMD_SIZE); 293 266 p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); 267 + if (!p) { 268 + alloc_size = size; 269 + p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); 270 + } 271 + 294 272 if (!p) 295 273 return err; 296 274 ··· 304 272 goto err_free_mem; 305 273 306 274 /* fill memory with instructions that will trap */ 307 - execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); 275 + execmem_fill_trapping_insns(p, alloc_size); 308 276 309 277 err = 
set_memory_rox((unsigned long)p, vm->nr_pages); 310 278 if (err) 311 279 goto err_free_mem; 312 280 313 - err = execmem_cache_add(p, alloc_size); 281 + err = execmem_cache_add(p, alloc_size, GFP_KERNEL); 314 282 if (err) 315 283 goto err_reset_direct_map; 316 284 ··· 339 307 return __execmem_cache_alloc(range, size); 340 308 } 341 309 310 + static inline bool is_pending_free(void *ptr) 311 + { 312 + return ((unsigned long)ptr & PENDING_FREE_MASK); 313 + } 314 + 315 + static inline void *pending_free_set(void *ptr) 316 + { 317 + return (void *)((unsigned long)ptr | PENDING_FREE_MASK); 318 + } 319 + 320 + static inline void *pending_free_clear(void *ptr) 321 + { 322 + return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK); 323 + } 324 + 325 + static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask) 326 + { 327 + size_t size = mas_range_len(mas); 328 + int err; 329 + 330 + err = execmem_force_rw(ptr, size); 331 + if (err) 332 + return err; 333 + 334 + execmem_fill_trapping_insns(ptr, size); 335 + execmem_restore_rox(ptr, size); 336 + 337 + err = execmem_cache_add_locked(ptr, size, gfp_mask); 338 + if (err) 339 + return err; 340 + 341 + mas_store_gfp(mas, NULL, gfp_mask); 342 + return 0; 343 + } 344 + 345 + static void execmem_cache_free_slow(struct work_struct *work); 346 + static DECLARE_DELAYED_WORK(execmem_cache_free_work, execmem_cache_free_slow); 347 + 348 + static void execmem_cache_free_slow(struct work_struct *work) 349 + { 350 + struct maple_tree *busy_areas = &execmem_cache.busy_areas; 351 + MA_STATE(mas, busy_areas, 0, ULONG_MAX); 352 + void *area; 353 + 354 + guard(mutex)(&execmem_cache.mutex); 355 + 356 + if (!execmem_cache.pending_free_cnt) 357 + return; 358 + 359 + mas_for_each(&mas, area, ULONG_MAX) { 360 + if (!is_pending_free(area)) 361 + continue; 362 + 363 + area = pending_free_clear(area); 364 + if (__execmem_cache_free(&mas, area, GFP_KERNEL)) 365 + continue; 366 + 367 + execmem_cache.pending_free_cnt--; 368 + } 369 + 
370 + if (execmem_cache.pending_free_cnt) 371 + schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY); 372 + else 373 + schedule_work(&execmem_cache_clean_work); 374 + } 375 + 342 376 static bool execmem_cache_free(void *ptr) 343 377 { 344 378 struct maple_tree *busy_areas = &execmem_cache.busy_areas; 345 - struct mutex *mutex = &execmem_cache.mutex; 346 379 unsigned long addr = (unsigned long)ptr; 347 380 MA_STATE(mas, busy_areas, addr, addr); 348 - size_t size; 349 381 void *area; 382 + int err; 350 383 351 - mutex_lock(mutex); 384 + guard(mutex)(&execmem_cache.mutex); 385 + 352 386 area = mas_walk(&mas); 353 - if (!area) { 354 - mutex_unlock(mutex); 387 + if (!area) 355 388 return false; 389 + 390 + err = __execmem_cache_free(&mas, area, GFP_KERNEL | __GFP_NORETRY); 391 + if (err) { 392 + /* 393 + * mas points to exact slot we've got the area from, nothing 394 + * else can modify the tree because of the mutex, so there 395 + * won't be any allocations in mas_store_gfp() and it will just 396 + * change the pointer. 
397 + */ 398 + area = pending_free_set(area); 399 + mas_store_gfp(&mas, area, GFP_KERNEL); 400 + execmem_cache.pending_free_cnt++; 401 + schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY); 402 + return true; 356 403 } 357 - size = mas_range_len(&mas); 358 - 359 - mas_store_gfp(&mas, NULL, GFP_KERNEL); 360 - mutex_unlock(mutex); 361 - 362 - execmem_fill_trapping_insns(ptr, size, /* writable = */ false); 363 - 364 - execmem_cache_add(ptr, size); 365 404 366 405 schedule_work(&execmem_cache_clean_work); 367 406 368 407 return true; 369 408 } 370 409 371 - int execmem_make_temp_rw(void *ptr, size_t size) 372 - { 373 - unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; 374 - unsigned long addr = (unsigned long)ptr; 375 - int ret; 376 - 377 - ret = set_memory_nx(addr, nr); 378 - if (ret) 379 - return ret; 380 - 381 - return set_memory_rw(addr, nr); 382 - } 383 - 384 - int execmem_restore_rox(void *ptr, size_t size) 385 - { 386 - unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; 387 - unsigned long addr = (unsigned long)ptr; 388 - 389 - return set_memory_rox(addr, nr); 390 - } 391 - 392 410 #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ 411 + /* 412 + * when ROX cache is not used the permissions defined by architectures for 413 + * execmem ranges that are updated before use (e.g. 
EXECMEM_MODULE_TEXT) must 414 + * be writable anyway 415 + */ 416 + static inline int execmem_force_rw(void *ptr, size_t size) 417 + { 418 + return 0; 419 + } 420 + 393 421 static void *execmem_cache_alloc(struct execmem_range *range, size_t size) 394 422 { 395 423 return NULL; ··· 465 373 { 466 374 struct execmem_range *range = &execmem_info->ranges[type]; 467 375 bool use_cache = range->flags & EXECMEM_ROX_CACHE; 468 - vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS; 376 + unsigned long vm_flags = VM_FLUSH_RESET_PERMS; 469 377 pgprot_t pgprot = range->pgprot; 470 - void *p; 378 + void *p = NULL; 471 379 472 380 size = PAGE_ALIGN(size); 473 381 ··· 477 385 p = execmem_vmalloc(range, size, pgprot, vm_flags); 478 386 479 387 return kasan_reset_tag(p); 388 + } 389 + 390 + void *execmem_alloc_rw(enum execmem_type type, size_t size) 391 + { 392 + void *p __free(execmem) = execmem_alloc(type, size); 393 + int err; 394 + 395 + if (!p) 396 + return NULL; 397 + 398 + err = execmem_force_rw(p, size); 399 + if (err) 400 + return NULL; 401 + 402 + return no_free_ptr(p); 480 403 } 481 404 482 405 void execmem_free(void *ptr) ··· 504 397 505 398 if (!execmem_cache_free(ptr)) 506 399 vfree(ptr); 507 - } 508 - 509 - void *execmem_update_copy(void *dst, const void *src, size_t size) 510 - { 511 - return text_poke_copy(dst, src, size); 512 400 } 513 401 514 402 bool execmem_is_rox(enum execmem_type type)
+1 -1
mm/internal.h
··· 1391 1391 1392 1392 struct vm_struct *__get_vm_area_node(unsigned long size, 1393 1393 unsigned long align, unsigned long shift, 1394 - vm_flags_t vm_flags, unsigned long start, 1394 + unsigned long vm_flags, unsigned long start, 1395 1395 unsigned long end, int node, gfp_t gfp_mask, 1396 1396 const void *caller); 1397 1397
-30
mm/io-mapping.c
··· 1 - // SPDX-License-Identifier: GPL-2.0-only 2 - 3 - #include <linux/mm.h> 4 - #include <linux/io-mapping.h> 5 - 6 - /** 7 - * io_mapping_map_user - remap an I/O mapping to userspace 8 - * @iomap: the source io_mapping 9 - * @vma: user vma to map to 10 - * @addr: target user address to start at 11 - * @pfn: physical address of kernel memory 12 - * @size: size of map area 13 - * 14 - * Note: this is only safe if the mm semaphore is held when called. 15 - */ 16 - int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, 17 - unsigned long addr, unsigned long pfn, unsigned long size) 18 - { 19 - vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; 20 - 21 - if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags)) 22 - return -EINVAL; 23 - 24 - pgprot_t remap_prot = __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | 25 - (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK)); 26 - 27 - /* We rely on prevalidation of the io-mapping to skip pfnmap tracking. */ 28 - return remap_pfn_range_notrack(vma, addr, pfn, size, remap_prot); 29 - } 30 - EXPORT_SYMBOL_GPL(io_mapping_map_user);
+18 -7
mm/kasan/common.c
··· 230 230 } 231 231 232 232 static inline void poison_slab_object(struct kmem_cache *cache, void *object, 233 - bool init, bool still_accessible) 233 + bool init) 234 234 { 235 235 void *tagged_object = object; 236 236 237 237 object = kasan_reset_tag(object); 238 - 239 - /* RCU slabs could be legally used after free within the RCU period. */ 240 - if (unlikely(still_accessible)) 241 - return; 242 238 243 239 kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), 244 240 KASAN_SLAB_FREE, init); ··· 257 261 if (!kasan_arch_is_ready() || is_kfence_address(object)) 258 262 return false; 259 263 260 - poison_slab_object(cache, object, init, still_accessible); 264 + /* 265 + * If this point is reached with an object that must still be 266 + * accessible under RCU, we can't poison it; in that case, also skip the 267 + * quarantine. This should mostly only happen when CONFIG_SLUB_RCU_DEBUG 268 + * has been disabled manually. 269 + * 270 + * Putting the object on the quarantine wouldn't help catch UAFs (since 271 + * we can't poison it here), and it would mask bugs caused by 272 + * SLAB_TYPESAFE_BY_RCU users not being careful enough about object 273 + * reuse; so overall, putting the object into the quarantine here would 274 + * be counterproductive. 275 + */ 276 + if (still_accessible) 277 + return false; 278 + 279 + poison_slab_object(cache, object, init); 261 280 262 281 /* 263 282 * If the object is put into quarantine, do not let slab put the object ··· 530 519 if (check_slab_allocation(slab->slab_cache, ptr, ip)) 531 520 return false; 532 521 533 - poison_slab_object(slab->slab_cache, ptr, false, false); 522 + poison_slab_object(slab->slab_cache, ptr, false); 534 523 return true; 535 524 } 536 525
+39 -19
mm/khugepaged.c
··· 700 700 spinlock_t *ptl, 701 701 struct list_head *compound_pagelist) 702 702 { 703 + unsigned long end = address + HPAGE_PMD_SIZE; 703 704 struct folio *src, *tmp; 704 - pte_t *_pte; 705 705 pte_t pteval; 706 + pte_t *_pte; 707 + unsigned int nr_ptes; 706 708 707 - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; 708 - _pte++, address += PAGE_SIZE) { 709 + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes, 710 + address += nr_ptes * PAGE_SIZE) { 711 + nr_ptes = 1; 709 712 pteval = ptep_get(_pte); 710 713 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { 711 714 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); ··· 725 722 struct page *src_page = pte_page(pteval); 726 723 727 724 src = page_folio(src_page); 728 - if (!folio_test_large(src)) 725 + 726 + if (folio_test_large(src)) { 727 + unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT; 728 + 729 + nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes); 730 + } else { 729 731 release_pte_folio(src); 732 + } 733 + 730 734 /* 731 735 * ptl mostly unnecessary, but preempt has to 732 736 * be disabled to update the per-cpu stats 733 737 * inside folio_remove_rmap_pte(). 
734 738 */ 735 739 spin_lock(ptl); 736 - ptep_clear(vma->vm_mm, address, _pte); 737 - folio_remove_rmap_pte(src, src_page, vma); 740 + clear_ptes(vma->vm_mm, address, _pte, nr_ptes); 741 + folio_remove_rmap_ptes(src, src_page, nr_ptes, vma); 738 742 spin_unlock(ptl); 739 - free_folio_and_swap_cache(src); 743 + free_swap_cache(src); 744 + folio_put_refs(src, nr_ptes); 740 745 } 741 746 } 742 747 ··· 1503 1492 int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, 1504 1493 bool install_pmd) 1505 1494 { 1495 + int nr_mapped_ptes = 0, result = SCAN_FAIL; 1496 + unsigned int nr_batch_ptes; 1506 1497 struct mmu_notifier_range range; 1507 1498 bool notified = false; 1508 1499 unsigned long haddr = addr & HPAGE_PMD_MASK; 1500 + unsigned long end = haddr + HPAGE_PMD_SIZE; 1509 1501 struct vm_area_struct *vma = vma_lookup(mm, haddr); 1510 1502 struct folio *folio; 1511 1503 pte_t *start_pte, *pte; 1512 1504 pmd_t *pmd, pgt_pmd; 1513 1505 spinlock_t *pml = NULL, *ptl; 1514 - int nr_ptes = 0, result = SCAN_FAIL; 1515 1506 int i; 1516 1507 1517 1508 mmap_assert_locked(mm); ··· 1627 1614 goto abort; 1628 1615 1629 1616 /* step 2: clear page table and adjust rmap */ 1630 - for (i = 0, addr = haddr, pte = start_pte; 1631 - i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { 1617 + for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; 1618 + i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE, 1619 + pte += nr_batch_ptes) { 1620 + unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT; 1632 1621 struct page *page; 1633 1622 pte_t ptent = ptep_get(pte); 1623 + 1624 + nr_batch_ptes = 1; 1634 1625 1635 1626 if (pte_none(ptent)) 1636 1627 continue; ··· 1649 1632 goto abort; 1650 1633 } 1651 1634 page = vm_normal_page(vma, addr, ptent); 1635 + 1652 1636 if (folio_page(folio, i) != page) 1653 1637 goto abort; 1638 + 1639 + nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes); 1654 1640 1655 1641 /* 1656 1642 * Must clear entry, or a racing 
truncate may re-remove it. 1657 1643 * TLB flush can be left until pmdp_collapse_flush() does it. 1658 1644 * PTE dirty? Shmem page is already dirty; file is read-only. 1659 1645 */ 1660 - ptep_clear(mm, addr, pte); 1661 - folio_remove_rmap_pte(folio, page, vma); 1662 - nr_ptes++; 1646 + clear_ptes(mm, addr, pte, nr_batch_ptes); 1647 + folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma); 1648 + nr_mapped_ptes += nr_batch_ptes; 1663 1649 } 1664 1650 1665 1651 if (!pml) 1666 1652 spin_unlock(ptl); 1667 1653 1668 1654 /* step 3: set proper refcount and mm_counters. */ 1669 - if (nr_ptes) { 1670 - folio_ref_sub(folio, nr_ptes); 1671 - add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); 1655 + if (nr_mapped_ptes) { 1656 + folio_ref_sub(folio, nr_mapped_ptes); 1657 + add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); 1672 1658 } 1673 1659 1674 1660 /* step 4: remove empty page table */ ··· 1704 1684 : SCAN_SUCCEED; 1705 1685 goto drop_folio; 1706 1686 abort: 1707 - if (nr_ptes) { 1687 + if (nr_mapped_ptes) { 1708 1688 flush_tlb_mm(mm); 1709 - folio_ref_sub(folio, nr_ptes); 1710 - add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); 1689 + folio_ref_sub(folio, nr_mapped_ptes); 1690 + add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); 1711 1691 } 1712 1692 unlock: 1713 1693 if (start_pte)
+70 -1
mm/madvise.c
··· 19 19 #include <linux/sched.h> 20 20 #include <linux/sched/mm.h> 21 21 #include <linux/mm_inline.h> 22 + #include <linux/mmu_context.h> 22 23 #include <linux/string.h> 23 24 #include <linux/uio.h> 24 25 #include <linux/ksm.h> ··· 1257 1256 &guard_remove_walk_ops, NULL); 1258 1257 } 1259 1258 1259 + #ifdef CONFIG_64BIT 1260 + /* Does the madvise operation result in discarding of mapped data? */ 1261 + static bool is_discard(int behavior) 1262 + { 1263 + switch (behavior) { 1264 + case MADV_FREE: 1265 + case MADV_DONTNEED: 1266 + case MADV_DONTNEED_LOCKED: 1267 + case MADV_REMOVE: 1268 + case MADV_DONTFORK: 1269 + case MADV_WIPEONFORK: 1270 + case MADV_GUARD_INSTALL: 1271 + return true; 1272 + } 1273 + 1274 + return false; 1275 + } 1276 + 1277 + /* 1278 + * We are restricted from madvise()'ing mseal()'d VMAs only in very particular 1279 + * circumstances - discarding of data from read-only anonymous SEALED mappings. 1280 + * 1281 + * This is because users cannot trivally discard data from these VMAs, and may 1282 + * only do so via an appropriate madvise() call. 1283 + */ 1284 + static bool can_madvise_modify(struct madvise_behavior *madv_behavior) 1285 + { 1286 + struct vm_area_struct *vma = madv_behavior->vma; 1287 + 1288 + /* If the VMA isn't sealed we're good. */ 1289 + if (!vma_is_sealed(vma)) 1290 + return true; 1291 + 1292 + /* For a sealed VMA, we only care about discard operations. */ 1293 + if (!is_discard(madv_behavior->behavior)) 1294 + return true; 1295 + 1296 + /* 1297 + * We explicitly permit all file-backed mappings, whether MAP_SHARED or 1298 + * MAP_PRIVATE. 1299 + * 1300 + * The latter causes some complications. Because now, one can mmap() 1301 + * read/write a MAP_PRIVATE mapping, write to it, then mprotect() 1302 + * read-only, mseal() and a discard will be permitted. 
1303 + * 1304 + * However, in order to avoid issues with potential use of madvise(..., 1305 + * MADV_DONTNEED) of mseal()'d .text mappings we, for the time being, 1306 + * permit this. 1307 + */ 1308 + if (!vma_is_anonymous(vma)) 1309 + return true; 1310 + 1311 + /* If the user could write to the mapping anyway, then this is fine. */ 1312 + if ((vma->vm_flags & VM_WRITE) && 1313 + arch_vma_access_permitted(vma, /* write= */ true, 1314 + /* execute= */ false, /* foreign= */ false)) 1315 + return true; 1316 + 1317 + /* Otherwise, we are not permitted to perform this operation. */ 1318 + return false; 1319 + } 1320 + #else 1321 + static bool can_madvise_modify(struct madvise_behavior *madv_behavior) 1322 + { 1323 + return true; 1324 + } 1325 + #endif 1326 + 1260 1327 /* 1261 1328 * Apply an madvise behavior to a region of a vma. madvise_update_vma 1262 1329 * will handle splitting a vm area into separate areas, each area with its own ··· 1338 1269 struct madvise_behavior_range *range = &madv_behavior->range; 1339 1270 int error; 1340 1271 1341 - if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior))) 1272 + if (unlikely(!can_madvise_modify(madv_behavior))) 1342 1273 return -EPERM; 1343 1274 1344 1275 switch (behavior) {
+9 -3
mm/memory-failure.c
··· 837 837 struct mm_walk *walk) 838 838 { 839 839 struct hwpoison_walk *hwp = walk->private; 840 - pte_t pte = huge_ptep_get(walk->mm, addr, ptep); 841 840 struct hstate *h = hstate_vma(walk->vma); 841 + spinlock_t *ptl; 842 + pte_t pte; 843 + int ret; 842 844 843 - return check_hwpoisoned_entry(pte, addr, huge_page_shift(h), 844 - hwp->pfn, &hwp->tk); 845 + ptl = huge_pte_lock(h, walk->mm, ptep); 846 + pte = huge_ptep_get(walk->mm, addr, ptep); 847 + ret = check_hwpoisoned_entry(pte, addr, huge_page_shift(h), 848 + hwp->pfn, &hwp->tk); 849 + spin_unlock(ptl); 850 + return ret; 845 851 } 846 852 #else 847 853 #define hwpoison_hugetlb_range NULL
+10 -14
mm/mempool.c
··· 136 136 137 137 static __always_inline void add_element(mempool_t *pool, void *element) 138 138 { 139 - BUG_ON(pool->curr_nr >= pool->min_nr); 139 + BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr); 140 140 poison_element(pool, element); 141 141 if (kasan_poison_element(pool, element)) 142 142 pool->elements[pool->curr_nr++] = element; ··· 202 202 pool->alloc = alloc_fn; 203 203 pool->free = free_fn; 204 204 init_waitqueue_head(&pool->wait); 205 - 206 - pool->elements = kmalloc_array_node(min_nr, sizeof(void *), 205 + /* 206 + * max() used here to ensure storage for at least 1 element to support 207 + * zero minimum pool 208 + */ 209 + pool->elements = kmalloc_array_node(max(1, min_nr), sizeof(void *), 207 210 gfp_mask, node_id); 208 211 if (!pool->elements) 209 212 return -ENOMEM; 210 213 211 214 /* 212 - * First pre-allocate the guaranteed number of buffers. 215 + * First pre-allocate the guaranteed number of buffers, 216 + * also pre-allocate 1 element for zero minimum pool. 213 217 */ 214 - while (pool->curr_nr < pool->min_nr) { 218 + while (pool->curr_nr < max(1, pool->min_nr)) { 215 219 void *element; 216 220 217 221 element = pool->alloc(gfp_mask, pool->pool_data); ··· 559 555 * wake-up path of previous test. This explicit check ensures the 560 556 * allocation of element when both min_nr and curr_nr are 0, and 561 557 * any active waiters are properly awakened. 562 - * 563 - * Inline the same logic as previous test, add_element() cannot be 564 - * directly used here since it has BUG_ON to deny if min_nr equals 565 - * curr_nr, so here picked rest of add_element() to use without 566 - * BUG_ON check. 
567 558 */ 568 559 if (unlikely(pool->min_nr == 0 && 569 560 READ_ONCE(pool->curr_nr) == 0)) { 570 561 spin_lock_irqsave(&pool->lock, flags); 571 562 if (likely(pool->curr_nr == 0)) { 572 - /* Inline the logic of add_element() */ 573 - poison_element(pool, element); 574 - if (kasan_poison_element(pool, element)) 575 - pool->elements[pool->curr_nr++] = element; 563 + add_element(pool, element); 576 564 spin_unlock_irqrestore(&pool->lock, flags); 577 565 if (wq_has_sleeper(&pool->wait)) 578 566 wake_up(&pool->wait);
+3
mm/mincore.c
··· 29 29 #ifdef CONFIG_HUGETLB_PAGE 30 30 unsigned char present; 31 31 unsigned char *vec = walk->private; 32 + spinlock_t *ptl; 32 33 34 + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); 33 35 /* 34 36 * Hugepages under user process are always in RAM and never 35 37 * swapped out, but theoretically it needs to be checked. ··· 40 38 for (; addr != end; vec++, addr += PAGE_SIZE) 41 39 *vec = present; 42 40 walk->private = vec; 41 + spin_unlock(ptl); 43 42 #else 44 43 BUG(); 45 44 #endif
+3 -7
mm/mmap_lock.c
··· 164 164 */ 165 165 166 166 /* Check if the vma we locked is the right one. */ 167 - if (unlikely(vma->vm_mm != mm || 168 - address < vma->vm_start || address >= vma->vm_end)) 167 + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) 169 168 goto inval_end_read; 170 169 171 170 rcu_read_unlock(); ··· 235 236 goto fallback; 236 237 } 237 238 238 - /* 239 - * Verify the vma we locked belongs to the same address space and it's 240 - * not behind of the last search position. 241 - */ 242 - if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end)) 239 + /* Verify the vma is not behind the last search position. */ 240 + if (unlikely(from_addr >= vma->vm_end)) 243 241 goto fallback_unlock; 244 242 245 243 /*
+1 -1
mm/mprotect.c
··· 766 766 unsigned long charged = 0; 767 767 int error; 768 768 769 - if (!can_modify_vma(vma)) 769 + if (vma_is_sealed(vma)) 770 770 return -EPERM; 771 771 772 772 if (newflags == oldflags) {
+2 -2
mm/mremap.c
··· 280 280 old_pte, max_nr_ptes); 281 281 force_flush = true; 282 282 } 283 - pte = get_and_clear_full_ptes(mm, old_addr, old_ptep, nr_ptes, 0); 283 + pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes); 284 284 pte = move_pte(pte, old_addr, new_addr); 285 285 pte = move_soft_dirty_pte(pte); 286 286 ··· 1651 1651 return -EFAULT; 1652 1652 1653 1653 /* If mseal()'d, mremap() is prohibited. */ 1654 - if (!can_modify_vma(vma)) 1654 + if (vma_is_sealed(vma)) 1655 1655 return -EPERM; 1656 1656 1657 1657 /* Align to hugetlb page size, if required. */
+46 -124
mm/mseal.c
··· 11 11 #include <linux/mman.h> 12 12 #include <linux/mm.h> 13 13 #include <linux/mm_inline.h> 14 - #include <linux/mmu_context.h> 15 14 #include <linux/syscalls.h> 16 15 #include <linux/sched.h> 17 16 #include "internal.h" 18 17 19 - static inline void set_vma_sealed(struct vm_area_struct *vma) 20 - { 21 - vm_flags_set(vma, VM_SEALED); 22 - } 23 - 24 - static bool is_madv_discard(int behavior) 25 - { 26 - switch (behavior) { 27 - case MADV_FREE: 28 - case MADV_DONTNEED: 29 - case MADV_DONTNEED_LOCKED: 30 - case MADV_REMOVE: 31 - case MADV_DONTFORK: 32 - case MADV_WIPEONFORK: 33 - case MADV_GUARD_INSTALL: 34 - return true; 35 - } 36 - 37 - return false; 38 - } 39 - 40 - static bool is_ro_anon(struct vm_area_struct *vma) 41 - { 42 - /* check anonymous mapping. */ 43 - if (vma->vm_file || vma->vm_flags & VM_SHARED) 44 - return false; 45 - 46 - /* 47 - * check for non-writable: 48 - * PROT=RO or PKRU is not writeable. 49 - */ 50 - if (!(vma->vm_flags & VM_WRITE) || 51 - !arch_vma_access_permitted(vma, true, false, false)) 52 - return true; 53 - 54 - return false; 55 - } 18 + /* 19 + * mseal() disallows an input range which contain unmapped ranges (VMA holes). 20 + * 21 + * It disallows unmapped regions from start to end whether they exist at the 22 + * start, in the middle, or at the end of the range, or any combination thereof. 23 + * 24 + * This is because after sealng a range, there's nothing to stop memory mapping 25 + * of ranges in the remaining gaps later, meaning that the user might then 26 + * wrongly consider the entirety of the mseal()'d range to be sealed when it 27 + * in fact isn't. 28 + */ 56 29 57 30 /* 58 - * Check if a vma is allowed to be modified by madvise. 31 + * Does the [start, end) range contain any unmapped memory? 32 + * 33 + * We ensure that: 34 + * - start is part of a valid VMA. 35 + * - end is part of a valid VMA. 36 + * - no gap (unallocated memory) exists between start and end. 
59 37 */ 60 - bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) 61 - { 62 - if (!is_madv_discard(behavior)) 63 - return true; 64 - 65 - if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma))) 66 - return false; 67 - 68 - /* Allow by default. */ 69 - return true; 70 - } 71 - 72 - static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, 73 - struct vm_area_struct **prev, unsigned long start, 74 - unsigned long end, vm_flags_t newflags) 75 - { 76 - int ret = 0; 77 - vm_flags_t oldflags = vma->vm_flags; 78 - 79 - if (newflags == oldflags) 80 - goto out; 81 - 82 - vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); 83 - if (IS_ERR(vma)) { 84 - ret = PTR_ERR(vma); 85 - goto out; 86 - } 87 - 88 - set_vma_sealed(vma); 89 - out: 90 - *prev = vma; 91 - return ret; 92 - } 93 - 94 - /* 95 - * Check for do_mseal: 96 - * 1> start is part of a valid vma. 97 - * 2> end is part of a valid vma. 98 - * 3> No gap (unallocated address) between start and end. 99 - * 4> map is sealable. 100 - */ 101 - static int check_mm_seal(unsigned long start, unsigned long end) 38 + static bool range_contains_unmapped(struct mm_struct *mm, 39 + unsigned long start, unsigned long end) 102 40 { 103 41 struct vm_area_struct *vma; 104 - unsigned long nstart = start; 105 - 42 + unsigned long prev_end = start; 106 43 VMA_ITERATOR(vmi, current->mm, start); 107 44 108 - /* going through each vma to check. */ 109 45 for_each_vma_range(vmi, vma, end) { 110 - if (vma->vm_start > nstart) 111 - /* unallocated memory found. */ 112 - return -ENOMEM; 46 + if (vma->vm_start > prev_end) 47 + return true; 113 48 114 - if (vma->vm_end >= end) 115 - return 0; 116 - 117 - nstart = vma->vm_end; 49 + prev_end = vma->vm_end; 118 50 } 119 51 120 - return -ENOMEM; 52 + return prev_end < end; 121 53 } 122 54 123 - /* 124 - * Apply sealing. 
125 - */ 126 - static int apply_mm_seal(unsigned long start, unsigned long end) 55 + static int mseal_apply(struct mm_struct *mm, 56 + unsigned long start, unsigned long end) 127 57 { 128 - unsigned long nstart; 129 58 struct vm_area_struct *vma, *prev; 59 + unsigned long curr_start = start; 60 + VMA_ITERATOR(vmi, mm, start); 130 61 131 - VMA_ITERATOR(vmi, current->mm, start); 132 - 62 + /* We know there are no gaps so this will be non-NULL. */ 133 63 vma = vma_iter_load(&vmi); 134 - /* 135 - * Note: check_mm_seal should already checked ENOMEM case. 136 - * so vma should not be null, same for the other ENOMEM cases. 137 - */ 138 64 prev = vma_prev(&vmi); 139 65 if (start > vma->vm_start) 140 66 prev = vma; 141 67 142 - nstart = start; 143 68 for_each_vma_range(vmi, vma, end) { 144 - int error; 145 - unsigned long tmp; 146 - vm_flags_t newflags; 69 + unsigned long curr_end = MIN(vma->vm_end, end); 147 70 148 - newflags = vma->vm_flags | VM_SEALED; 149 - tmp = vma->vm_end; 150 - if (tmp > end) 151 - tmp = end; 152 - error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags); 153 - if (error) 154 - return error; 155 - nstart = vma_iter_end(&vmi); 71 + if (!(vma->vm_flags & VM_SEALED)) { 72 + vma = vma_modify_flags(&vmi, prev, vma, 73 + curr_start, curr_end, 74 + vma->vm_flags | VM_SEALED); 75 + if (IS_ERR(vma)) 76 + return PTR_ERR(vma); 77 + vm_flags_set(vma, VM_SEALED); 78 + } 79 + 80 + prev = vma; 81 + curr_start = curr_end; 156 82 } 157 83 158 84 return 0; ··· 166 240 if (mmap_write_lock_killable(mm)) 167 241 return -EINTR; 168 242 169 - /* 170 - * First pass, this helps to avoid 171 - * partial sealing in case of error in input address range, 172 - * e.g. ENOMEM error. 
173 - */ 174 - ret = check_mm_seal(start, end); 175 - if (ret) 243 + if (range_contains_unmapped(mm, start, end)) { 244 + ret = -ENOMEM; 176 245 goto out; 246 + } 177 247 178 248 /* 179 249 * Second pass, this should success, unless there are errors ··· 177 255 * reaching the max supported VMAs, however, those cases shall 178 256 * be rare. 179 257 */ 180 - ret = apply_mm_seal(start, end); 258 + ret = mseal_apply(mm, start, end); 181 259 182 260 out: 183 - mmap_write_unlock(current->mm); 261 + mmap_write_unlock(mm); 184 262 return ret; 185 263 } 186 264
+1 -1
mm/nommu.c
··· 126 126 127 127 void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, 128 128 unsigned long start, unsigned long end, gfp_t gfp_mask, 129 - pgprot_t prot, vm_flags_t vm_flags, int node, 129 + pgprot_t prot, unsigned long vm_flags, int node, 130 130 const void *caller) 131 131 { 132 132 return __vmalloc_noprof(size, gfp_mask);
+1 -1
mm/rmap.c
··· 2036 2036 flush_cache_range(vma, address, end_addr); 2037 2037 2038 2038 /* Nuke the page table entry. */ 2039 - pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0); 2039 + pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages); 2040 2040 /* 2041 2041 * We clear the PTE but do not flush so potentially 2042 2042 * a remote CPU could still be writing to the folio.
+155 -126
mm/shmem.c
··· 512 512 513 513 /* 514 514 * Sometimes, before we decide whether to proceed or to fail, we must check 515 - * that an entry was not already brought back from swap by a racing thread. 515 + * that an entry was not already brought back or split by a racing thread. 516 516 * 517 517 * Checking folio is not enough: by the time a swapcache folio is locked, it 518 518 * might be reused, and again be swapcache, using the same swap as before. 519 + * Returns the swap entry's order if it still presents, else returns -1. 519 520 */ 520 - static bool shmem_confirm_swap(struct address_space *mapping, 521 - pgoff_t index, swp_entry_t swap) 521 + static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, 522 + swp_entry_t swap) 522 523 { 523 - return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); 524 + XA_STATE(xas, &mapping->i_pages, index); 525 + int ret = -1; 526 + void *entry; 527 + 528 + rcu_read_lock(); 529 + do { 530 + entry = xas_load(&xas); 531 + if (entry == swp_to_radix_entry(swap)) 532 + ret = xas_get_order(&xas); 533 + } while (xas_retry(&xas, entry)); 534 + rcu_read_unlock(); 535 + return ret; 524 536 } 525 537 526 538 /* ··· 903 891 pgoff_t index, void *expected, gfp_t gfp) 904 892 { 905 893 XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); 906 - long nr = folio_nr_pages(folio); 894 + unsigned long nr = folio_nr_pages(folio); 895 + swp_entry_t iter, swap; 896 + void *entry; 907 897 908 898 VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); 909 899 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); ··· 917 903 918 904 gfp &= GFP_RECLAIM_MASK; 919 905 folio_throttle_swaprate(folio, gfp); 906 + swap = radix_to_swp_entry(expected); 920 907 921 908 do { 909 + iter = swap; 922 910 xas_lock_irq(&xas); 923 - if (expected != xas_find_conflict(&xas)) { 924 - xas_set_err(&xas, -EEXIST); 925 - goto unlock; 911 + xas_for_each_conflict(&xas, entry) { 912 + /* 913 + * The range must either be empty, or filled with 914 + 
* expected swap entries. Shmem swap entries are never 915 + * partially freed without split of both entry and 916 + * folio, so there shouldn't be any holes. 917 + */ 918 + if (!expected || entry != swp_to_radix_entry(iter)) { 919 + xas_set_err(&xas, -EEXIST); 920 + goto unlock; 921 + } 922 + iter.val += 1 << xas_get_order(&xas); 926 923 } 927 - if (expected && xas_find_conflict(&xas)) { 924 + if (expected && iter.val - nr != swap.val) { 928 925 xas_set_err(&xas, -EEXIST); 929 926 goto unlock; 930 927 } ··· 2017 1992 swp_entry_t entry, int order, gfp_t gfp) 2018 1993 { 2019 1994 struct shmem_inode_info *info = SHMEM_I(inode); 1995 + int nr_pages = 1 << order; 2020 1996 struct folio *new; 1997 + gfp_t alloc_gfp; 2021 1998 void *shadow; 2022 - int nr_pages; 2023 1999 2024 2000 /* 2025 2001 * We have arrived here because our zones are constrained, so don't 2026 2002 * limit chance of success with further cpuset and node constraints. 2027 2003 */ 2028 2004 gfp &= ~GFP_CONSTRAINT_MASK; 2029 - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) { 2030 - gfp_t huge_gfp = vma_thp_gfp_mask(vma); 2005 + alloc_gfp = gfp; 2006 + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { 2007 + if (WARN_ON_ONCE(order)) 2008 + return ERR_PTR(-EINVAL); 2009 + } else if (order) { 2010 + /* 2011 + * If uffd is active for the vma, we need per-page fault 2012 + * fidelity to maintain the uffd semantics, then fallback 2013 + * to swapin order-0 folio, as well as for zswap case. 2014 + * Any existing sub folio in the swap cache also blocks 2015 + * mTHP swapin. 
2016 + */ 2017 + if ((vma && unlikely(userfaultfd_armed(vma))) || 2018 + !zswap_never_enabled() || 2019 + non_swapcache_batch(entry, nr_pages) != nr_pages) 2020 + goto fallback; 2031 2021 2032 - gfp = limit_gfp_mask(huge_gfp, gfp); 2022 + alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); 2023 + } 2024 + retry: 2025 + new = shmem_alloc_folio(alloc_gfp, order, info, index); 2026 + if (!new) { 2027 + new = ERR_PTR(-ENOMEM); 2028 + goto fallback; 2033 2029 } 2034 2030 2035 - new = shmem_alloc_folio(gfp, order, info, index); 2036 - if (!new) 2037 - return ERR_PTR(-ENOMEM); 2038 - 2039 - nr_pages = folio_nr_pages(new); 2040 2031 if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, 2041 - gfp, entry)) { 2032 + alloc_gfp, entry)) { 2042 2033 folio_put(new); 2043 - return ERR_PTR(-ENOMEM); 2034 + new = ERR_PTR(-ENOMEM); 2035 + goto fallback; 2044 2036 } 2045 2037 2046 2038 /* ··· 2072 2030 */ 2073 2031 if (swapcache_prepare(entry, nr_pages)) { 2074 2032 folio_put(new); 2075 - return ERR_PTR(-EEXIST); 2033 + new = ERR_PTR(-EEXIST); 2034 + /* Try smaller folio to avoid cache conflict */ 2035 + goto fallback; 2076 2036 } 2077 2037 2078 2038 __folio_set_locked(new); ··· 2088 2044 folio_add_lru(new); 2089 2045 swap_read_folio(new, NULL); 2090 2046 return new; 2047 + fallback: 2048 + /* Order 0 swapin failed, nothing to fallback to, abort */ 2049 + if (!order) 2050 + return new; 2051 + entry.val += index - round_down(index, nr_pages); 2052 + alloc_gfp = gfp; 2053 + nr_pages = 1; 2054 + order = 0; 2055 + goto retry; 2091 2056 } 2092 2057 2093 2058 /* ··· 2302 2249 if (xas_error(&xas)) 2303 2250 return xas_error(&xas); 2304 2251 2305 - return entry_order; 2252 + return 0; 2306 2253 } 2307 2254 2308 2255 /* ··· 2319 2266 struct address_space *mapping = inode->i_mapping; 2320 2267 struct mm_struct *fault_mm = vma ? 
vma->vm_mm : NULL; 2321 2268 struct shmem_inode_info *info = SHMEM_I(inode); 2269 + swp_entry_t swap, index_entry; 2322 2270 struct swap_info_struct *si; 2323 2271 struct folio *folio = NULL; 2324 2272 bool skip_swapcache = false; 2325 - swp_entry_t swap; 2326 - int error, nr_pages, order, split_order; 2273 + int error, nr_pages, order; 2274 + pgoff_t offset; 2327 2275 2328 2276 VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); 2329 - swap = radix_to_swp_entry(*foliop); 2277 + index_entry = radix_to_swp_entry(*foliop); 2278 + swap = index_entry; 2330 2279 *foliop = NULL; 2331 2280 2332 - if (is_poisoned_swp_entry(swap)) 2281 + if (is_poisoned_swp_entry(index_entry)) 2333 2282 return -EIO; 2334 2283 2335 - si = get_swap_device(swap); 2336 - if (!si) { 2337 - if (!shmem_confirm_swap(mapping, index, swap)) 2284 + si = get_swap_device(index_entry); 2285 + order = shmem_confirm_swap(mapping, index, index_entry); 2286 + if (unlikely(!si)) { 2287 + if (order < 0) 2338 2288 return -EEXIST; 2339 2289 else 2340 2290 return -EINVAL; 2341 2291 } 2292 + if (unlikely(order < 0)) { 2293 + put_swap_device(si); 2294 + return -EEXIST; 2295 + } 2296 + 2297 + /* index may point to the middle of a large entry, get the sub entry */ 2298 + if (order) { 2299 + offset = index - round_down(index, 1 << order); 2300 + swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); 2301 + } 2342 2302 2343 2303 /* Look it up and read it in.. */ 2344 2304 folio = swap_cache_get_folio(swap, NULL, 0); 2345 - order = xa_get_order(&mapping->i_pages, index); 2346 2305 if (!folio) { 2347 - int nr_pages = 1 << order; 2348 - bool fallback_order0 = false; 2349 - 2350 - /* Or update major stats only when swapin succeeds?? 
*/ 2306 + if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { 2307 + /* Direct swapin skipping swap cache & readahead */ 2308 + folio = shmem_swap_alloc_folio(inode, vma, index, 2309 + index_entry, order, gfp); 2310 + if (IS_ERR(folio)) { 2311 + error = PTR_ERR(folio); 2312 + folio = NULL; 2313 + goto failed; 2314 + } 2315 + skip_swapcache = true; 2316 + } else { 2317 + /* Cached swapin only supports order 0 folio */ 2318 + folio = shmem_swapin_cluster(swap, gfp, info, index); 2319 + if (!folio) { 2320 + error = -ENOMEM; 2321 + goto failed; 2322 + } 2323 + } 2351 2324 if (fault_type) { 2352 2325 *fault_type |= VM_FAULT_MAJOR; 2353 2326 count_vm_event(PGMAJFAULT); 2354 2327 count_memcg_event_mm(fault_mm, PGMAJFAULT); 2355 2328 } 2329 + } 2356 2330 2331 + if (order > folio_order(folio)) { 2357 2332 /* 2358 - * If uffd is active for the vma, we need per-page fault 2359 - * fidelity to maintain the uffd semantics, then fallback 2360 - * to swapin order-0 folio, as well as for zswap case. 2361 - * Any existing sub folio in the swap cache also blocks 2362 - * mTHP swapin. 2363 - */ 2364 - if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || 2365 - !zswap_never_enabled() || 2366 - non_swapcache_batch(swap, nr_pages) != nr_pages)) 2367 - fallback_order0 = true; 2368 - 2369 - /* Skip swapcache for synchronous device. */ 2370 - if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) { 2371 - folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp); 2372 - if (!IS_ERR(folio)) { 2373 - skip_swapcache = true; 2374 - goto alloced; 2375 - } 2376 - 2377 - /* 2378 - * Fallback to swapin order-0 folio unless the swap entry 2379 - * already exists. 2380 - */ 2381 - error = PTR_ERR(folio); 2382 - folio = NULL; 2383 - if (error == -EEXIST) 2384 - goto failed; 2385 - } 2386 - 2387 - /* 2388 - * Now swap device can only swap in order 0 folio, then we 2389 - * should split the large swap entry stored in the pagecache 2390 - * if necessary. 
2391 - */ 2392 - split_order = shmem_split_large_entry(inode, index, swap, gfp); 2393 - if (split_order < 0) { 2394 - error = split_order; 2395 - goto failed; 2396 - } 2397 - 2398 - /* 2399 - * If the large swap entry has already been split, it is 2400 - * necessary to recalculate the new swap entry based on 2401 - * the old order alignment. 2402 - */ 2403 - if (split_order > 0) { 2404 - pgoff_t offset = index - round_down(index, 1 << split_order); 2405 - 2406 - swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); 2407 - } 2408 - 2409 - /* Here we actually start the io */ 2410 - folio = shmem_swapin_cluster(swap, gfp, info, index); 2411 - if (!folio) { 2412 - error = -ENOMEM; 2413 - goto failed; 2414 - } 2415 - } else if (order != folio_order(folio)) { 2416 - /* 2417 - * Swap readahead may swap in order 0 folios into swapcache 2333 + * Swapin may get smaller folios due to various reasons: 2334 + * It may fallback to order 0 due to memory pressure or race, 2335 + * swap readahead may swap in order 0 folios into swapcache 2418 2336 * asynchronously, while the shmem mapping can still stores 2419 2337 * large swap entries. In such cases, we should split the 2420 2338 * large swap entry to prevent possible data corruption. 2421 2339 */ 2422 - split_order = shmem_split_large_entry(inode, index, swap, gfp); 2423 - if (split_order < 0) { 2424 - folio_put(folio); 2425 - folio = NULL; 2426 - error = split_order; 2427 - goto failed; 2428 - } 2429 - 2430 - /* 2431 - * If the large swap entry has already been split, it is 2432 - * necessary to recalculate the new swap entry based on 2433 - * the old order alignment. 
2434 - */ 2435 - if (split_order > 0) { 2436 - pgoff_t offset = index - round_down(index, 1 << split_order); 2437 - 2438 - swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); 2439 - } 2340 + error = shmem_split_large_entry(inode, index, index_entry, gfp); 2341 + if (error) 2342 + goto failed_nolock; 2440 2343 } 2441 2344 2442 - alloced: 2443 - /* We have to do this with folio locked to prevent races */ 2345 + /* 2346 + * If the folio is large, round down swap and index by folio size. 2347 + * No matter what race occurs, the swap layer ensures we either get 2348 + * a valid folio that has its swap entry aligned by size, or a 2349 + * temporarily invalid one which we'll abort very soon and retry. 2350 + * 2351 + * shmem_add_to_page_cache ensures the whole range contains expected 2352 + * entries and prevents any corruption, so any race split is fine 2353 + * too, it will succeed as long as the entries are still there. 2354 + */ 2355 + nr_pages = folio_nr_pages(folio); 2356 + if (nr_pages > 1) { 2357 + swap.val = round_down(swap.val, nr_pages); 2358 + index = round_down(index, nr_pages); 2359 + } 2360 + 2361 + /* 2362 + * We have to do this with the folio locked to prevent races. 2363 + * The shmem_confirm_swap below only checks if the first swap 2364 + * entry matches the folio, that's enough to ensure the folio 2365 + * is not used outside of shmem, as shmem swap entries 2366 + * and swap cache folios are never partially freed. 
2367 + */ 2444 2368 folio_lock(folio); 2445 2369 if ((!skip_swapcache && !folio_test_swapcache(folio)) || 2446 - folio->swap.val != swap.val || 2447 - !shmem_confirm_swap(mapping, index, swap) || 2448 - xa_get_order(&mapping->i_pages, index) != folio_order(folio)) { 2370 + shmem_confirm_swap(mapping, index, swap) < 0 || 2371 + folio->swap.val != swap.val) { 2449 2372 error = -EEXIST; 2450 2373 goto unlock; 2451 2374 } ··· 2444 2415 goto failed; 2445 2416 } 2446 2417 2447 - error = shmem_add_to_page_cache(folio, mapping, 2448 - round_down(index, nr_pages), 2418 + error = shmem_add_to_page_cache(folio, mapping, index, 2449 2419 swp_to_radix_entry(swap), gfp); 2450 2420 if (error) 2451 2421 goto failed; ··· 2467 2439 *foliop = folio; 2468 2440 return 0; 2469 2441 failed: 2470 - if (!shmem_confirm_swap(mapping, index, swap)) 2442 + if (shmem_confirm_swap(mapping, index, swap) < 0) 2471 2443 error = -EEXIST; 2472 2444 if (error == -EIO) 2473 2445 shmem_set_folio_swapin_error(inode, index, folio, swap, 2474 2446 skip_swapcache); 2475 2447 unlock: 2476 - if (skip_swapcache) 2477 - swapcache_clear(si, swap, folio_nr_pages(folio)); 2478 - if (folio) { 2448 + if (folio) 2479 2449 folio_unlock(folio); 2450 + failed_nolock: 2451 + if (skip_swapcache) 2452 + swapcache_clear(si, folio->swap, folio_nr_pages(folio)); 2453 + if (folio) 2480 2454 folio_put(folio); 2481 - } 2482 2455 put_swap_device(si); 2483 2456 2484 2457 return error; ··· 5989 5960 struct folio *folio; 5990 5961 int error; 5991 5962 5992 - error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE, 5993 - gfp, NULL, NULL); 5963 + error = shmem_get_folio_gfp(inode, index, i_size_read(inode), 5964 + &folio, SGP_CACHE, gfp, NULL, NULL); 5994 5965 if (error) 5995 5966 return ERR_PTR(error); 5996 5967
+2 -2
mm/vma.c
··· 1351 1351 } 1352 1352 1353 1353 /* Don't bother splitting the VMA if we can't unmap it anyway */ 1354 - if (!can_modify_vma(vms->vma)) { 1354 + if (vma_is_sealed(vms->vma)) { 1355 1355 error = -EPERM; 1356 1356 goto start_split_failed; 1357 1357 } ··· 1371 1371 for_each_vma_range(*(vms->vmi), next, vms->end) { 1372 1372 long nrpages; 1373 1373 1374 - if (!can_modify_vma(next)) { 1374 + if (vma_is_sealed(next)) { 1375 1375 error = -EPERM; 1376 1376 goto modify_vma_failed; 1377 1377 }
+2 -25
mm/vma.h
··· 559 559 } 560 560 561 561 #ifdef CONFIG_64BIT 562 - 563 562 static inline bool vma_is_sealed(struct vm_area_struct *vma) 564 563 { 565 564 return (vma->vm_flags & VM_SEALED); 566 565 } 567 - 568 - /* 569 - * check if a vma is sealed for modification. 570 - * return true, if modification is allowed. 571 - */ 572 - static inline bool can_modify_vma(struct vm_area_struct *vma) 573 - { 574 - if (unlikely(vma_is_sealed(vma))) 575 - return false; 576 - 577 - return true; 578 - } 579 - 580 - bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior); 581 - 582 566 #else 583 - 584 - static inline bool can_modify_vma(struct vm_area_struct *vma) 567 + static inline bool vma_is_sealed(struct vm_area_struct *vma) 585 568 { 586 - return true; 569 + return false; 587 570 } 588 - 589 - static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) 590 - { 591 - return true; 592 - } 593 - 594 571 #endif 595 572 596 573 #if defined(CONFIG_STACK_GROWSUP)
+54 -8
tools/testing/selftests/cachestat/test_cachestat.c
··· 33 33 cs->nr_evicted, cs->nr_recently_evicted); 34 34 } 35 35 36 + enum file_type { 37 + FILE_MMAP, 38 + FILE_SHMEM 39 + }; 40 + 36 41 bool write_exactly(int fd, size_t filesize) 37 42 { 38 43 int random_fd = open("/dev/urandom", O_RDONLY); ··· 206 201 out: 207 202 return ret; 208 203 } 204 + const char *file_type_str(enum file_type type) 205 + { 206 + switch (type) { 207 + case FILE_SHMEM: 208 + return "shmem"; 209 + case FILE_MMAP: 210 + return "mmap"; 211 + default: 212 + return "unknown"; 213 + } 214 + } 209 215 210 - bool test_cachestat_shmem(void) 216 + 217 + bool run_cachestat_test(enum file_type type) 211 218 { 212 219 size_t PS = sysconf(_SC_PAGESIZE); 213 220 size_t filesize = PS * 512 * 2; /* 2 2MB huge pages */ ··· 229 212 char *filename = "tmpshmcstat"; 230 213 struct cachestat cs; 231 214 bool ret = true; 215 + int fd; 232 216 unsigned long num_pages = compute_len / PS; 233 - int fd = shm_open(filename, O_CREAT | O_RDWR, 0600); 217 + if (type == FILE_SHMEM) 218 + fd = shm_open(filename, O_CREAT | O_RDWR, 0600); 219 + else 220 + fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, 0666); 234 221 235 222 if (fd < 0) { 236 - ksft_print_msg("Unable to create shmem file.\n"); 223 + ksft_print_msg("Unable to create %s file.\n", 224 + file_type_str(type)); 237 225 ret = false; 238 226 goto out; 239 227 } 240 228 241 229 if (ftruncate(fd, filesize)) { 242 - ksft_print_msg("Unable to truncate shmem file.\n"); 230 + ksft_print_msg("Unable to truncate %s file.\n",file_type_str(type)); 243 231 ret = false; 244 232 goto close_fd; 245 233 } 234 + switch (type) { 235 + case FILE_SHMEM: 236 + if (!write_exactly(fd, filesize)) { 237 + ksft_print_msg("Unable to write to file.\n"); 238 + ret = false; 239 + goto close_fd; 240 + } 241 + break; 242 + case FILE_MMAP: 243 + char *map = mmap(NULL, filesize, PROT_READ | PROT_WRITE, 244 + MAP_SHARED, fd, 0); 246 245 247 - if (!write_exactly(fd, filesize)) { 248 - ksft_print_msg("Unable to write to shmem file.\n"); 246 + if (map 
== MAP_FAILED) { 247 + ksft_print_msg("mmap failed.\n"); 248 + ret = false; 249 + goto close_fd; 250 + } 251 + for (int i = 0; i < filesize; i++) 252 + map[i] = 'A'; 253 + break; 254 + default: 255 + ksft_print_msg("Unsupported file type.\n"); 249 256 ret = false; 250 257 goto close_fd; 251 258 } 252 - 253 259 syscall_ret = syscall(__NR_cachestat, fd, &cs_range, &cs, 0); 254 260 255 261 if (syscall_ret) { ··· 348 308 break; 349 309 } 350 310 351 - if (test_cachestat_shmem()) 311 + if (run_cachestat_test(FILE_SHMEM)) 352 312 ksft_test_result_pass("cachestat works with a shmem file\n"); 353 313 else { 354 314 ksft_test_result_fail("cachestat fails with a shmem file\n"); 355 315 ret = 1; 356 316 } 357 317 318 + if (run_cachestat_test(FILE_MMAP)) 319 + ksft_test_result_pass("cachestat works with a mmap file\n"); 320 + else { 321 + ksft_test_result_fail("cachestat fails with a mmap file\n"); 322 + ret = 1; 323 + } 358 324 return ret; 359 325 }
+1
tools/testing/selftests/mm/.gitignore
··· 21 21 transhuge-stress 22 22 pagemap_ioctl 23 23 pfnmap 24 + process_madv 24 25 *.tmp* 25 26 protection_keys 26 27 protection_keys_32
+1
tools/testing/selftests/mm/Makefile
··· 85 85 TEST_GEN_FILES += on-fault-limit 86 86 TEST_GEN_FILES += pagemap_ioctl 87 87 TEST_GEN_FILES += pfnmap 88 + TEST_GEN_FILES += process_madv 88 89 TEST_GEN_FILES += thuge-gen 89 90 TEST_GEN_FILES += transhuge-stress 90 91 TEST_GEN_FILES += uffd-stress
+344
tools/testing/selftests/mm/process_madv.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + 3 + #define _GNU_SOURCE 4 + #include "../kselftest_harness.h" 5 + #include <errno.h> 6 + #include <setjmp.h> 7 + #include <signal.h> 8 + #include <stdbool.h> 9 + #include <stdio.h> 10 + #include <stdlib.h> 11 + #include <string.h> 12 + #include <linux/mman.h> 13 + #include <sys/syscall.h> 14 + #include <unistd.h> 15 + #include <sched.h> 16 + #include "vm_util.h" 17 + 18 + #include "../pidfd/pidfd.h" 19 + 20 + FIXTURE(process_madvise) 21 + { 22 + unsigned long page_size; 23 + pid_t child_pid; 24 + int remote_pidfd; 25 + int pidfd; 26 + }; 27 + 28 + FIXTURE_SETUP(process_madvise) 29 + { 30 + self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); 31 + self->pidfd = PIDFD_SELF; 32 + self->remote_pidfd = -1; 33 + self->child_pid = -1; 34 + }; 35 + 36 + FIXTURE_TEARDOWN_PARENT(process_madvise) 37 + { 38 + /* This teardown is guaranteed to run, even if tests SKIP or ASSERT */ 39 + if (self->child_pid > 0) { 40 + kill(self->child_pid, SIGKILL); 41 + waitpid(self->child_pid, NULL, 0); 42 + } 43 + 44 + if (self->remote_pidfd >= 0) 45 + close(self->remote_pidfd); 46 + } 47 + 48 + static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, 49 + size_t vlen, int advice, unsigned int flags) 50 + { 51 + return syscall(__NR_process_madvise, pidfd, iovec, vlen, advice, flags); 52 + } 53 + 54 + /* 55 + * This test uses PIDFD_SELF to target the current process. The main 56 + * goal is to verify the basic behavior of process_madvise() with 57 + * a vector of non-contiguous memory ranges, not its cross-process 58 + * capabilities. 59 + */ 60 + TEST_F(process_madvise, basic) 61 + { 62 + const unsigned long pagesize = self->page_size; 63 + const int madvise_pages = 4; 64 + struct iovec vec[madvise_pages]; 65 + int pidfd = self->pidfd; 66 + ssize_t ret; 67 + char *map; 68 + 69 + /* 70 + * Create a single large mapping. We will pick pages from this 71 + * mapping to advise on. This ensures we test non-contiguous iovecs. 
72 + */ 73 + map = mmap(NULL, pagesize * 10, PROT_READ | PROT_WRITE, 74 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 75 + if (map == MAP_FAILED) 76 + SKIP(return, "mmap failed, not enough memory.\n"); 77 + 78 + /* Fill the entire region with a known pattern. */ 79 + memset(map, 'A', pagesize * 10); 80 + 81 + /* 82 + * Setup the iovec to point to 4 non-contiguous pages 83 + * within the mapping. 84 + */ 85 + vec[0].iov_base = &map[0 * pagesize]; 86 + vec[0].iov_len = pagesize; 87 + vec[1].iov_base = &map[3 * pagesize]; 88 + vec[1].iov_len = pagesize; 89 + vec[2].iov_base = &map[5 * pagesize]; 90 + vec[2].iov_len = pagesize; 91 + vec[3].iov_base = &map[8 * pagesize]; 92 + vec[3].iov_len = pagesize; 93 + 94 + ret = sys_process_madvise(pidfd, vec, madvise_pages, MADV_DONTNEED, 0); 95 + if (ret == -1 && errno == EPERM) 96 + SKIP(return, 97 + "process_madvise() unsupported or permission denied, try running as root.\n"); 98 + else if (errno == EINVAL) 99 + SKIP(return, 100 + "process_madvise() unsupported or parameter invalid, please check arguments.\n"); 101 + 102 + /* The call should succeed and report the total bytes processed. */ 103 + ASSERT_EQ(ret, madvise_pages * pagesize); 104 + 105 + /* Check that advised pages are now zero. */ 106 + for (int i = 0; i < madvise_pages; i++) { 107 + char *advised_page = (char *)vec[i].iov_base; 108 + 109 + /* Content must be 0, not 'A'. */ 110 + ASSERT_EQ(*advised_page, '\0'); 111 + } 112 + 113 + /* Check that an un-advised page in between is still 'A'. */ 114 + char *unadvised_page = &map[1 * pagesize]; 115 + 116 + for (int i = 0; i < pagesize; i++) 117 + ASSERT_EQ(unadvised_page[i], 'A'); 118 + 119 + /* Cleanup. */ 120 + ASSERT_EQ(munmap(map, pagesize * 10), 0); 121 + } 122 + 123 + /* 124 + * This test deterministically validates process_madvise() with MADV_COLLAPSE 125 + * on a remote process, other advices are difficult to verify reliably. 
126 + * 127 + * The test verifies that a memory region in a child process, 128 + * focus on process_madv remote result, only check addresses and lengths. 129 + * The correctness of the MADV_COLLAPSE can be found in the relevant test examples in khugepaged. 130 + */ 131 + TEST_F(process_madvise, remote_collapse) 132 + { 133 + const unsigned long pagesize = self->page_size; 134 + long huge_page_size; 135 + int pipe_info[2]; 136 + ssize_t ret; 137 + struct iovec vec; 138 + 139 + struct child_info { 140 + pid_t pid; 141 + void *map_addr; 142 + } info; 143 + 144 + huge_page_size = read_pmd_pagesize(); 145 + if (huge_page_size <= 0) 146 + SKIP(return, "Could not determine a valid huge page size.\n"); 147 + 148 + ASSERT_EQ(pipe(pipe_info), 0); 149 + 150 + self->child_pid = fork(); 151 + ASSERT_NE(self->child_pid, -1); 152 + 153 + if (self->child_pid == 0) { 154 + char *map; 155 + size_t map_size = 2 * huge_page_size; 156 + 157 + close(pipe_info[0]); 158 + 159 + map = mmap(NULL, map_size, PROT_READ | PROT_WRITE, 160 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 161 + ASSERT_NE(map, MAP_FAILED); 162 + 163 + /* Fault in as small pages */ 164 + for (size_t i = 0; i < map_size; i += pagesize) 165 + map[i] = 'A'; 166 + 167 + /* Send info and pause */ 168 + info.pid = getpid(); 169 + info.map_addr = map; 170 + ret = write(pipe_info[1], &info, sizeof(info)); 171 + ASSERT_EQ(ret, sizeof(info)); 172 + close(pipe_info[1]); 173 + 174 + pause(); 175 + exit(0); 176 + } 177 + 178 + close(pipe_info[1]); 179 + 180 + /* Receive child info */ 181 + ret = read(pipe_info[0], &info, sizeof(info)); 182 + if (ret <= 0) { 183 + waitpid(self->child_pid, NULL, 0); 184 + SKIP(return, "Failed to read child info from pipe.\n"); 185 + } 186 + ASSERT_EQ(ret, sizeof(info)); 187 + close(pipe_info[0]); 188 + self->child_pid = info.pid; 189 + 190 + self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); 191 + ASSERT_GE(self->remote_pidfd, 0); 192 + 193 + vec.iov_base = info.map_addr; 194 + vec.iov_len = 
huge_page_size; 195 + 196 + ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_COLLAPSE, 197 + 0); 198 + if (ret == -1) { 199 + if (errno == EINVAL) 200 + SKIP(return, "PROCESS_MADV_ADVISE is not supported.\n"); 201 + else if (errno == EPERM) 202 + SKIP(return, 203 + "No process_madvise() permissions, try running as root.\n"); 204 + return; 205 + } 206 + 207 + ASSERT_EQ(ret, huge_page_size); 208 + } 209 + 210 + /* 211 + * Test process_madvise() with a pidfd for a process that has already 212 + * exited to ensure correct error handling. 213 + */ 214 + TEST_F(process_madvise, exited_process_pidfd) 215 + { 216 + const unsigned long pagesize = self->page_size; 217 + struct iovec vec; 218 + char *map; 219 + ssize_t ret; 220 + 221 + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 222 + 0); 223 + if (map == MAP_FAILED) 224 + SKIP(return, "mmap failed, not enough memory.\n"); 225 + 226 + vec.iov_base = map; 227 + vec.iov_len = pagesize; 228 + 229 + /* 230 + * Using a pidfd for a process that has already exited should fail 231 + * with ESRCH. 232 + */ 233 + self->child_pid = fork(); 234 + ASSERT_NE(self->child_pid, -1); 235 + 236 + if (self->child_pid == 0) 237 + exit(0); 238 + 239 + self->remote_pidfd = syscall(__NR_pidfd_open, self->child_pid, 0); 240 + ASSERT_GE(self->remote_pidfd, 0); 241 + 242 + /* Wait for the child to ensure it has terminated. */ 243 + waitpid(self->child_pid, NULL, 0); 244 + 245 + ret = sys_process_madvise(self->remote_pidfd, &vec, 1, MADV_DONTNEED, 246 + 0); 247 + ASSERT_EQ(ret, -1); 248 + ASSERT_EQ(errno, ESRCH); 249 + } 250 + 251 + /* 252 + * Test process_madvise() with bad pidfds to ensure correct error 253 + * handling. 
254 + */ 255 + TEST_F(process_madvise, bad_pidfd) 256 + { 257 + const unsigned long pagesize = self->page_size; 258 + struct iovec vec; 259 + char *map; 260 + ssize_t ret; 261 + 262 + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 263 + 0); 264 + if (map == MAP_FAILED) 265 + SKIP(return, "mmap failed, not enough memory.\n"); 266 + 267 + vec.iov_base = map; 268 + vec.iov_len = pagesize; 269 + 270 + /* Using an invalid fd number (-1) should fail with EBADF. */ 271 + ret = sys_process_madvise(-1, &vec, 1, MADV_DONTNEED, 0); 272 + ASSERT_EQ(ret, -1); 273 + ASSERT_EQ(errno, EBADF); 274 + 275 + /* 276 + * Using a valid fd that is not a pidfd (e.g. stdin) should fail 277 + * with EBADF. 278 + */ 279 + ret = sys_process_madvise(STDIN_FILENO, &vec, 1, MADV_DONTNEED, 0); 280 + ASSERT_EQ(ret, -1); 281 + ASSERT_EQ(errno, EBADF); 282 + } 283 + 284 + /* 285 + * Test that process_madvise() rejects vlen > UIO_MAXIOV. 286 + * The kernel should return -EINVAL when the number of iovecs exceeds 1024. 287 + */ 288 + TEST_F(process_madvise, invalid_vlen) 289 + { 290 + const unsigned long pagesize = self->page_size; 291 + int pidfd = self->pidfd; 292 + struct iovec vec; 293 + char *map; 294 + ssize_t ret; 295 + 296 + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 297 + 0); 298 + if (map == MAP_FAILED) 299 + SKIP(return, "mmap failed, not enough memory.\n"); 300 + 301 + vec.iov_base = map; 302 + vec.iov_len = pagesize; 303 + 304 + ret = sys_process_madvise(pidfd, &vec, 1025, MADV_DONTNEED, 0); 305 + ASSERT_EQ(ret, -1); 306 + ASSERT_EQ(errno, EINVAL); 307 + 308 + /* Cleanup. */ 309 + ASSERT_EQ(munmap(map, pagesize), 0); 310 + } 311 + 312 + /* 313 + * Test process_madvise() with an invalid flag value. Currently, only a flag 314 + * value of 0 is supported. This test is reserved for the future, e.g., if 315 + * synchronous flags are added. 
316 + */ 317 + TEST_F(process_madvise, flag) 318 + { 319 + const unsigned long pagesize = self->page_size; 320 + unsigned int invalid_flag; 321 + int pidfd = self->pidfd; 322 + struct iovec vec; 323 + char *map; 324 + ssize_t ret; 325 + 326 + map = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 327 + 0); 328 + if (map == MAP_FAILED) 329 + SKIP(return, "mmap failed, not enough memory.\n"); 330 + 331 + vec.iov_base = map; 332 + vec.iov_len = pagesize; 333 + 334 + invalid_flag = 0x80000000; 335 + 336 + ret = sys_process_madvise(pidfd, &vec, 1, MADV_DONTNEED, invalid_flag); 337 + ASSERT_EQ(ret, -1); 338 + ASSERT_EQ(errno, EINVAL); 339 + 340 + /* Cleanup. */ 341 + ASSERT_EQ(munmap(map, pagesize), 0); 342 + } 343 + 344 + TEST_HARNESS_MAIN
+5
tools/testing/selftests/mm/run_vmtests.sh
··· 65 65 test pagemap_scan IOCTL 66 66 - pfnmap 67 67 tests for VM_PFNMAP handling 68 + - process_madv 69 + test for process_madv 68 70 - cow 69 71 test copy-on-write semantics 70 72 - thp ··· 426 424 427 425 # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests 428 426 CATEGORY="madv_populate" run_test ./madv_populate 427 + 428 + # PROCESS_MADV test 429 + CATEGORY="process_madv" run_test ./process_madv 429 430 430 431 CATEGORY="vma_merge" run_test ./merge 431 432
+4 -2
tools/testing/vma/vma_internal.h
··· 108 108 #define CAP_IPC_LOCK 14 109 109 110 110 #ifdef CONFIG_64BIT 111 - /* VM is sealed, in vm_flags */ 112 - #define VM_SEALED _BITUL(63) 111 + #define VM_SEALED_BIT 42 112 + #define VM_SEALED BIT(VM_SEALED_BIT) 113 + #else 114 + #define VM_SEALED VM_NONE 113 115 #endif 114 116 115 117 #define FIRST_USER_ADDRESS 0UL