Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'mm-hotfixes-stable-2026-01-20-13-09' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:

- A patch series from David Hildenbrand which fixes a few things
related to hugetlb PMD sharing

- The remainder are singletons, please see their changelogs for details

* tag 'mm-hotfixes-stable-2026-01-20-13-09' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
mm: restore per-memcg proactive reclaim with !CONFIG_NUMA
mm/kfence: fix potential deadlock in reboot notifier
Docs/mm/allocation-profiling: describe sysctl limitations in debug mode
mm: do not copy page tables unnecessarily for VM_UFFD_WP
mm/hugetlb: fix excessive IPI broadcasts when unsharing PMD tables using mmu_gather
mm/rmap: fix two comments related to huge_pmd_unshare()
mm/hugetlb: fix two comments related to huge_pmd_unshare()
mm/hugetlb: fix hugetlb_pmd_shared()
mm: remove unnecessary and incorrect mmap lock assert
x86/kfence: avoid writing L1TF-vulnerable PTEs
mm/vma: do not leak memory when .mmap_prepare swaps the file
migrate: correct lock ordering for hugetlb file folios
panic: only warn about deprecated panic_print on write access
fs/writeback: skip AS_NO_DATA_INTEGRITY mappings in wait_sb_inodes()
mm: take into account mm_cid size for mm_struct static definitions
mm: rename cpu_bitmap field to flexible_array
mm: add missing static initializer for init_mm::mm_cid.lock

+341 -135
+4
Documentation/admin-guide/sysctl/vm.rst
··· 494 494 495 495 The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. 496 496 497 + When CONFIG_MEM_ALLOC_PROFILING_DEBUG=y, this control is read-only to avoid 498 + warnings produced by allocations made while profiling is disabled and freed 499 + when it's enabled. 500 + 497 501 498 502 memory_failure_early_kill 499 503 =========================
+10
Documentation/mm/allocation-profiling.rst
··· 33 33 sysctl: 34 34 /proc/sys/vm/mem_profiling 35 35 36 + 1: Enable memory profiling. 37 + 38 + 0: Disable memory profiling. 39 + 40 + The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. 41 + 42 + When CONFIG_MEM_ALLOC_PROFILING_DEBUG=y, this control is read-only to avoid 43 + warnings produced by allocations made while profiling is disabled and freed 44 + when it's enabled. 45 + 36 46 Runtime info: 37 47 /proc/allocinfo 38 48
+24 -5
arch/x86/include/asm/kfence.h
··· 42 42 { 43 43 unsigned int level; 44 44 pte_t *pte = lookup_address(addr, &level); 45 + pteval_t val; 45 46 46 47 if (WARN_ON(!pte || level != PG_LEVEL_4K)) 47 48 return false; 49 + 50 + val = pte_val(*pte); 51 + 52 + /* 53 + * protect requires making the page not-present. If the PTE is 54 + * already in the right state, there's nothing to do. 55 + */ 56 + if (protect != !!(val & _PAGE_PRESENT)) 57 + return true; 58 + 59 + /* 60 + * Otherwise, invert the entire PTE. This avoids writing out an 61 + * L1TF-vulnerable PTE (not present, without the high address bits 62 + * set). 63 + */ 64 + set_pte(pte, __pte(~val)); 65 + 66 + /* 67 + * If the page was protected (non-present) and we're making it 68 + * present, there is no need to flush the TLB at all. 69 + */ 70 + if (!protect) 71 + return true; 48 72 49 73 /* 50 74 * We need to avoid IPIs, as we may get KFENCE allocations or faults ··· 76 52 * does not flush TLBs on all CPUs. We can tolerate some inaccuracy; 77 53 * lazy fault handling takes care of faults after the page is PRESENT. 78 54 */ 79 - 80 - if (protect) 81 - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); 82 - else 83 - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); 84 55 85 56 /* 86 57 * Flush this CPU's TLB, assuming whoever did the allocation/free is
+1 -1
drivers/firmware/efi/efi.c
··· 74 74 .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), 75 75 .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), 76 76 .user_ns = &init_user_ns, 77 - .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, 78 77 #ifdef CONFIG_SCHED_MM_CID 79 78 .mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(efi_mm.mm_cid.lock), 80 79 #endif 80 + .flexible_array = MM_STRUCT_FLEXIBLE_ARRAY_INIT, 81 81 }; 82 82 83 83 struct workqueue_struct *efi_rts_wq;
+6 -1
fs/fs-writeback.c
··· 2750 2750 * The mapping can appear untagged while still on-list since we 2751 2751 * do not have the mapping lock. Skip it here, wb completion 2752 2752 * will remove it. 2753 + * 2754 + * If the mapping does not have data integrity semantics, 2755 + * there's no need to wait for the writeout to complete, as the 2756 + * mapping cannot guarantee that data is persistently stored. 2753 2757 */ 2754 - if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) 2758 + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK) || 2759 + mapping_no_data_integrity(mapping)) 2755 2760 continue; 2756 2761 2757 2762 spin_unlock_irq(&sb->s_inode_wblist_lock);
+3 -1
fs/fuse/file.c
··· 3200 3200 3201 3201 inode->i_fop = &fuse_file_operations; 3202 3202 inode->i_data.a_ops = &fuse_file_aops; 3203 - if (fc->writeback_cache) 3203 + if (fc->writeback_cache) { 3204 3204 mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data); 3205 + mapping_set_no_data_integrity(&inode->i_data); 3206 + } 3205 3207 3206 3208 INIT_LIST_HEAD(&fi->write_files); 3207 3209 INIT_LIST_HEAD(&fi->queued_writes);
+75 -2
include/asm-generic/tlb.h
··· 46 46 * 47 47 * The mmu_gather API consists of: 48 48 * 49 - * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu() 49 + * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_gather_mmu_vma() / 50 + * tlb_finish_mmu() 50 51 * 51 52 * start and finish a mmu_gather 52 53 * ··· 365 364 unsigned int vma_huge : 1; 366 365 unsigned int vma_pfn : 1; 367 366 367 + /* 368 + * Did we unshare (unmap) any shared page tables? For now only 369 + * used for hugetlb PMD table sharing. 370 + */ 371 + unsigned int unshared_tables : 1; 372 + 373 + /* 374 + * Did we unshare any page tables such that they are now exclusive 375 + * and could get reused+modified by the new owner? When setting this 376 + * flag, "unshared_tables" will be set as well. For now only used 377 + * for hugetlb PMD table sharing. 378 + */ 379 + unsigned int fully_unshared_tables : 1; 380 + 368 381 unsigned int batch_count; 369 382 370 383 #ifndef CONFIG_MMU_GATHER_NO_GATHER ··· 415 400 tlb->cleared_pmds = 0; 416 401 tlb->cleared_puds = 0; 417 402 tlb->cleared_p4ds = 0; 403 + tlb->unshared_tables = 0; 418 404 /* 419 405 * Do not reset mmu_gather::vma_* fields here, we do not 420 406 * call into tlb_start_vma() again to set them if there is an ··· 500 484 * these bits. 501 485 */ 502 486 if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || 503 - tlb->cleared_puds || tlb->cleared_p4ds)) 487 + tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables)) 504 488 return; 505 489 506 490 tlb_flush(tlb); ··· 788 772 return true; 789 773 } 790 774 #endif 775 + 776 + #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING 777 + static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt, 778 + unsigned long addr) 779 + { 780 + /* 781 + * The caller must make sure that concurrent unsharing + exclusive 782 + * reuse is impossible until tlb_flush_unshared_tables() was called. 
783 + */ 784 + VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt)); 785 + ptdesc_pmd_pts_dec(pt); 786 + 787 + /* Clearing a PUD pointing at a PMD table with PMD leaves. */ 788 + tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE); 789 + 790 + /* 791 + * If the page table is now exclusively owned, we fully unshared 792 + * a page table. 793 + */ 794 + if (!ptdesc_pmd_is_shared(pt)) 795 + tlb->fully_unshared_tables = true; 796 + tlb->unshared_tables = true; 797 + } 798 + 799 + static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb) 800 + { 801 + /* 802 + * As soon as the caller drops locks to allow for reuse of 803 + * previously-shared tables, these tables could get modified and 804 + * even reused outside of hugetlb context, so we have to make sure that 805 + * any page table walkers (incl. TLB, GUP-fast) are aware of that 806 + * change. 807 + * 808 + * Even if we are not fully unsharing a PMD table, we must 809 + * flush the TLB for the unsharer now. 810 + */ 811 + if (tlb->unshared_tables) 812 + tlb_flush_mmu_tlbonly(tlb); 813 + 814 + /* 815 + * Similarly, we must make sure that concurrent GUP-fast will not 816 + * walk previously-shared page tables that are getting modified+reused 817 + * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast. 818 + * 819 + * We only perform this when we are the last sharer of a page table, 820 + * as the IPI will reach all CPUs: any GUP-fast. 821 + * 822 + * Note that on configs where tlb_remove_table_sync_one() is a NOP, 823 + * the expectation is that the tlb_flush_mmu_tlbonly() would have issued 824 + * required IPIs already for us. 825 + */ 826 + if (tlb->fully_unshared_tables) { 827 + tlb_remove_table_sync_one(); 828 + tlb->fully_unshared_tables = false; 829 + } 830 + } 831 + #endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ 791 832 792 833 #endif /* CONFIG_MMU */ 793 834
+11 -6
include/linux/hugetlb.h
··· 240 240 pte_t *huge_pte_offset(struct mm_struct *mm, 241 241 unsigned long addr, unsigned long sz); 242 242 unsigned long hugetlb_mask_last_page(struct hstate *h); 243 - int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, 244 - unsigned long addr, pte_t *ptep); 243 + int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, 244 + unsigned long addr, pte_t *ptep); 245 + void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma); 245 246 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, 246 247 unsigned long *start, unsigned long *end); 247 248 ··· 301 300 return NULL; 302 301 } 303 302 304 - static inline int huge_pmd_unshare(struct mm_struct *mm, 305 - struct vm_area_struct *vma, 306 - unsigned long addr, pte_t *ptep) 303 + static inline int huge_pmd_unshare(struct mmu_gather *tlb, 304 + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) 307 305 { 308 306 return 0; 307 + } 308 + 309 + static inline void huge_pmd_unshare_flush(struct mmu_gather *tlb, 310 + struct vm_area_struct *vma) 311 + { 309 312 } 310 313 311 314 static inline void adjust_range_if_pmd_sharing_possible( ··· 1331 1326 #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING 1332 1327 static inline bool hugetlb_pmd_shared(pte_t *pte) 1333 1328 { 1334 - return page_count(virt_to_page(pte)) > 1; 1329 + return ptdesc_pmd_is_shared(virt_to_ptdesc(pte)); 1335 1330 } 1336 1331 #else 1337 1332 static inline bool hugetlb_pmd_shared(pte_t *pte)
+5 -1
include/linux/mm.h
··· 608 608 /* 609 609 * Flags which should result in page tables being copied on fork. These are 610 610 * flags which indicate that the VMA maps page tables which cannot be 611 - * reconsistuted upon page fault, so necessitate page table copying upon 611 + * reconsistuted upon page fault, so necessitate page table copying upon fork. 612 + * 613 + * Note that these flags should be compared with the DESTINATION VMA not the 614 + * source, as VM_UFFD_WP may not be propagated to destination, while all other 615 + * flags will be. 612 616 * 613 617 * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be 614 618 * reasonably reconstructed on page fault.
+14 -5
include/linux/mm_types.h
··· 1329 1329 * The mm_cpumask needs to be at the end of mm_struct, because it 1330 1330 * is dynamically sized based on nr_cpu_ids. 1331 1331 */ 1332 - unsigned long cpu_bitmap[]; 1332 + char flexible_array[] __aligned(__alignof__(unsigned long)); 1333 1333 }; 1334 1334 1335 1335 /* Copy value to the first system word of mm flags, non-atomically. */ ··· 1366 1366 MT_FLAGS_USE_RCU) 1367 1367 extern struct mm_struct init_mm; 1368 1368 1369 + #define MM_STRUCT_FLEXIBLE_ARRAY_INIT \ 1370 + { \ 1371 + [0 ... sizeof(cpumask_t) + MM_CID_STATIC_SIZE - 1] = 0 \ 1372 + } 1373 + 1369 1374 /* Pointer magic because the dynamic array size confuses some compilers. */ 1370 1375 static inline void mm_init_cpumask(struct mm_struct *mm) 1371 1376 { 1372 1377 unsigned long cpu_bitmap = (unsigned long)mm; 1373 1378 1374 - cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap); 1379 + cpu_bitmap += offsetof(struct mm_struct, flexible_array); 1375 1380 cpumask_clear((struct cpumask *)cpu_bitmap); 1376 1381 } 1377 1382 1378 1383 /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ 1379 1384 static inline cpumask_t *mm_cpumask(struct mm_struct *mm) 1380 1385 { 1381 - return (struct cpumask *)&mm->cpu_bitmap; 1386 + return (struct cpumask *)&mm->flexible_array; 1382 1387 } 1383 1388 1384 1389 #ifdef CONFIG_LRU_GEN ··· 1474 1469 { 1475 1470 unsigned long bitmap = (unsigned long)mm; 1476 1471 1477 - bitmap += offsetof(struct mm_struct, cpu_bitmap); 1472 + bitmap += offsetof(struct mm_struct, flexible_array); 1478 1473 /* Skip cpu_bitmap */ 1479 1474 bitmap += cpumask_size(); 1480 1475 return (struct cpumask *)bitmap; ··· 1500 1495 mm_init_cid(mm, p); 1501 1496 return 0; 1502 1497 } 1503 - #define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__)) 1498 + # define mm_alloc_cid(...) 
alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__)) 1504 1499 1505 1500 static inline void mm_destroy_cid(struct mm_struct *mm) 1506 1501 { ··· 1514 1509 return cpumask_size() + bitmap_size(num_possible_cpus()); 1515 1510 } 1516 1511 1512 + /* Use 2 * NR_CPUS as worse case for static allocation. */ 1513 + # define MM_CID_STATIC_SIZE (2 * sizeof(cpumask_t)) 1517 1514 #else /* CONFIG_SCHED_MM_CID */ 1518 1515 static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } 1519 1516 static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; } ··· 1524 1517 { 1525 1518 return 0; 1526 1519 } 1520 + # define MM_CID_STATIC_SIZE 0 1527 1521 #endif /* CONFIG_SCHED_MM_CID */ 1528 1522 1529 1523 struct mmu_gather; 1530 1524 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); 1531 1525 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); 1526 + void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma); 1532 1527 extern void tlb_finish_mmu(struct mmu_gather *tlb); 1533 1528 1534 1529 struct vm_fault;
+11
include/linux/pagemap.h
··· 210 210 AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, 211 211 AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't 212 212 account usage to user cgroups */ 213 + AS_NO_DATA_INTEGRITY = 11, /* no data integrity guarantees */ 213 214 /* Bits 16-25 are used for FOLIO_ORDER */ 214 215 AS_FOLIO_ORDER_BITS = 5, 215 216 AS_FOLIO_ORDER_MIN = 16, ··· 344 343 static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct address_space *mapping) 345 344 { 346 345 return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); 346 + } 347 + 348 + static inline void mapping_set_no_data_integrity(struct address_space *mapping) 349 + { 350 + set_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); 351 + } 352 + 353 + static inline bool mapping_no_data_integrity(const struct address_space *mapping) 354 + { 355 + return test_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); 347 356 } 348 357 349 358 static inline gfp_t mapping_gfp_mask(const struct address_space *mapping)
+2 -2
kernel/panic.c
··· 131 131 static int sysctl_panic_print_handler(const struct ctl_table *table, int write, 132 132 void *buffer, size_t *lenp, loff_t *ppos) 133 133 { 134 - panic_print_deprecated(); 134 + if (write) 135 + panic_print_deprecated(); 135 136 return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 136 137 } 137 138 ··· 1015 1014 1016 1015 static int panic_print_get(char *val, const struct kernel_param *kp) 1017 1016 { 1018 - panic_print_deprecated(); 1019 1017 return param_get_ulong(val, kp); 1020 1018 } 1021 1019
+72 -59
mm/hugetlb.c
··· 5112 5112 unsigned long last_addr_mask; 5113 5113 pte_t *src_pte, *dst_pte; 5114 5114 struct mmu_notifier_range range; 5115 - bool shared_pmd = false; 5115 + struct mmu_gather tlb; 5116 5116 5117 5117 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, 5118 5118 old_end); ··· 5122 5122 * range. 5123 5123 */ 5124 5124 flush_cache_range(vma, range.start, range.end); 5125 + tlb_gather_mmu_vma(&tlb, vma); 5125 5126 5126 5127 mmu_notifier_invalidate_range_start(&range); 5127 5128 last_addr_mask = hugetlb_mask_last_page(h); ··· 5139 5138 if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte))) 5140 5139 continue; 5141 5140 5142 - if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { 5143 - shared_pmd = true; 5141 + if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) { 5144 5142 old_addr |= last_addr_mask; 5145 5143 new_addr |= last_addr_mask; 5146 5144 continue; ··· 5150 5150 break; 5151 5151 5152 5152 move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); 5153 + tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr); 5153 5154 } 5154 5155 5155 - if (shared_pmd) 5156 - flush_hugetlb_tlb_range(vma, range.start, range.end); 5157 - else 5158 - flush_hugetlb_tlb_range(vma, old_end - len, old_end); 5156 + tlb_flush_mmu_tlbonly(&tlb); 5157 + huge_pmd_unshare_flush(&tlb, vma); 5158 + 5159 5159 mmu_notifier_invalidate_range_end(&range); 5160 5160 i_mmap_unlock_write(mapping); 5161 5161 hugetlb_vma_unlock_write(vma); 5162 + tlb_finish_mmu(&tlb); 5162 5163 5163 5164 return len + old_addr - old_end; 5164 5165 } ··· 5178 5177 unsigned long sz = huge_page_size(h); 5179 5178 bool adjust_reservation; 5180 5179 unsigned long last_addr_mask; 5181 - bool force_flush = false; 5182 5180 5183 5181 WARN_ON(!is_vm_hugetlb_page(vma)); 5184 5182 BUG_ON(start & ~huge_page_mask(h)); ··· 5200 5200 } 5201 5201 5202 5202 ptl = huge_pte_lock(h, mm, ptep); 5203 - if (huge_pmd_unshare(mm, vma, address, ptep)) { 5203 + if (huge_pmd_unshare(tlb, vma, address, ptep)) { 5204 5204 
spin_unlock(ptl); 5205 - tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); 5206 - force_flush = true; 5207 5205 address |= last_addr_mask; 5208 5206 continue; 5209 5207 } ··· 5317 5319 } 5318 5320 tlb_end_vma(tlb, vma); 5319 5321 5320 - /* 5321 - * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We 5322 - * could defer the flush until now, since by holding i_mmap_rwsem we 5323 - * guaranteed that the last reference would not be dropped. But we must 5324 - * do the flushing before we return, as otherwise i_mmap_rwsem will be 5325 - * dropped and the last reference to the shared PMDs page might be 5326 - * dropped as well. 5327 - * 5328 - * In theory we could defer the freeing of the PMD pages as well, but 5329 - * huge_pmd_unshare() relies on the exact page_count for the PMD page to 5330 - * detect sharing, so we cannot defer the release of the page either. 5331 - * Instead, do flush now. 5332 - */ 5333 - if (force_flush) 5334 - tlb_flush_mmu_tlbonly(tlb); 5322 + huge_pmd_unshare_flush(tlb, vma); 5335 5323 } 5336 5324 5337 5325 void __hugetlb_zap_begin(struct vm_area_struct *vma, ··· 6416 6432 pte_t pte; 6417 6433 struct hstate *h = hstate_vma(vma); 6418 6434 long pages = 0, psize = huge_page_size(h); 6419 - bool shared_pmd = false; 6420 6435 struct mmu_notifier_range range; 6421 6436 unsigned long last_addr_mask; 6422 6437 bool uffd_wp = cp_flags & MM_CP_UFFD_WP; 6423 6438 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; 6439 + struct mmu_gather tlb; 6424 6440 6425 6441 /* 6426 6442 * In the case of shared PMDs, the area to flush could be beyond ··· 6433 6449 6434 6450 BUG_ON(address >= end); 6435 6451 flush_cache_range(vma, range.start, range.end); 6452 + tlb_gather_mmu_vma(&tlb, vma); 6436 6453 6437 6454 mmu_notifier_invalidate_range_start(&range); 6438 6455 hugetlb_vma_lock_write(vma); ··· 6460 6475 } 6461 6476 } 6462 6477 ptl = huge_pte_lock(h, mm, ptep); 6463 - if (huge_pmd_unshare(mm, vma, address, ptep)) { 6478 + if 
(huge_pmd_unshare(&tlb, vma, address, ptep)) { 6464 6479 /* 6465 6480 * When uffd-wp is enabled on the vma, unshare 6466 6481 * shouldn't happen at all. Warn about it if it ··· 6469 6484 WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); 6470 6485 pages++; 6471 6486 spin_unlock(ptl); 6472 - shared_pmd = true; 6473 6487 address |= last_addr_mask; 6474 6488 continue; 6475 6489 } ··· 6529 6545 pte = huge_pte_clear_uffd_wp(pte); 6530 6546 huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); 6531 6547 pages++; 6548 + tlb_remove_huge_tlb_entry(h, &tlb, ptep, address); 6532 6549 } 6533 6550 6534 6551 next: 6535 6552 spin_unlock(ptl); 6536 6553 cond_resched(); 6537 6554 } 6538 - /* 6539 - * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare 6540 - * may have cleared our pud entry and done put_page on the page table: 6541 - * once we release i_mmap_rwsem, another task can do the final put_page 6542 - * and that page table be reused and filled with junk. If we actually 6543 - * did unshare a page of pmds, flush the range corresponding to the pud. 6544 - */ 6545 - if (shared_pmd) 6546 - flush_hugetlb_tlb_range(vma, range.start, range.end); 6547 - else 6548 - flush_hugetlb_tlb_range(vma, start, end); 6555 + 6556 + tlb_flush_mmu_tlbonly(&tlb); 6557 + huge_pmd_unshare_flush(&tlb, vma); 6549 6558 /* 6550 6559 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are 6551 6560 * downgrading page table protection not changing it to point to a new ··· 6549 6572 i_mmap_unlock_write(vma->vm_file->f_mapping); 6550 6573 hugetlb_vma_unlock_write(vma); 6551 6574 mmu_notifier_invalidate_range_end(&range); 6575 + tlb_finish_mmu(&tlb); 6552 6576 6553 6577 return pages > 0 ? (pages << h->order) : pages; 6554 6578 } ··· 6906 6928 return pte; 6907 6929 } 6908 6930 6909 - /* 6910 - * unmap huge page backed by shared pte. 6931 + /** 6932 + * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users 6933 + * @tlb: the current mmu_gather. 
6934 + * @vma: the vma covering the pmd table. 6935 + * @addr: the address we are trying to unshare. 6936 + * @ptep: pointer into the (pmd) page table. 6911 6937 * 6912 - * Called with page table lock held. 6938 + * Called with the page table lock held, the i_mmap_rwsem held in write mode 6939 + * and the hugetlb vma lock held in write mode. 6913 6940 * 6914 - * returns: 1 successfully unmapped a shared pte page 6915 - * 0 the underlying pte page is not shared, or it is the last user 6941 + * Note: The caller must call huge_pmd_unshare_flush() before dropping the 6942 + * i_mmap_rwsem. 6943 + * 6944 + * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it 6945 + * was not a shared PMD table. 6916 6946 */ 6917 - int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, 6918 - unsigned long addr, pte_t *ptep) 6947 + int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, 6948 + unsigned long addr, pte_t *ptep) 6919 6949 { 6920 6950 unsigned long sz = huge_page_size(hstate_vma(vma)); 6951 + struct mm_struct *mm = vma->vm_mm; 6921 6952 pgd_t *pgd = pgd_offset(mm, addr); 6922 6953 p4d_t *p4d = p4d_offset(pgd, addr); 6923 6954 pud_t *pud = pud_offset(p4d, addr); ··· 6938 6951 i_mmap_assert_write_locked(vma->vm_file->f_mapping); 6939 6952 hugetlb_vma_assert_locked(vma); 6940 6953 pud_clear(pud); 6941 - /* 6942 - * Once our caller drops the rmap lock, some other process might be 6943 - * using this page table as a normal, non-hugetlb page table. 6944 - * Wait for pending gup_fast() in other threads to finish before letting 6945 - * that happen. 6946 - */ 6947 - tlb_remove_table_sync_one(); 6948 - ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); 6954 + 6955 + tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); 6956 + 6949 6957 mm_dec_nr_pmds(mm); 6950 6958 return 1; 6959 + } 6960 + 6961 + /* 6962 + * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls 6963 + * @tlb: the current mmu_gather. 
6964 + * @vma: the vma covering the pmd table. 6965 + * 6966 + * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table 6967 + * unsharing with concurrent page table walkers. 6968 + * 6969 + * This function must be called after a sequence of huge_pmd_unshare() 6970 + * calls while still holding the i_mmap_rwsem. 6971 + */ 6972 + void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) 6973 + { 6974 + /* 6975 + * We must synchronize page table unsharing such that nobody will 6976 + * try reusing a previously-shared page table while it might still 6977 + * be in use by previous sharers (TLB, GUP_fast). 6978 + */ 6979 + i_mmap_assert_write_locked(vma->vm_file->f_mapping); 6980 + 6981 + tlb_flush_unshared_tables(tlb); 6951 6982 } 6952 6983 6953 6984 #else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ ··· 6976 6971 return NULL; 6977 6972 } 6978 6973 6979 - int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, 6980 - unsigned long addr, pte_t *ptep) 6974 + int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, 6975 + unsigned long addr, pte_t *ptep) 6981 6976 { 6982 6977 return 0; 6978 + } 6979 + 6980 + void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) 6981 + { 6983 6982 } 6984 6983 6985 6984 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, ··· 7252 7243 unsigned long sz = huge_page_size(h); 7253 7244 struct mm_struct *mm = vma->vm_mm; 7254 7245 struct mmu_notifier_range range; 7246 + struct mmu_gather tlb; 7255 7247 unsigned long address; 7256 7248 spinlock_t *ptl; 7257 7249 pte_t *ptep; ··· 7264 7254 return; 7265 7255 7266 7256 flush_cache_range(vma, start, end); 7257 + tlb_gather_mmu_vma(&tlb, vma); 7258 + 7267 7259 /* 7268 7260 * No need to call adjust_range_if_pmd_sharing_possible(), because 7269 7261 * we have already done the PUD_SIZE alignment. 
··· 7284 7272 if (!ptep) 7285 7273 continue; 7286 7274 ptl = huge_pte_lock(h, mm, ptep); 7287 - huge_pmd_unshare(mm, vma, address, ptep); 7275 + huge_pmd_unshare(&tlb, vma, address, ptep); 7288 7276 spin_unlock(ptl); 7289 7277 } 7290 - flush_hugetlb_tlb_range(vma, start, end); 7278 + huge_pmd_unshare_flush(&tlb, vma); 7291 7279 if (take_locks) { 7292 7280 i_mmap_unlock_write(vma->vm_file->f_mapping); 7293 7281 hugetlb_vma_unlock_write(vma); ··· 7297 7285 * Documentation/mm/mmu_notifier.rst. 7298 7286 */ 7299 7287 mmu_notifier_invalidate_range_end(&range); 7288 + tlb_finish_mmu(&tlb); 7300 7289 } 7301 7290 7302 7291 /*
+4 -1
mm/init-mm.c
··· 44 44 .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq), 45 45 #endif 46 46 .user_ns = &init_user_ns, 47 - .cpu_bitmap = CPU_BITS_NONE, 47 + #ifdef CONFIG_SCHED_MM_CID 48 + .mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(init_mm.mm_cid.lock), 49 + #endif 50 + .flexible_array = MM_STRUCT_FLEXIBLE_ARRAY_INIT, 48 51 INIT_MM_CONTEXT(init_mm) 49 52 }; 50 53
-8
mm/internal.h
··· 538 538 bool folio_isolate_lru(struct folio *folio); 539 539 void folio_putback_lru(struct folio *folio); 540 540 extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); 541 - #ifdef CONFIG_NUMA 542 541 int user_proactive_reclaim(char *buf, 543 542 struct mem_cgroup *memcg, pg_data_t *pgdat); 544 - #else 545 - static inline int user_proactive_reclaim(char *buf, 546 - struct mem_cgroup *memcg, pg_data_t *pgdat) 547 - { 548 - return 0; 549 - } 550 - #endif 551 543 552 544 /* 553 545 * in mm/rmap.c:
+12 -5
mm/kfence/core.c
··· 823 823 static struct delayed_work kfence_timer; 824 824 825 825 #ifdef CONFIG_KFENCE_STATIC_KEYS 826 + /* Wait queue to wake up allocation-gate timer task. */ 827 + static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); 828 + 826 829 static int kfence_reboot_callback(struct notifier_block *nb, 827 830 unsigned long action, void *data) 828 831 { ··· 835 832 */ 836 833 WRITE_ONCE(kfence_enabled, false); 837 834 /* Cancel any pending timer work */ 838 - cancel_delayed_work_sync(&kfence_timer); 835 + cancel_delayed_work(&kfence_timer); 836 + /* 837 + * Wake up any blocked toggle_allocation_gate() so it can complete 838 + * early while the system is still able to handle IPIs. 839 + */ 840 + wake_up(&allocation_wait); 839 841 840 842 return NOTIFY_OK; 841 843 } ··· 849 841 .notifier_call = kfence_reboot_callback, 850 842 .priority = INT_MAX, /* Run early to stop timers ASAP */ 851 843 }; 852 - 853 - /* Wait queue to wake up allocation-gate timer task. */ 854 - static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); 855 844 856 845 static void wake_up_kfence_timer(struct irq_work *work) 857 846 { ··· 878 873 /* Enable static key, and await allocation to happen. */ 879 874 static_branch_enable(&kfence_allocation_key); 880 875 881 - wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate) > 0); 876 + wait_event_idle(allocation_wait, 877 + atomic_read(&kfence_allocation_gate) > 0 || 878 + !READ_ONCE(kfence_enabled)); 882 879 883 880 /* Disable static key and reset timer. */ 884 881 static_branch_disable(&kfence_allocation_key);
+7 -4
mm/memory.c
··· 1465 1465 static bool 1466 1466 vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) 1467 1467 { 1468 - if (src_vma->vm_flags & VM_COPY_ON_FORK) 1468 + /* 1469 + * We check against dst_vma as while sane VMA flags will have been 1470 + * copied, VM_UFFD_WP may be set only on dst_vma. 1471 + */ 1472 + if (dst_vma->vm_flags & VM_COPY_ON_FORK) 1469 1473 return true; 1470 1474 /* 1471 1475 * The presence of an anon_vma indicates an anonymous VMA has page ··· 1967 1963 do { 1968 1964 next = pud_addr_end(addr, end); 1969 1965 if (pud_trans_huge(*pud)) { 1970 - if (next - addr != HPAGE_PUD_SIZE) { 1971 - mmap_assert_locked(tlb->mm); 1966 + if (next - addr != HPAGE_PUD_SIZE) 1972 1967 split_huge_pud(vma, pud, addr); 1973 - } else if (zap_huge_pud(tlb, vma, pud, addr)) 1968 + else if (zap_huge_pud(tlb, vma, pud, addr)) 1974 1969 goto next; 1975 1970 /* fall through */ 1976 1971 }
+6 -6
mm/migrate.c
··· 1458 1458 int page_was_mapped = 0; 1459 1459 struct anon_vma *anon_vma = NULL; 1460 1460 struct address_space *mapping = NULL; 1461 + enum ttu_flags ttu = 0; 1461 1462 1462 1463 if (folio_ref_count(src) == 1) { 1463 1464 /* page was freed from under us. So we are done. */ ··· 1499 1498 goto put_anon; 1500 1499 1501 1500 if (folio_mapped(src)) { 1502 - enum ttu_flags ttu = 0; 1503 - 1504 1501 if (!folio_test_anon(src)) { 1505 1502 /* 1506 1503 * In shared mappings, try_to_unmap could potentially ··· 1515 1516 1516 1517 try_to_migrate(src, ttu); 1517 1518 page_was_mapped = 1; 1518 - 1519 - if (ttu & TTU_RMAP_LOCKED) 1520 - i_mmap_unlock_write(mapping); 1521 1519 } 1522 1520 1523 1521 if (!folio_mapped(src)) 1524 1522 rc = move_to_new_folio(dst, src, mode); 1525 1523 1526 1524 if (page_was_mapped) 1527 - remove_migration_ptes(src, !rc ? dst : src, 0); 1525 + remove_migration_ptes(src, !rc ? dst : src, 1526 + ttu ? RMP_LOCKED : 0); 1527 + 1528 + if (ttu & TTU_RMAP_LOCKED) 1529 + i_mmap_unlock_write(mapping); 1528 1530 1529 1531 unlock_put_anon: 1530 1532 folio_unlock(dst);
+33
mm/mmu_gather.c
··· 10 10 #include <linux/swap.h> 11 11 #include <linux/rmap.h> 12 12 #include <linux/pgalloc.h> 13 + #include <linux/hugetlb.h> 13 14 14 15 #include <asm/tlb.h> 15 16 ··· 427 426 #endif 428 427 tlb->vma_pfn = 0; 429 428 429 + tlb->fully_unshared_tables = 0; 430 430 __tlb_reset_range(tlb); 431 431 inc_tlb_flush_pending(tlb->mm); 432 432 } ··· 462 460 } 463 461 464 462 /** 463 + * tlb_gather_mmu_vma - initialize an mmu_gather structure for operating on a 464 + * single VMA 465 + * @tlb: the mmu_gather structure to initialize 466 + * @vma: the vm_area_struct 467 + * 468 + * Called to initialize an (on-stack) mmu_gather structure for operating on 469 + * a single VMA. In contrast to tlb_gather_mmu(), calling this function will 470 + * not require another call to tlb_start_vma(). In contrast to tlb_start_vma(), 471 + * this function will *not* call flush_cache_range(). 472 + * 473 + * For hugetlb VMAs, this function will also initialize the mmu_gather 474 + * page_size accordingly, not requiring a separate call to 475 + * tlb_change_page_size(). 476 + * 477 + */ 478 + void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) 479 + { 480 + tlb_gather_mmu(tlb, vma->vm_mm); 481 + tlb_update_vma_flags(tlb, vma); 482 + if (is_vm_hugetlb_page(vma)) 483 + /* All entries have the same size. */ 484 + tlb_change_page_size(tlb, huge_page_size(hstate_vma(vma))); 485 + } 486 + 487 + /** 465 488 * tlb_finish_mmu - finish an mmu_gather structure 466 489 * @tlb: the mmu_gather structure to finish 467 490 * ··· 495 468 */ 496 469 void tlb_finish_mmu(struct mmu_gather *tlb) 497 470 { 471 + /* 472 + * We expect an earlier huge_pmd_unshare_flush() call to sort this out, 473 + * due to complicated locking requirements with page table unsharing. 474 + */ 475 + VM_WARN_ON_ONCE(tlb->fully_unshared_tables); 476 + 498 477 /* 499 478 * If there are parallel threads are doing PTE changes on same range 500 479 * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
+21 -24
mm/rmap.c
··· 76 76 #include <linux/mm_inline.h> 77 77 #include <linux/oom.h> 78 78 79 - #include <asm/tlbflush.h> 79 + #include <asm/tlb.h> 80 80 81 81 #define CREATE_TRACE_POINTS 82 82 #include <trace/events/migrate.h> ··· 2008 2008 * if unsuccessful. 2009 2009 */ 2010 2010 if (!anon) { 2011 + struct mmu_gather tlb; 2012 + 2011 2013 VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); 2012 2014 if (!hugetlb_vma_trylock_write(vma)) 2013 2015 goto walk_abort; 2014 - if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { 2016 + 2017 + tlb_gather_mmu_vma(&tlb, vma); 2018 + if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { 2015 2019 hugetlb_vma_unlock_write(vma); 2016 - flush_tlb_range(vma, 2017 - range.start, range.end); 2020 + huge_pmd_unshare_flush(&tlb, vma); 2021 + tlb_finish_mmu(&tlb); 2018 2022 /* 2019 - * The ref count of the PMD page was 2020 - * dropped which is part of the way map 2021 - * counting is done for shared PMDs. 2022 - * Return 'true' here. When there is 2023 - * no other sharing, huge_pmd_unshare 2024 - * returns false and we will unmap the 2025 - * actual page and drop map count 2026 - * to zero. 2023 + * The PMD table was unmapped, 2024 + * consequently unmapping the folio. 2027 2025 */ 2028 2026 goto walk_done; 2029 2027 } 2030 2028 hugetlb_vma_unlock_write(vma); 2029 + tlb_finish_mmu(&tlb); 2031 2030 } 2032 2031 pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); 2033 2032 if (pte_dirty(pteval)) ··· 2403 2404 * fail if unsuccessful. 
2404 2405 */ 2405 2406 if (!anon) { 2407 + struct mmu_gather tlb; 2408 + 2406 2409 VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); 2407 2410 if (!hugetlb_vma_trylock_write(vma)) { 2408 2411 page_vma_mapped_walk_done(&pvmw); 2409 2412 ret = false; 2410 2413 break; 2411 2414 } 2412 - if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { 2413 - hugetlb_vma_unlock_write(vma); 2414 - flush_tlb_range(vma, 2415 - range.start, range.end); 2416 2415 2416 + tlb_gather_mmu_vma(&tlb, vma); 2417 + if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { 2418 + hugetlb_vma_unlock_write(vma); 2419 + huge_pmd_unshare_flush(&tlb, vma); 2420 + tlb_finish_mmu(&tlb); 2417 2421 /* 2418 - * The ref count of the PMD page was 2419 - * dropped which is part of the way map 2420 - * counting is done for shared PMDs. 2421 - * Return 'true' here. When there is 2422 - * no other sharing, huge_pmd_unshare 2423 - * returns false and we will unmap the 2424 - * actual page and drop map count 2425 - * to zero. 2422 + * The PMD table was unmapped, 2423 + * consequently unmapping the folio. 2426 2424 */ 2427 2425 page_vma_mapped_walk_done(&pvmw); 2428 2426 break; 2429 2427 } 2430 2428 hugetlb_vma_unlock_write(vma); 2429 + tlb_finish_mmu(&tlb); 2431 2430 } 2432 2431 /* Nuke the hugetlb page table entry */ 2433 2432 pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
+9 -2
mm/vma.c
··· 37 37 bool check_ksm_early :1; 38 38 /* If we map new, hold the file rmap lock on mapping. */ 39 39 bool hold_file_rmap_lock :1; 40 + /* If .mmap_prepare changed the file, we don't need to pin. */ 41 + bool file_doesnt_need_get :1; 40 42 }; 41 43 42 44 #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ ··· 2452 2450 struct vma_iterator *vmi = map->vmi; 2453 2451 int error; 2454 2452 2455 - vma->vm_file = get_file(map->file); 2453 + vma->vm_file = map->file; 2454 + if (!map->file_doesnt_need_get) 2455 + get_file(map->file); 2456 2456 2457 2457 if (!map->file->f_op->mmap) 2458 2458 return 0; ··· 2642 2638 2643 2639 /* Update fields permitted to be changed. */ 2644 2640 map->pgoff = desc->pgoff; 2645 - map->file = desc->vm_file; 2641 + if (desc->vm_file != map->file) { 2642 + map->file_doesnt_need_get = true; 2643 + map->file = desc->vm_file; 2644 + } 2646 2645 map->vm_flags = desc->vm_flags; 2647 2646 map->page_prot = desc->page_prot; 2648 2647 /* User-defined fields. */
+11 -2
mm/vmscan.c
··· 7707 7707 return ret; 7708 7708 } 7709 7709 7710 + #else 7711 + 7712 + static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, 7713 + unsigned long nr_pages, 7714 + struct scan_control *sc) 7715 + { 7716 + return 0; 7717 + } 7718 + 7719 + #endif 7720 + 7710 7721 enum { 7711 7722 MEMORY_RECLAIM_SWAPPINESS = 0, 7712 7723 MEMORY_RECLAIM_SWAPPINESS_MAX, ··· 7824 7813 7825 7814 return 0; 7826 7815 } 7827 - 7828 - #endif 7829 7816 7830 7817 /** 7831 7818 * check_move_unevictable_folios - Move evictable folios to appropriate zone