Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'mm-hotfixes-stable' into mm-nonmm-stable in order to be able to merge "kho: make debugfs interface optional" into mm-nonmm-stable.

+531 -199
+3
.mailmap
··· 206 206 David Brownell <david-b@pacbell.net> 207 207 David Collins <quic_collinsd@quicinc.com> <collinsd@codeaurora.org> 208 208 David Heidelberg <david@ixit.cz> <d.okias@gmail.com> 209 + David Hildenbrand <david@kernel.org> <david@redhat.com> 209 210 David Rheinsberg <david@readahead.eu> <dh.herrmann@gmail.com> 210 211 David Rheinsberg <david@readahead.eu> <dh.herrmann@googlemail.com> 211 212 David Rheinsberg <david@readahead.eu> <david.rheinsberg@gmail.com> ··· 688 687 Sachin P Sant <ssant@in.ibm.com> 689 688 Sai Prakash Ranjan <quic_saipraka@quicinc.com> <saiprakash.ranjan@codeaurora.org> 690 689 Sakari Ailus <sakari.ailus@linux.intel.com> <sakari.ailus@iki.fi> 690 + Sam Protsenko <semen.protsenko@linaro.org> 691 + Sam Protsenko <semen.protsenko@linaro.org> <semen.protsenko@globallogic.com> 691 692 Sam Ravnborg <sam@mars.ravnborg.org> 692 693 Sankeerth Billakanti <quic_sbillaka@quicinc.com> <sbillaka@codeaurora.org> 693 694 Santosh Shilimkar <santosh.shilimkar@oracle.org>
+17 -16
MAINTAINERS
··· 11526 11526 HUGETLB SUBSYSTEM 11527 11527 M: Muchun Song <muchun.song@linux.dev> 11528 11528 M: Oscar Salvador <osalvador@suse.de> 11529 - R: David Hildenbrand <david@redhat.com> 11529 + R: David Hildenbrand <david@kernel.org> 11530 11530 L: linux-mm@kvack.org 11531 11531 S: Maintained 11532 11532 F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages ··· 13734 13734 M: Christian Borntraeger <borntraeger@linux.ibm.com> 13735 13735 M: Janosch Frank <frankja@linux.ibm.com> 13736 13736 M: Claudio Imbrenda <imbrenda@linux.ibm.com> 13737 - R: David Hildenbrand <david@redhat.com> 13737 + R: David Hildenbrand <david@kernel.org> 13738 13738 L: kvm@vger.kernel.org 13739 13739 S: Supported 13740 13740 T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git ··· 13800 13800 F: Documentation/core-api/kho/* 13801 13801 F: include/linux/kexec_handover.h 13802 13802 F: kernel/kexec_handover.c 13803 + F: lib/test_kho.c 13803 13804 F: tools/testing/selftests/kho/ 13804 13805 13805 13806 KEYS-ENCRYPTED ··· 16223 16222 F: drivers/devfreq/tegra30-devfreq.c 16224 16223 16225 16224 MEMORY HOT(UN)PLUG 16226 - M: David Hildenbrand <david@redhat.com> 16225 + M: David Hildenbrand <david@kernel.org> 16227 16226 M: Oscar Salvador <osalvador@suse.de> 16228 16227 L: linux-mm@kvack.org 16229 16228 S: Maintained ··· 16248 16247 16249 16248 MEMORY MANAGEMENT - CORE 16250 16249 M: Andrew Morton <akpm@linux-foundation.org> 16251 - M: David Hildenbrand <david@redhat.com> 16250 + M: David Hildenbrand <david@kernel.org> 16252 16251 R: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> 16253 16252 R: Liam R. Howlett <Liam.Howlett@oracle.com> 16254 16253 R: Vlastimil Babka <vbabka@suse.cz> ··· 16304 16303 16305 16304 MEMORY MANAGEMENT - GUP (GET USER PAGES) 16306 16305 M: Andrew Morton <akpm@linux-foundation.org> 16307 - M: David Hildenbrand <david@redhat.com> 16306 + M: David Hildenbrand <david@kernel.org> 16308 16307 R: Jason Gunthorpe <jgg@nvidia.com> 16309 16308 R: John Hubbard <jhubbard@nvidia.com> 16310 16309 R: Peter Xu <peterx@redhat.com> ··· 16320 16319 16321 16320 MEMORY MANAGEMENT - KSM (Kernel Samepage Merging) 16322 16321 M: Andrew Morton <akpm@linux-foundation.org> 16323 - M: David Hildenbrand <david@redhat.com> 16322 + M: David Hildenbrand <david@kernel.org> 16324 16323 R: Xu Xin <xu.xin16@zte.com.cn> 16325 16324 R: Chengming Zhou <chengming.zhou@linux.dev> 16326 16325 L: linux-mm@kvack.org ··· 16336 16335 16337 16336 MEMORY MANAGEMENT - MEMORY POLICY AND MIGRATION 16338 16337 M: Andrew Morton <akpm@linux-foundation.org> 16339 - M: David Hildenbrand <david@redhat.com> 16338 + M: David Hildenbrand <david@kernel.org> 16340 16339 R: Zi Yan <ziy@nvidia.com> 16341 16340 R: Matthew Brost <matthew.brost@intel.com> 16342 16341 R: Joshua Hahn <joshua.hahnjy@gmail.com> ··· 16376 16375 16377 16376 MEMORY MANAGEMENT - MISC 16378 16377 M: Andrew Morton <akpm@linux-foundation.org> 16379 - M: David Hildenbrand <david@redhat.com> 16378 + M: David Hildenbrand <david@kernel.org> 16380 16379 R: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> 16381 16380 R: Liam R. 
Howlett <Liam.Howlett@oracle.com> 16382 16381 R: Vlastimil Babka <vbabka@suse.cz> ··· 16464 16463 MEMORY MANAGEMENT - RECLAIM 16465 16464 M: Andrew Morton <akpm@linux-foundation.org> 16466 16465 M: Johannes Weiner <hannes@cmpxchg.org> 16467 - R: David Hildenbrand <david@redhat.com> 16466 + R: David Hildenbrand <david@kernel.org> 16468 16467 R: Michal Hocko <mhocko@kernel.org> 16469 16468 R: Qi Zheng <zhengqi.arch@bytedance.com> 16470 16469 R: Shakeel Butt <shakeel.butt@linux.dev> ··· 16477 16476 16478 16477 MEMORY MANAGEMENT - RMAP (REVERSE MAPPING) 16479 16478 M: Andrew Morton <akpm@linux-foundation.org> 16480 - M: David Hildenbrand <david@redhat.com> 16479 + M: David Hildenbrand <david@kernel.org> 16481 16480 M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> 16482 16481 R: Rik van Riel <riel@surriel.com> 16483 16482 R: Liam R. Howlett <Liam.Howlett@oracle.com> ··· 16501 16500 16502 16501 MEMORY MANAGEMENT - SWAP 16503 16502 M: Andrew Morton <akpm@linux-foundation.org> 16503 + M: Chris Li <chrisl@kernel.org> 16504 + M: Kairui Song <kasong@tencent.com> 16504 16505 R: Kemeng Shi <shikemeng@huaweicloud.com> 16505 - R: Kairui Song <kasong@tencent.com> 16506 16506 R: Nhat Pham <nphamcs@gmail.com> 16507 16507 R: Baoquan He <bhe@redhat.com> 16508 16508 R: Barry Song <baohua@kernel.org> 16509 - R: Chris Li <chrisl@kernel.org> 16510 16509 L: linux-mm@kvack.org 16511 16510 S: Maintained 16512 16511 F: Documentation/mm/swap-table.rst ··· 16522 16521 16523 16522 MEMORY MANAGEMENT - THP (TRANSPARENT HUGE PAGE) 16524 16523 M: Andrew Morton <akpm@linux-foundation.org> 16525 - M: David Hildenbrand <david@redhat.com> 16524 + M: David Hildenbrand <david@kernel.org> 16526 16525 M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> 16527 16526 R: Zi Yan <ziy@nvidia.com> 16528 16527 R: Baolin Wang <baolin.wang@linux.alibaba.com> ··· 16624 16623 M: Andrew Morton <akpm@linux-foundation.org> 16625 16624 M: Liam R. Howlett <Liam.Howlett@oracle.com> 16626 16625 M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> 16627 - M: David Hildenbrand <david@redhat.com> 16626 + M: David Hildenbrand <david@kernel.org> 16628 16627 R: Vlastimil Babka <vbabka@suse.cz> 16629 16628 R: Jann Horn <jannh@google.com> 16630 16629 L: linux-mm@kvack.org ··· 27091 27090 27092 27091 VIRTIO BALLOON 27093 27092 M: "Michael S. Tsirkin" <mst@redhat.com> 27094 - M: David Hildenbrand <david@redhat.com> 27093 + M: David Hildenbrand <david@kernel.org> 27095 27094 L: virtualization@lists.linux.dev 27096 27095 S: Maintained 27097 27096 F: drivers/virtio/virtio_balloon.c ··· 27246 27245 F: include/uapi/linux/virtio_iommu.h 27247 27246 27248 27247 VIRTIO MEM DRIVER 27249 - M: David Hildenbrand <david@redhat.com> 27248 + M: David Hildenbrand <david@kernel.org> 27250 27249 L: virtualization@lists.linux.dev 27251 27250 S: Maintained 27252 27251 W: https://virtio-mem.gitlab.io/
+2 -1
arch/arm64/kernel/mte.c
··· 476 476 477 477 folio = page_folio(page); 478 478 if (folio_test_hugetlb(folio)) 479 - WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio)); 479 + WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio) && 480 + !is_huge_zero_folio(folio)); 480 481 else 481 482 WARN_ON_ONCE(!page_mte_tagged(page) && !is_zero_page(page)); 482 483
+10
arch/arm64/mm/fault.c
··· 969 969 970 970 void tag_clear_highpage(struct page *page) 971 971 { 972 + /* 973 + * Check if MTE is supported and fall back to clear_highpage(). 974 + * get_huge_zero_folio() unconditionally passes __GFP_ZEROTAGS and 975 + * post_alloc_hook() will invoke tag_clear_highpage(). 976 + */ 977 + if (!system_supports_mte()) { 978 + clear_highpage(page); 979 + return; 980 + } 981 + 972 982 /* Newly allocated page, shouldn't have been tagged yet */ 973 983 WARN_ON_ONCE(!try_page_mte_tagging(page)); 974 984 mte_zero_clear_page_tags(page_address(page));
+1
arch/powerpc/Kconfig
··· 137 137 select ARCH_HAS_DMA_OPS if PPC64 138 138 select ARCH_HAS_FORTIFY_SOURCE 139 139 select ARCH_HAS_GCOV_PROFILE_ALL 140 + select ARCH_HAS_GIGANTIC_PAGE if ARCH_SUPPORTS_HUGETLBFS 140 141 select ARCH_HAS_KCOV 141 142 select ARCH_HAS_KERNEL_FPU_SUPPORT if PPC64 && PPC_FPU 142 143 select ARCH_HAS_MEMBARRIER_CALLBACKS
-1
arch/powerpc/platforms/Kconfig.cputype
··· 423 423 config PPC_RADIX_MMU 424 424 bool "Radix MMU Support" 425 425 depends on PPC_BOOK3S_64 426 - select ARCH_HAS_GIGANTIC_PAGE 427 426 default y 428 427 help 429 428 Enable support for the Power ISA 3.0 Radix style MMU. Currently this
+6 -1
fs/nilfs2/segment.c
··· 2768 2768 2769 2769 if (sci->sc_task) { 2770 2770 wake_up(&sci->sc_wait_daemon); 2771 - kthread_stop(sci->sc_task); 2771 + if (kthread_stop(sci->sc_task)) { 2772 + spin_lock(&sci->sc_state_lock); 2773 + sci->sc_task = NULL; 2774 + timer_shutdown_sync(&sci->sc_timer); 2775 + spin_unlock(&sci->sc_state_lock); 2776 + } 2772 2777 } 2773 2778 2774 2779 spin_lock(&sci->sc_state_lock);
+9 -3
fs/proc/generic.c
··· 698 698 } 699 699 } 700 700 701 + static void pde_erase(struct proc_dir_entry *pde, struct proc_dir_entry *parent) 702 + { 703 + rb_erase(&pde->subdir_node, &parent->subdir); 704 + RB_CLEAR_NODE(&pde->subdir_node); 705 + } 706 + 701 707 /* 702 708 * Remove a /proc entry and free it if it's not currently in use. 703 709 */ ··· 726 720 WARN(1, "removing permanent /proc entry '%s'", de->name); 727 721 de = NULL; 728 722 } else { 729 - rb_erase(&de->subdir_node, &parent->subdir); 723 + pde_erase(de, parent); 730 724 if (S_ISDIR(de->mode)) 731 725 parent->nlink--; 732 726 } ··· 770 764 root->parent->name, root->name); 771 765 return -EINVAL; 772 766 } 773 - rb_erase(&root->subdir_node, &parent->subdir); 767 + pde_erase(root, parent); 774 768 775 769 de = root; 776 770 while (1) { ··· 782 776 next->parent->name, next->name); 783 777 return -EINVAL; 784 778 } 785 - rb_erase(&next->subdir_node, &de->subdir); 779 + pde_erase(next, de); 786 780 de = next; 787 781 continue; 788 782 }
+3
include/linux/gfp.h
··· 7 7 #include <linux/mmzone.h> 8 8 #include <linux/topology.h> 9 9 #include <linux/alloc_tag.h> 10 + #include <linux/cleanup.h> 10 11 #include <linux/sched.h> 11 12 12 13 struct vm_area_struct; ··· 463 462 #endif 464 463 /* This should be paired with folio_put() rather than free_contig_range(). */ 465 464 #define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) 465 + 466 + DEFINE_FREE(free_page, void *, free_page((unsigned long)_T)) 466 467 467 468 #endif /* __LINUX_GFP_H */
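The new DEFINE_FREE(free_page, ...) declaration hooks free_page() into the cleanup.h scope-based guard machinery: a pointer annotated with __free(free_page) is released automatically when it goes out of scope, and no_free_ptr() hands ownership out of the guarded scope. A minimal kernel-style sketch of the pattern the kexec_handover.c hunks below rely on; alloc_tracked_page() and its xarray are illustrative only, not part of this series:

static void *alloc_tracked_page(struct xarray *xa, unsigned long index)
{
	/* Freed automatically by the free_page guard if we bail out early. */
	void *page __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL);

	if (!page)
		return ERR_PTR(-ENOMEM);

	if (xa_err(xa_store(xa, index, page, GFP_KERNEL)))
		return ERR_PTR(-ENOMEM);	/* guard frees the page here */

	return no_free_ptr(page);		/* success: transfer ownership */
}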
+23 -32
include/linux/huge_mm.h
··· 376 376 int folio_split(struct folio *folio, unsigned int new_order, struct page *page, 377 377 struct list_head *list); 378 378 /* 379 - * try_folio_split - try to split a @folio at @page using non uniform split. 379 + * try_folio_split_to_order - try to split a @folio at @page to @new_order using 380 + * non uniform split. 380 381 * @folio: folio to be split 381 - * @page: split to order-0 at the given page 382 - * @list: store the after-split folios 382 + * @page: split to @new_order at the given page 383 + * @new_order: the target split order 383 384 * 384 - * Try to split a @folio at @page using non uniform split to order-0, if 385 - * non uniform split is not supported, fall back to uniform split. 385 + * Try to split a @folio at @page using non uniform split to @new_order, if 386 + * non uniform split is not supported, fall back to uniform split. After-split 387 + * folios are put back to LRU list. Use min_order_for_split() to get the lower 388 + * bound of @new_order. 386 389 * 387 390 * Return: 0: split is successful, otherwise split failed. 388 391 */ 389 - static inline int try_folio_split(struct folio *folio, struct page *page, 390 - struct list_head *list) 392 + static inline int try_folio_split_to_order(struct folio *folio, 393 + struct page *page, unsigned int new_order) 391 394 { 392 - int ret = min_order_for_split(folio); 393 - 394 - if (ret < 0) 395 - return ret; 396 - 397 - if (!non_uniform_split_supported(folio, 0, false)) 398 - return split_huge_page_to_list_to_order(&folio->page, list, 399 - ret); 400 - return folio_split(folio, ret, page, list); 395 + if (!non_uniform_split_supported(folio, new_order, /* warns= */ false)) 396 + return split_huge_page_to_list_to_order(&folio->page, NULL, 397 + new_order); 398 + return folio_split(folio, new_order, page, NULL); 401 399 } 402 400 static inline int split_huge_page(struct page *page) 403 401 { 404 - struct folio *folio = page_folio(page); 405 - int ret = min_order_for_split(folio); 406 - 407 - if (ret < 0) 408 - return ret; 409 - 410 - /* 411 - * split_huge_page() locks the page before splitting and 412 - * expects the same page that has been split to be locked when 413 - * returned. split_folio(page_folio(page)) cannot be used here 414 - * because it converts the page to folio and passes the head 415 - * page to be split. 416 - */ 417 - return split_huge_page_to_list_to_order(page, NULL, ret); 402 + return split_huge_page_to_list_to_order(page, NULL, 0); 418 403 } 419 404 void deferred_split_folio(struct folio *folio, bool partially_mapped); 420 405 ··· 582 597 return -EINVAL; 583 598 } 584 599 600 + static inline int min_order_for_split(struct folio *folio) 601 + { 602 + VM_WARN_ON_ONCE_FOLIO(1, folio); 603 + return -EINVAL; 604 + } 605 + 585 606 static inline int split_folio_to_list(struct folio *folio, struct list_head *list) 586 607 { 587 608 VM_WARN_ON_ONCE_FOLIO(1, folio); 588 609 return -EINVAL; 589 610 } 590 611 591 - static inline int try_folio_split(struct folio *folio, struct page *page, 592 - struct list_head *list) 612 + static inline int try_folio_split_to_order(struct folio *folio, 613 + struct page *page, unsigned int new_order) 593 614 { 594 615 VM_WARN_ON_ONCE_FOLIO(1, folio); 595 616 return -EINVAL;
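With this change try_folio_split_to_order() takes the target order explicitly instead of deriving it internally, so callers are expected to compute the lower bound first via min_order_for_split(), as the updated kernel-doc says. A hedged sketch of that calling convention; split_locked_folio_to_min_order() is a made-up name for illustration, and the real caller in this series is mm/truncate.c:

static int split_locked_folio_to_min_order(struct folio *folio,
					   struct page *split_at)
{
	int min_order = min_order_for_split(folio);

	if (min_order < 0)
		return min_order;	/* negative on failure, e.g. a truncated folio */

	/*
	 * Non-uniform split down to the lowest order the mapping allows;
	 * falls back to a uniform split internally when unsupported.
	 */
	return try_folio_split_to_order(folio, split_at, min_order);
}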
+10 -3
include/linux/mm.h
··· 2074 2074 return folio_large_nr_pages(folio); 2075 2075 } 2076 2076 2077 - #if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) 2077 + #if !defined(CONFIG_HAVE_GIGANTIC_FOLIOS) 2078 2078 /* 2079 2079 * We don't expect any folios that exceed buddy sizes (and consequently 2080 2080 * memory sections). ··· 2087 2087 * pages are guaranteed to be contiguous. 2088 2088 */ 2089 2089 #define MAX_FOLIO_ORDER PFN_SECTION_SHIFT 2090 - #else 2090 + #elif defined(CONFIG_HUGETLB_PAGE) 2091 2091 /* 2092 2092 * There is no real limit on the folio size. We limit them to the maximum we 2093 - * currently expect (e.g., hugetlb, dax). 2093 + * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect 2094 + * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit. 2095 + */ 2096 + #define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) 2097 + #else 2098 + /* 2099 + * Without hugetlb, gigantic folios that are bigger than a single PUD are 2100 + * currently impossible. 2094 2101 */ 2095 2102 #define MAX_FOLIO_ORDER PUD_ORDER 2096 2103 #endif
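The new three-way definition bounds MAX_FOLIO_ORDER at 16 GiB on 64-bit and 1 GiB on 32-bit when gigantic folios are possible. A quick userspace back-of-the-envelope check of what those limits translate to in folio orders; order_of() is a stand-in for the kernel's get_order(), and 4 KiB pages (PAGE_SHIFT == 12) are an assumption, not taken from the patch:

#include <stdio.h>
#include <stdint.h>

/* Userspace stand-in for get_order(), assuming 4 KiB pages. */
static unsigned int order_of(uint64_t size)
{
	unsigned int order = 0;

	size = (size - 1) >> 12;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	printf("MAX_FOLIO_ORDER, 64-bit hugetlb (16 GiB): %u\n",
	       order_of(16ULL << 30));	/* 22 */
	printf("MAX_FOLIO_ORDER, 32-bit hugetlb (1 GiB):  %u\n",
	       order_of(1ULL << 30));	/* 18 */
	return 0;
}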
+9
kernel/Kconfig.kexec
··· 109 109 to keep data or state alive across the kexec. For this to work, 110 110 both source and target kernels need to have this option enabled. 111 111 112 + config KEXEC_HANDOVER_DEBUG 113 + bool "Enable Kexec Handover debug checks" 114 + depends on KEXEC_HANDOVER 115 + help 116 + This option enables extra sanity checks for the Kexec Handover 117 + subsystem. Since, KHO performance is crucial in live update 118 + scenarios and the extra code might be adding overhead it is 119 + only optionally enabled. 120 + 112 121 config CRASH_DUMP 113 122 bool "kernel crash dumps" 114 123 default ARCH_DEFAULT_CRASH_DUMP
+1
kernel/Makefile
··· 83 83 obj-$(CONFIG_KEXEC_FILE) += kexec_file.o 84 84 obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o 85 85 obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o 86 + obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o 86 87 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 87 88 obj-$(CONFIG_COMPAT) += compat.o 88 89 obj-$(CONFIG_CGROUPS) += cgroup/
+1 -1
kernel/crash_core.c
··· 373 373 old_res->start = 0; 374 374 old_res->end = 0; 375 375 } else { 376 - crashk_res.end = ram_res->start - 1; 376 + old_res->end = ram_res->start - 1; 377 377 } 378 378 379 379 crash_free_reserved_phys_range(ram_res->start, ram_res->end);
+3 -1
kernel/gcov/gcc_4_7.c
··· 18 18 #include <linux/mm.h> 19 19 #include "gcov.h" 20 20 21 - #if (__GNUC__ >= 14) 21 + #if (__GNUC__ >= 15) 22 + #define GCOV_COUNTERS 10 23 + #elif (__GNUC__ >= 14) 22 24 #define GCOV_COUNTERS 9 23 25 #elif (__GNUC__ >= 10) 24 26 #define GCOV_COUNTERS 8
+58 -37
kernel/kexec_handover.c
··· 8 8 9 9 #define pr_fmt(fmt) "KHO: " fmt 10 10 11 + #include <linux/cleanup.h> 11 12 #include <linux/cma.h> 12 13 #include <linux/count_zeros.h> 13 14 #include <linux/debugfs.h> ··· 23 22 24 23 #include <asm/early_ioremap.h> 25 24 25 + #include "kexec_handover_internal.h" 26 26 /* 27 27 * KHO is tightly coupled with mm init and needs access to some of mm 28 28 * internal APIs. ··· 69 67 * Keep track of memory that is to be preserved across KHO. 70 68 * 71 69 * The serializing side uses two levels of xarrays to manage chunks of per-order 72 - * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a 73 - * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations 74 - * each bitmap will cover 16M of address space. Thus, for 16G of memory at most 75 - * 512K of bitmap memory will be needed for order 0. 70 + * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order 71 + * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0 72 + * allocations each bitmap will cover 128M of address space. Thus, for 16G of 73 + * memory at most 512K of bitmap memory will be needed for order 0. 76 74 * 77 75 * This approach is fully incremental, as the serialization progresses folios 78 76 * can continue be aggregated to the tracker. The final step, immediately prior ··· 80 78 * successor kernel to parse. 81 79 */ 82 80 83 - #define PRESERVE_BITS (512 * 8) 81 + #define PRESERVE_BITS (PAGE_SIZE * 8) 84 82 85 83 struct kho_mem_phys_bits { 86 84 DECLARE_BITMAP(preserve, PRESERVE_BITS); 87 85 }; 86 + 87 + static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE); 88 88 89 89 struct kho_mem_phys { 90 90 /* ··· 135 131 .finalized = false, 136 132 }; 137 133 138 - static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) 134 + static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) 139 135 { 140 - void *elm, *res; 136 + void *res = xa_load(xa, index); 141 137 142 - elm = xa_load(xa, index); 143 - if (elm) 144 - return elm; 138 + if (res) 139 + return res; 145 140 146 - elm = kzalloc(sz, GFP_KERNEL); 141 + void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL); 142 + 147 143 if (!elm) 148 144 return ERR_PTR(-ENOMEM); 149 145 146 + if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE))) 147 + return ERR_PTR(-EINVAL); 148 + 150 149 res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); 151 150 if (xa_is_err(res)) 152 - res = ERR_PTR(xa_err(res)); 153 - 154 - if (res) { 155 - kfree(elm); 151 + return ERR_PTR(xa_err(res)); 152 + else if (res) 156 153 return res; 157 - } 158 154 159 - return elm; 155 + return no_free_ptr(elm); 160 156 } 161 157 162 158 static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, ··· 171 167 const unsigned long pfn_high = pfn >> order; 172 168 173 169 physxa = xa_load(&track->orders, order); 174 - if (!physxa) 175 - continue; 170 + if (WARN_ON_ONCE(!physxa)) 171 + return; 176 172 177 173 bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); 178 - if (!bits) 179 - continue; 174 + if (WARN_ON_ONCE(!bits)) 175 + return; 180 176 181 177 clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); 182 178 ··· 220 216 } 221 217 } 222 218 223 - bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, 224 - sizeof(*bits)); 219 + bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS); 225 220 if (IS_ERR(bits)) 226 221 return PTR_ERR(bits); 227 222 ··· 348 345 static struct khoser_mem_chunk *new_chunk(struct 
khoser_mem_chunk *cur_chunk, 349 346 unsigned long order) 350 347 { 351 - struct khoser_mem_chunk *chunk; 348 + struct khoser_mem_chunk *chunk __free(free_page) = NULL; 352 349 353 - chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); 350 + chunk = (void *)get_zeroed_page(GFP_KERNEL); 354 351 if (!chunk) 355 - return NULL; 352 + return ERR_PTR(-ENOMEM); 353 + 354 + if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE))) 355 + return ERR_PTR(-EINVAL); 356 + 356 357 chunk->hdr.order = order; 357 358 if (cur_chunk) 358 359 KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); 359 - return chunk; 360 + return no_free_ptr(chunk); 360 361 } 361 362 362 363 static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) ··· 381 374 struct khoser_mem_chunk *chunk = NULL; 382 375 struct kho_mem_phys *physxa; 383 376 unsigned long order; 377 + int err = -ENOMEM; 384 378 385 379 xa_for_each(&ser->track.orders, order, physxa) { 386 380 struct kho_mem_phys_bits *bits; 387 381 unsigned long phys; 388 382 389 383 chunk = new_chunk(chunk, order); 390 - if (!chunk) 384 + if (IS_ERR(chunk)) { 385 + err = PTR_ERR(chunk); 391 386 goto err_free; 387 + } 392 388 393 389 if (!first_chunk) 394 390 first_chunk = chunk; ··· 401 391 402 392 if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { 403 393 chunk = new_chunk(chunk, order); 404 - if (!chunk) 394 + if (IS_ERR(chunk)) { 395 + err = PTR_ERR(chunk); 405 396 goto err_free; 397 + } 406 398 } 407 399 408 400 elm = &chunk->bitmaps[chunk->hdr.num_elms]; ··· 421 409 422 410 err_free: 423 411 kho_mem_ser_free(first_chunk); 424 - return -ENOMEM; 412 + return err; 425 413 } 426 414 427 415 static void __init deserialize_bitmap(unsigned int order, ··· 477 465 * area for early allocations that happen before page allocator is 478 466 * initialized. 
479 467 */ 480 - static struct kho_scratch *kho_scratch; 481 - static unsigned int kho_scratch_cnt; 468 + struct kho_scratch *kho_scratch; 469 + unsigned int kho_scratch_cnt; 482 470 483 471 /* 484 472 * The scratch areas are scaled by default as percent of memory allocated from ··· 764 752 const unsigned int order = folio_order(folio); 765 753 struct kho_mem_track *track = &kho_out.ser.track; 766 754 755 + if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) 756 + return -EINVAL; 757 + 767 758 return __kho_preserve_order(track, pfn, order); 768 759 } 769 760 EXPORT_SYMBOL_GPL(kho_preserve_folio); ··· 789 774 unsigned long pfn = start_pfn; 790 775 unsigned long failed_pfn = 0; 791 776 int err = 0; 777 + 778 + if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT, 779 + nr_pages << PAGE_SHIFT))) { 780 + return -EINVAL; 781 + } 792 782 793 783 while (pfn < end_pfn) { 794 784 const unsigned int order = ··· 882 862 return NULL; 883 863 } 884 864 885 - static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) 865 + static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, 866 + unsigned short order) 886 867 { 887 868 struct kho_mem_track *track = &kho_out.ser.track; 888 869 unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); 889 870 890 871 __kho_unpreserve(track, pfn, pfn + 1); 891 872 892 - for (int i = 0; chunk->phys[i]; i++) { 873 + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { 893 874 pfn = PHYS_PFN(chunk->phys[i]); 894 - __kho_unpreserve(track, pfn, pfn + 1); 875 + __kho_unpreserve(track, pfn, pfn + (1 << order)); 895 876 } 896 877 } 897 878 ··· 903 882 while (chunk) { 904 883 struct kho_vmalloc_chunk *tmp = chunk; 905 884 906 - kho_vmalloc_unpreserve_chunk(chunk); 885 + kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order); 907 886 908 887 chunk = KHOSER_LOAD_PTR(chunk->hdr.next); 909 888 free_page((unsigned long)tmp); ··· 1013 992 while (chunk) { 1014 993 struct page *page; 1015 994 1016 - for (int i = 0; chunk->phys[i]; i++) { 995 + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { 1017 996 phys_addr_t phys = chunk->phys[i]; 1018 997 1019 998 if (idx + contig_pages > total_pages)
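The updated comment scales the per-chunk preservation bitmaps from 512 bytes to PAGE_SIZE. The coverage figures it quotes can be sanity-checked with a few lines of userspace arithmetic, assuming 4 KiB pages to match the comment's own example:

#include <stdio.h>

#define PAGE_SIZE	4096ULL
#define PRESERVE_BITS	(PAGE_SIZE * 8)		/* bits per bitmap chunk */

int main(void)
{
	/* One order-0 bitmap covers PRESERVE_BITS pages of 4 KiB each. */
	unsigned long long order0_span = PRESERVE_BITS * PAGE_SIZE;
	printf("order-0 span per bitmap: %llu MiB\n", order0_span >> 20);	/* 128 */

	/* Bitmap memory needed to track 16 GiB of order-0 allocations. */
	unsigned long long chunks = (16ULL << 30) / order0_span;
	printf("bitmap memory for 16 GiB: %llu KiB\n",
	       chunks * PAGE_SIZE >> 10);					/* 512 */

	/* At 1 GiB granularity a single bitmap spans PRESERVE_BITS GiB. */
	printf("1 GiB-order span per bitmap: %llu TiB\n", PRESERVE_BITS >> 10);	/* 32 */
	return 0;
}

A single 1 GiB-order bitmap therefore spans 32 TiB of address space, which is consistent with the comment's claim that an 8 TB system fits in one 4096-byte bitmap.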
+25
kernel/kexec_handover_debug.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * kexec_handover_debug.c - kexec handover optional debug functionality 4 + * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com> 5 + */ 6 + 7 + #define pr_fmt(fmt) "KHO: " fmt 8 + 9 + #include "kexec_handover_internal.h" 10 + 11 + bool kho_scratch_overlap(phys_addr_t phys, size_t size) 12 + { 13 + phys_addr_t scratch_start, scratch_end; 14 + unsigned int i; 15 + 16 + for (i = 0; i < kho_scratch_cnt; i++) { 17 + scratch_start = kho_scratch[i].addr; 18 + scratch_end = kho_scratch[i].addr + kho_scratch[i].size; 19 + 20 + if (phys < scratch_end && (phys + size) > scratch_start) 21 + return true; 22 + } 23 + 24 + return false; 25 + }
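kho_scratch_overlap() is the standard half-open interval test: [phys, phys + size) intersects a scratch area exactly when it starts before the area's end and ends after the area's start. A tiny standalone check of the boundary cases; ranges_overlap() is illustrative, not a kernel API:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool ranges_overlap(uint64_t phys, uint64_t size,
			   uint64_t start, uint64_t end)
{
	return phys < end && phys + size > start;
}

int main(void)
{
	/* Adjacent ranges do not overlap: both comparisons are strict. */
	assert(!ranges_overlap(0x1000, 0x1000, 0x2000, 0x3000));
	/* A single overlapping byte is enough to trip the check. */
	assert(ranges_overlap(0x1fff, 0x1000, 0x2000, 0x3000));
	/* Starting exactly at the end of the scratch area is fine. */
	assert(!ranges_overlap(0x3000, 0x1000, 0x2000, 0x3000));
	return 0;
}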
+20
kernel/kexec_handover_internal.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H 3 + #define LINUX_KEXEC_HANDOVER_INTERNAL_H 4 + 5 + #include <linux/kexec_handover.h> 6 + #include <linux/types.h> 7 + 8 + extern struct kho_scratch *kho_scratch; 9 + extern unsigned int kho_scratch_cnt; 10 + 11 + #ifdef CONFIG_KEXEC_HANDOVER_DEBUG 12 + bool kho_scratch_overlap(phys_addr_t phys, size_t size); 13 + #else 14 + static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) 15 + { 16 + return false; 17 + } 18 + #endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ 19 + 20 + #endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */
+16 -14
lib/maple_tree.c
··· 64 64 #define CREATE_TRACE_POINTS 65 65 #include <trace/events/maple_tree.h> 66 66 67 + #define TP_FCT tracepoint_string(__func__) 68 + 67 69 /* 68 70 * Kernel pointer hashing renders much of the maple tree dump useless as tagged 69 71 * pointers get hashed to arbitrary values. ··· 2758 2756 MA_STATE(l_mas, mas->tree, mas->index, mas->last); 2759 2757 MA_STATE(r_mas, mas->tree, mas->index, mas->last); 2760 2758 2761 - trace_ma_op(__func__, mas); 2759 + trace_ma_op(TP_FCT, mas); 2762 2760 2763 2761 /* 2764 2762 * Rebalancing occurs if a node is insufficient. Data is rebalanced ··· 2999 2997 MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); 3000 2998 MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); 3001 2999 3002 - trace_ma_op(__func__, mas); 3000 + trace_ma_op(TP_FCT, mas); 3003 3001 3004 3002 mast.l = &l_mas; 3005 3003 mast.r = &r_mas; ··· 3174 3172 return false; 3175 3173 } 3176 3174 3177 - trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry); 3175 + trace_ma_write(TP_FCT, wr_mas->mas, wr_mas->r_max, entry); 3178 3176 return true; 3179 3177 } 3180 3178 ··· 3418 3416 * of data may happen. 3419 3417 */ 3420 3418 mas = wr_mas->mas; 3421 - trace_ma_op(__func__, mas); 3419 + trace_ma_op(TP_FCT, mas); 3422 3420 3423 3421 if (unlikely(!mas->index && mas->last == ULONG_MAX)) 3424 3422 return mas_new_root(mas, wr_mas->entry); ··· 3554 3552 } else { 3555 3553 memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); 3556 3554 } 3557 - trace_ma_write(__func__, mas, 0, wr_mas->entry); 3555 + trace_ma_write(TP_FCT, mas, 0, wr_mas->entry); 3558 3556 mas_update_gap(mas); 3559 3557 mas->end = new_end; 3560 3558 return; ··· 3598 3596 mas->offset++; /* Keep mas accurate. */ 3599 3597 } 3600 3598 3601 - trace_ma_write(__func__, mas, 0, wr_mas->entry); 3599 + trace_ma_write(TP_FCT, mas, 0, wr_mas->entry); 3602 3600 /* 3603 3601 * Only update gap when the new entry is empty or there is an empty 3604 3602 * entry in the original two ranges. 
··· 3719 3717 mas_update_gap(mas); 3720 3718 3721 3719 mas->end = new_end; 3722 - trace_ma_write(__func__, mas, new_end, wr_mas->entry); 3720 + trace_ma_write(TP_FCT, mas, new_end, wr_mas->entry); 3723 3721 return; 3724 3722 } 3725 3723 ··· 3733 3731 { 3734 3732 struct maple_big_node b_node; 3735 3733 3736 - trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); 3734 + trace_ma_write(TP_FCT, wr_mas->mas, 0, wr_mas->entry); 3737 3735 memset(&b_node, 0, sizeof(struct maple_big_node)); 3738 3736 mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); 3739 3737 mas_commit_b_node(wr_mas, &b_node); ··· 5064 5062 { 5065 5063 MA_WR_STATE(wr_mas, mas, entry); 5066 5064 5067 - trace_ma_write(__func__, mas, 0, entry); 5065 + trace_ma_write(TP_FCT, mas, 0, entry); 5068 5066 #ifdef CONFIG_DEBUG_MAPLE_TREE 5069 5067 if (MAS_WARN_ON(mas, mas->index > mas->last)) 5070 5068 pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last, ··· 5165 5163 } 5166 5164 5167 5165 store: 5168 - trace_ma_write(__func__, mas, 0, entry); 5166 + trace_ma_write(TP_FCT, mas, 0, entry); 5169 5167 mas_wr_store_entry(&wr_mas); 5170 5168 MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); 5171 5169 mas_destroy(mas); ··· 5884 5882 MA_STATE(mas, mt, index, index); 5885 5883 void *entry; 5886 5884 5887 - trace_ma_read(__func__, &mas); 5885 + trace_ma_read(TP_FCT, &mas); 5888 5886 rcu_read_lock(); 5889 5887 retry: 5890 5888 entry = mas_start(&mas); ··· 5927 5925 MA_STATE(mas, mt, index, last); 5928 5926 int ret = 0; 5929 5927 5930 - trace_ma_write(__func__, &mas, 0, entry); 5928 + trace_ma_write(TP_FCT, &mas, 0, entry); 5931 5929 if (WARN_ON_ONCE(xa_is_advanced(entry))) 5932 5930 return -EINVAL; 5933 5931 ··· 6150 6148 void *entry = NULL; 6151 6149 6152 6150 MA_STATE(mas, mt, index, index); 6153 - trace_ma_op(__func__, &mas); 6151 + trace_ma_op(TP_FCT, &mas); 6154 6152 6155 6153 mtree_lock(mt); 6156 6154 entry = mas_erase(&mas); ··· 6487 6485 unsigned long copy = *index; 6488 6486 #endif 6489 6487 6490 - trace_ma_read(__func__, &mas); 6488 + trace_ma_read(TP_FCT, &mas); 6491 6489 6492 6490 if ((*index) > max) 6493 6491 return NULL;
+3
lib/test_kho.c
··· 301 301 phys_addr_t fdt_phys; 302 302 int err; 303 303 304 + if (!kho_is_enabled()) 305 + return 0; 306 + 304 307 err = kho_retrieve_subtree(KHO_TEST_FDT, &fdt_phys); 305 308 if (!err) 306 309 return kho_test_restore(fdt_phys);
+7
mm/Kconfig
··· 908 908 config PGTABLE_HAS_HUGE_LEAVES 909 909 def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE 910 910 911 + # 912 + # We can end up creating gigantic folio. 913 + # 914 + config HAVE_GIGANTIC_FOLIOS 915 + def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \ 916 + (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) 917 + 911 918 # TODO: Allow to be enabled without THP 912 919 config ARCH_SUPPORTS_HUGE_PFNMAP 913 920 def_bool n
+6 -3
mm/damon/stat.c
··· 46 46 47 47 static struct damon_ctx *damon_stat_context; 48 48 49 + static unsigned long damon_stat_last_refresh_jiffies; 50 + 49 51 static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c) 50 52 { 51 53 struct damon_target *t; ··· 132 130 static int damon_stat_damon_call_fn(void *data) 133 131 { 134 132 struct damon_ctx *c = data; 135 - static unsigned long last_refresh_jiffies; 136 133 137 134 /* avoid unnecessarily frequent stat update */ 138 - if (time_before_eq(jiffies, last_refresh_jiffies + 135 + if (time_before_eq(jiffies, damon_stat_last_refresh_jiffies + 139 136 msecs_to_jiffies(5 * MSEC_PER_SEC))) 140 137 return 0; 141 - last_refresh_jiffies = jiffies; 138 + damon_stat_last_refresh_jiffies = jiffies; 142 139 143 140 aggr_interval_us = c->attrs.aggr_interval; 144 141 damon_stat_set_estimated_memory_bandwidth(c); ··· 211 210 err = damon_start(&damon_stat_context, 1, true); 212 211 if (err) 213 212 return err; 213 + 214 + damon_stat_last_refresh_jiffies = jiffies; 214 215 call_control.data = damon_stat_context; 215 216 return damon_call(damon_stat_context, &call_control); 216 217 }
+7 -3
mm/damon/sysfs.c
··· 1552 1552 return ctx; 1553 1553 } 1554 1554 1555 + static unsigned long damon_sysfs_next_update_jiffies; 1556 + 1555 1557 static int damon_sysfs_repeat_call_fn(void *data) 1556 1558 { 1557 1559 struct damon_sysfs_kdamond *sysfs_kdamond = data; 1558 - static unsigned long next_update_jiffies; 1559 1560 1560 1561 if (!sysfs_kdamond->refresh_ms) 1561 1562 return 0; 1562 - if (time_before(jiffies, next_update_jiffies)) 1563 + if (time_before(jiffies, damon_sysfs_next_update_jiffies)) 1563 1564 return 0; 1564 - next_update_jiffies = jiffies + 1565 + damon_sysfs_next_update_jiffies = jiffies + 1565 1566 msecs_to_jiffies(sysfs_kdamond->refresh_ms); 1566 1567 1567 1568 if (!mutex_trylock(&damon_sysfs_lock)) ··· 1607 1606 return err; 1608 1607 } 1609 1608 kdamond->damon_ctx = ctx; 1609 + 1610 + damon_sysfs_next_update_jiffies = 1611 + jiffies + msecs_to_jiffies(kdamond->refresh_ms); 1610 1612 1611 1613 repeat_call_control->fn = damon_sysfs_repeat_call_fn; 1612 1614 repeat_call_control->data = kdamond;
+23 -10
mm/filemap.c
··· 3681 3681 static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, 3682 3682 struct folio *folio, unsigned long start, 3683 3683 unsigned long addr, unsigned int nr_pages, 3684 - unsigned long *rss, unsigned short *mmap_miss) 3684 + unsigned long *rss, unsigned short *mmap_miss, 3685 + pgoff_t file_end) 3685 3686 { 3687 + struct address_space *mapping = folio->mapping; 3686 3688 unsigned int ref_from_caller = 1; 3687 3689 vm_fault_t ret = 0; 3688 3690 struct page *page = folio_page(folio, start); ··· 3693 3691 unsigned long addr0; 3694 3692 3695 3693 /* 3696 - * Map the large folio fully where possible. 3694 + * Map the large folio fully where possible: 3697 3695 * 3698 - * The folio must not cross VMA or page table boundary. 3696 + * - The folio is fully within size of the file or belong 3697 + * to shmem/tmpfs; 3698 + * - The folio doesn't cross VMA boundary; 3699 + * - The folio doesn't cross page table boundary; 3699 3700 */ 3700 3701 addr0 = addr - start * PAGE_SIZE; 3701 - if (folio_within_vma(folio, vmf->vma) && 3702 + if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && 3703 + folio_within_vma(folio, vmf->vma) && 3702 3704 (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) { 3703 3705 vmf->pte -= start; 3704 3706 page -= start; ··· 3823 3817 if (!folio) 3824 3818 goto out; 3825 3819 3826 - if (filemap_map_pmd(vmf, folio, start_pgoff)) { 3820 + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; 3821 + end_pgoff = min(end_pgoff, file_end); 3822 + 3823 + /* 3824 + * Do not allow to map with PMD across i_size to preserve 3825 + * SIGBUS semantics. 3826 + * 3827 + * Make an exception for shmem/tmpfs that for long time 3828 + * intentionally mapped with PMDs across i_size. 3829 + */ 3830 + if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) && 3831 + filemap_map_pmd(vmf, folio, start_pgoff)) { 3827 3832 ret = VM_FAULT_NOPAGE; 3828 3833 goto out; 3829 3834 } ··· 3846 3829 folio_put(folio); 3847 3830 goto out; 3848 3831 } 3849 - 3850 - file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; 3851 - if (end_pgoff > file_end) 3852 - end_pgoff = file_end; 3853 3832 3854 3833 folio_type = mm_counter_file(folio); 3855 3834 do { ··· 3863 3850 else 3864 3851 ret |= filemap_map_folio_range(vmf, folio, 3865 3852 xas.xa_index - folio->index, addr, 3866 - nr_pages, &rss, &mmap_miss); 3853 + nr_pages, &rss, &mmap_miss, file_end); 3867 3854 3868 3855 folio_unlock(folio); 3869 3856 } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
+37 -26
mm/huge_memory.c
··· 214 214 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 215 215 return true; 216 216 217 - zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, 217 + zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO | __GFP_ZEROTAGS) & 218 + ~__GFP_MOVABLE, 218 219 HPAGE_PMD_ORDER); 219 220 if (!zero_folio) { 220 221 count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); ··· 3264 3263 caller_pins; 3265 3264 } 3266 3265 3266 + static bool page_range_has_hwpoisoned(struct page *page, long nr_pages) 3267 + { 3268 + for (; nr_pages; page++, nr_pages--) 3269 + if (PageHWPoison(page)) 3270 + return true; 3271 + return false; 3272 + } 3273 + 3267 3274 /* 3268 3275 * It splits @folio into @new_order folios and copies the @folio metadata to 3269 3276 * all the resulting folios. ··· 3279 3270 static void __split_folio_to_order(struct folio *folio, int old_order, 3280 3271 int new_order) 3281 3272 { 3273 + /* Scan poisoned pages when split a poisoned folio to large folios */ 3274 + const bool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order; 3282 3275 long new_nr_pages = 1 << new_order; 3283 3276 long nr_pages = 1 << old_order; 3284 3277 long i; 3285 3278 3279 + folio_clear_has_hwpoisoned(folio); 3280 + 3281 + /* Check first new_nr_pages since the loop below skips them */ 3282 + if (handle_hwpoison && 3283 + page_range_has_hwpoisoned(folio_page(folio, 0), new_nr_pages)) 3284 + folio_set_has_hwpoisoned(folio); 3286 3285 /* 3287 3286 * Skip the first new_nr_pages, since the new folio from them have all 3288 3287 * the flags from the original folio. 3289 3288 */ 3290 3289 for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) { 3291 3290 struct page *new_head = &folio->page + i; 3292 - 3293 3291 /* 3294 3292 * Careful: new_folio is not a "real" folio before we cleared PageTail. 3295 3293 * Don't pass it around before clear_compound_head(). ··· 3337 3321 #endif 3338 3322 (1L << PG_dirty) | 3339 3323 LRU_GEN_MASK | LRU_REFS_MASK)); 3324 + 3325 + if (handle_hwpoison && 3326 + page_range_has_hwpoisoned(new_head, new_nr_pages)) 3327 + folio_set_has_hwpoisoned(new_folio); 3340 3328 3341 3329 new_folio->mapping = folio->mapping; 3342 3330 new_folio->index = folio->index + i; ··· 3442 3422 if (folio_test_anon(folio)) 3443 3423 mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); 3444 3424 3445 - folio_clear_has_hwpoisoned(folio); 3446 - 3447 3425 /* 3448 3426 * split to new_order one order at a time. For uniform split, 3449 3427 * folio is split to new_order directly. ··· 3522 3504 /* order-1 is not supported for anonymous THP. */ 3523 3505 VM_WARN_ONCE(warns && new_order == 1, 3524 3506 "Cannot split to order-1 folio"); 3525 - return new_order != 1; 3507 + if (new_order == 1) 3508 + return false; 3526 3509 } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && 3527 3510 !mapping_large_folio_support(folio->mapping)) { 3528 3511 /* ··· 3554 3535 if (folio_test_anon(folio)) { 3555 3536 VM_WARN_ONCE(warns && new_order == 1, 3556 3537 "Cannot split to order-1 folio"); 3557 - return new_order != 1; 3538 + if (new_order == 1) 3539 + return false; 3558 3540 } else if (new_order) { 3559 3541 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && 3560 3542 !mapping_large_folio_support(folio->mapping)) { ··· 3619 3599 if (folio != page_folio(split_at) || folio != page_folio(lock_at)) 3620 3600 return -EINVAL; 3621 3601 3602 + /* 3603 + * Folios that just got truncated cannot get split. Signal to the 3604 + * caller that there was a race. 
3605 + * 3606 + * TODO: this will also currently refuse shmem folios that are in the 3607 + * swapcache. 3608 + */ 3609 + if (!is_anon && !folio->mapping) 3610 + return -EBUSY; 3611 + 3622 3612 if (new_order >= folio_order(folio)) 3623 3613 return -EINVAL; 3624 3614 ··· 3669 3639 gfp_t gfp; 3670 3640 3671 3641 mapping = folio->mapping; 3672 - 3673 - /* Truncated ? */ 3674 - /* 3675 - * TODO: add support for large shmem folio in swap cache. 3676 - * When shmem is in swap cache, mapping is NULL and 3677 - * folio_test_swapcache() is true. 3678 - */ 3679 - if (!mapping) { 3680 - ret = -EBUSY; 3681 - goto out; 3682 - } 3683 - 3684 3642 min_order = mapping_min_folio_order(folio->mapping); 3685 3643 if (new_order < min_order) { 3686 - VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u", 3687 - min_order); 3688 3644 ret = -EINVAL; 3689 3645 goto out; 3690 3646 } ··· 4002 3986 4003 3987 int split_folio_to_list(struct folio *folio, struct list_head *list) 4004 3988 { 4005 - int ret = min_order_for_split(folio); 4006 - 4007 - if (ret < 0) 4008 - return ret; 4009 - 4010 - return split_huge_page_to_list_to_order(&folio->page, list, ret); 3989 + return split_huge_page_to_list_to_order(&folio->page, list, 0); 4011 3990 } 4012 3991 4013 3992 /*
-3
mm/kmsan/core.c
··· 72 72 73 73 nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); 74 74 75 - /* Don't sleep. */ 76 - flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM); 77 - 78 75 handle = stack_depot_save(entries, nr_entries, flags); 79 76 return stack_depot_set_extra_bits(handle, extra); 80 77 }
+4 -2
mm/kmsan/hooks.c
··· 84 84 if (s->ctor) 85 85 return; 86 86 kmsan_enter_runtime(); 87 - kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL, 87 + kmsan_internal_poison_memory(object, s->object_size, 88 + GFP_KERNEL & ~(__GFP_RECLAIM), 88 89 KMSAN_POISON_CHECK | KMSAN_POISON_FREE); 89 90 kmsan_leave_runtime(); 90 91 } ··· 115 114 kmsan_enter_runtime(); 116 115 page = virt_to_head_page((void *)ptr); 117 116 KMSAN_WARN_ON(ptr != page_address(page)); 118 - kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL, 117 + kmsan_internal_poison_memory((void *)ptr, page_size(page), 118 + GFP_KERNEL & ~(__GFP_RECLAIM), 119 119 KMSAN_POISON_CHECK | KMSAN_POISON_FREE); 120 120 kmsan_leave_runtime(); 121 121 }
+1 -1
mm/kmsan/shadow.c
··· 208 208 return; 209 209 kmsan_enter_runtime(); 210 210 kmsan_internal_poison_memory(page_address(page), page_size(page), 211 - GFP_KERNEL, 211 + GFP_KERNEL & ~(__GFP_RECLAIM), 212 212 KMSAN_POISON_CHECK | KMSAN_POISON_FREE); 213 213 kmsan_leave_runtime(); 214 214 }
+104 -9
mm/ksm.c
··· 2455 2455 return true; 2456 2456 } 2457 2457 2458 + struct ksm_next_page_arg { 2459 + struct folio *folio; 2460 + struct page *page; 2461 + unsigned long addr; 2462 + }; 2463 + 2464 + static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end, 2465 + struct mm_walk *walk) 2466 + { 2467 + struct ksm_next_page_arg *private = walk->private; 2468 + struct vm_area_struct *vma = walk->vma; 2469 + pte_t *start_ptep = NULL, *ptep, pte; 2470 + struct mm_struct *mm = walk->mm; 2471 + struct folio *folio; 2472 + struct page *page; 2473 + spinlock_t *ptl; 2474 + pmd_t pmd; 2475 + 2476 + if (ksm_test_exit(mm)) 2477 + return 0; 2478 + 2479 + cond_resched(); 2480 + 2481 + pmd = pmdp_get_lockless(pmdp); 2482 + if (!pmd_present(pmd)) 2483 + return 0; 2484 + 2485 + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) { 2486 + ptl = pmd_lock(mm, pmdp); 2487 + pmd = pmdp_get(pmdp); 2488 + 2489 + if (!pmd_present(pmd)) { 2490 + goto not_found_unlock; 2491 + } else if (pmd_leaf(pmd)) { 2492 + page = vm_normal_page_pmd(vma, addr, pmd); 2493 + if (!page) 2494 + goto not_found_unlock; 2495 + folio = page_folio(page); 2496 + 2497 + if (folio_is_zone_device(folio) || !folio_test_anon(folio)) 2498 + goto not_found_unlock; 2499 + 2500 + page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT); 2501 + goto found_unlock; 2502 + } 2503 + spin_unlock(ptl); 2504 + } 2505 + 2506 + start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); 2507 + if (!start_ptep) 2508 + return 0; 2509 + 2510 + for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) { 2511 + pte = ptep_get(ptep); 2512 + 2513 + if (!pte_present(pte)) 2514 + continue; 2515 + 2516 + page = vm_normal_page(vma, addr, pte); 2517 + if (!page) 2518 + continue; 2519 + folio = page_folio(page); 2520 + 2521 + if (folio_is_zone_device(folio) || !folio_test_anon(folio)) 2522 + continue; 2523 + goto found_unlock; 2524 + } 2525 + 2526 + not_found_unlock: 2527 + spin_unlock(ptl); 2528 + if (start_ptep) 2529 + pte_unmap(start_ptep); 2530 + return 0; 2531 + found_unlock: 2532 + folio_get(folio); 2533 + spin_unlock(ptl); 2534 + if (start_ptep) 2535 + pte_unmap(start_ptep); 2536 + private->page = page; 2537 + private->folio = folio; 2538 + private->addr = addr; 2539 + return 1; 2540 + } 2541 + 2542 + static struct mm_walk_ops ksm_next_page_ops = { 2543 + .pmd_entry = ksm_next_page_pmd_entry, 2544 + .walk_lock = PGWALK_RDLOCK, 2545 + }; 2546 + 2458 2547 static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) 2459 2548 { 2460 2549 struct mm_struct *mm; ··· 2631 2542 ksm_scan.address = vma->vm_end; 2632 2543 2633 2544 while (ksm_scan.address < vma->vm_end) { 2545 + struct ksm_next_page_arg ksm_next_page_arg; 2634 2546 struct page *tmp_page = NULL; 2635 - struct folio_walk fw; 2636 2547 struct folio *folio; 2637 2548 2638 2549 if (ksm_test_exit(mm)) 2639 2550 break; 2640 2551 2641 - folio = folio_walk_start(&fw, vma, ksm_scan.address, 0); 2642 - if (folio) { 2643 - if (!folio_is_zone_device(folio) && 2644 - folio_test_anon(folio)) { 2645 - folio_get(folio); 2646 - tmp_page = fw.page; 2647 - } 2648 - folio_walk_end(&fw, vma); 2552 + int found; 2553 + 2554 + found = walk_page_range_vma(vma, ksm_scan.address, 2555 + vma->vm_end, 2556 + &ksm_next_page_ops, 2557 + &ksm_next_page_arg); 2558 + 2559 + if (found > 0) { 2560 + folio = ksm_next_page_arg.folio; 2561 + tmp_page = ksm_next_page_arg.page; 2562 + ksm_scan.address = ksm_next_page_arg.addr; 2563 + } else { 2564 + VM_WARN_ON_ONCE(found < 0); 2565 + ksm_scan.address = vma->vm_end - 
PAGE_SIZE; 2649 2566 } 2650 2567 2651 2568 if (tmp_page) {
+27
mm/memfd.c
··· 96 96 NULL, 97 97 gfp_mask); 98 98 if (folio) { 99 + u32 hash; 100 + 101 + /* 102 + * Zero the folio to prevent information leaks to userspace. 103 + * Use folio_zero_user() which is optimized for huge/gigantic 104 + * pages. Pass 0 as addr_hint since this is not a faulting path 105 + * and we don't have a user virtual address yet. 106 + */ 107 + folio_zero_user(folio, 0); 108 + 109 + /* 110 + * Mark the folio uptodate before adding to page cache, 111 + * as required by filemap.c and other hugetlb paths. 112 + */ 113 + __folio_mark_uptodate(folio); 114 + 115 + /* 116 + * Serialize hugepage allocation and instantiation to prevent 117 + * races with concurrent allocations, as required by all other 118 + * callers of hugetlb_add_to_page_cache(). 119 + */ 120 + hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx); 121 + mutex_lock(&hugetlb_fault_mutex_table[hash]); 122 + 99 123 err = hugetlb_add_to_page_cache(folio, 100 124 memfd->f_mapping, 101 125 idx); 126 + 127 + mutex_unlock(&hugetlb_fault_mutex_table[hash]); 128 + 102 129 if (err) { 103 130 folio_put(folio); 104 131 goto err_unresv;
+19 -1
mm/memory.c
··· 65 65 #include <linux/gfp.h> 66 66 #include <linux/migrate.h> 67 67 #include <linux/string.h> 68 + #include <linux/shmem_fs.h> 68 69 #include <linux/memory-tiers.h> 69 70 #include <linux/debugfs.h> 70 71 #include <linux/userfaultfd_k.h> ··· 5502 5501 return ret; 5503 5502 } 5504 5503 5504 + if (!needs_fallback && vma->vm_file) { 5505 + struct address_space *mapping = vma->vm_file->f_mapping; 5506 + pgoff_t file_end; 5507 + 5508 + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); 5509 + 5510 + /* 5511 + * Do not allow to map with PTEs beyond i_size and with PMD 5512 + * across i_size to preserve SIGBUS semantics. 5513 + * 5514 + * Make an exception for shmem/tmpfs that for long time 5515 + * intentionally mapped with PMDs across i_size. 5516 + */ 5517 + needs_fallback = !shmem_mapping(mapping) && 5518 + file_end < folio_next_index(folio); 5519 + } 5520 + 5505 5521 if (pmd_none(*vmf->pmd)) { 5506 - if (folio_test_pmd_mappable(folio)) { 5522 + if (!needs_fallback && folio_test_pmd_mappable(folio)) { 5507 5523 ret = do_set_pmd(vmf, folio, page); 5508 5524 if (ret != VM_FAULT_FALLBACK) 5509 5525 return ret;
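The added check computes the last file index from i_size and forces a fallback when a large folio would be mapped across EOF (shmem/tmpfs excepted), so accesses beyond i_size keep raising SIGBUS. A worked example with made-up numbers showing when the fallback triggers, mirroring the patch's file_end < folio_next_index() comparison:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096ULL
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	/* Hypothetical: a 5 MiB file, faulting on a 2 MiB folio at offset 4 MiB. */
	uint64_t i_size = 5ULL << 20;
	uint64_t folio_index = 1024;			/* 4 MiB / 4 KiB */
	uint64_t folio_next_index = folio_index + 512;	/* 2 MiB folio */

	uint64_t file_end = DIV_ROUND_UP(i_size, PAGE_SIZE);	/* 1280 */

	/* Same comparison as the new code: fall back if the folio crosses EOF. */
	printf("%s\n", file_end < folio_next_index ?
	       "needs_fallback: map page by page" : "map the whole folio");
	return 0;
}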
+1 -1
mm/mm_init.c
··· 2469 2469 panic("Failed to allocate %s hash table\n", tablename); 2470 2470 2471 2471 pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", 2472 - tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, 2472 + tablename, 1UL << log2qty, get_order(size), size, 2473 2473 virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear"); 2474 2474 2475 2475 if (_hash_shift)
+1
mm/mmap_lock.c
··· 241 241 if (PTR_ERR(vma) == -EAGAIN) { 242 242 count_vm_vma_lock_event(VMA_LOCK_MISS); 243 243 /* The area was replaced with another one */ 244 + mas_set(&mas, address); 244 245 goto retry; 245 246 } 246 247
+1 -1
mm/mremap.c
··· 187 187 if (!folio || !folio_test_large(folio)) 188 188 return 1; 189 189 190 - return folio_pte_batch(folio, ptep, pte, max_nr); 190 + return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, FPB_RESPECT_WRITE); 191 191 } 192 192 193 193 static int move_ptes(struct pagetable_move_control *pmc,
+1 -1
mm/secretmem.c
··· 82 82 __folio_mark_uptodate(folio); 83 83 err = filemap_add_folio(mapping, folio, offset, gfp); 84 84 if (unlikely(err)) { 85 - folio_put(folio); 86 85 /* 87 86 * If a split of large page was required, it 88 87 * already happened when we marked the page invalid 89 88 * which guarantees that this call won't fail 90 89 */ 91 90 set_direct_map_default_noflush(folio_page(folio, 0)); 91 + folio_put(folio); 92 92 if (err == -EEXIST) 93 93 goto retry; 94 94
+6 -3
mm/shmem.c
··· 1882 1882 struct shmem_inode_info *info = SHMEM_I(inode); 1883 1883 unsigned long suitable_orders = 0; 1884 1884 struct folio *folio = NULL; 1885 + pgoff_t aligned_index; 1885 1886 long pages; 1886 1887 int error, order; 1887 1888 ··· 1896 1895 order = highest_order(suitable_orders); 1897 1896 while (suitable_orders) { 1898 1897 pages = 1UL << order; 1899 - index = round_down(index, pages); 1900 - folio = shmem_alloc_folio(gfp, order, info, index); 1901 - if (folio) 1898 + aligned_index = round_down(index, pages); 1899 + folio = shmem_alloc_folio(gfp, order, info, aligned_index); 1900 + if (folio) { 1901 + index = aligned_index; 1902 1902 goto allocated; 1903 + } 1903 1904 1904 1905 if (pages == HPAGE_PMD_NR) 1905 1906 count_vm_event(THP_FILE_FALLBACK);
+5 -1
mm/slub.c
··· 2046 2046 if (slab_exts) { 2047 2047 unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, 2048 2048 obj_exts_slab, obj_exts); 2049 - /* codetag should be NULL */ 2049 + 2050 + if (unlikely(is_codetag_empty(&slab_exts[offs].ref))) 2051 + return; 2052 + 2053 + /* codetag should be NULL here */ 2050 2054 WARN_ON(slab_exts[offs].ref.ct); 2051 2055 set_codetag_empty(&slab_exts[offs].ref); 2052 2056 }
+13
mm/swap_state.c
··· 748 748 749 749 blk_start_plug(&plug); 750 750 for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) { 751 + struct swap_info_struct *si = NULL; 752 + 751 753 if (!pte++) { 752 754 pte = pte_offset_map(vmf->pmd, addr); 753 755 if (!pte) ··· 763 761 continue; 764 762 pte_unmap(pte); 765 763 pte = NULL; 764 + /* 765 + * Readahead entry may come from a device that we are not 766 + * holding a reference to, try to grab a reference, or skip. 767 + */ 768 + if (swp_type(entry) != swp_type(targ_entry)) { 769 + si = get_swap_device(entry); 770 + if (!si) 771 + continue; 772 + } 766 773 folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, 767 774 &page_allocated, false); 775 + if (si) 776 + put_swap_device(si); 768 777 if (!folio) 769 778 continue; 770 779 if (page_allocated) {
+1 -3
mm/swapfile.c
··· 2005 2005 local_lock(&percpu_swap_cluster.lock); 2006 2006 offset = cluster_alloc_swap_entry(si, 0, 1); 2007 2007 local_unlock(&percpu_swap_cluster.lock); 2008 - if (offset) { 2008 + if (offset) 2009 2009 entry = swp_entry(si->type, offset); 2010 - atomic_long_dec(&nr_swap_pages); 2011 - } 2012 2010 } 2013 2011 put_swap_device(si); 2014 2012 }
+31 -6
mm/truncate.c
··· 177 177 return 0; 178 178 } 179 179 180 + static int try_folio_split_or_unmap(struct folio *folio, struct page *split_at, 181 + unsigned long min_order) 182 + { 183 + enum ttu_flags ttu_flags = 184 + TTU_SYNC | 185 + TTU_SPLIT_HUGE_PMD | 186 + TTU_IGNORE_MLOCK; 187 + int ret; 188 + 189 + ret = try_folio_split_to_order(folio, split_at, min_order); 190 + 191 + /* 192 + * If the split fails, unmap the folio, so it will be refaulted 193 + * with PTEs to respect SIGBUS semantics. 194 + * 195 + * Make an exception for shmem/tmpfs that for long time 196 + * intentionally mapped with PMDs across i_size. 197 + */ 198 + if (ret && !shmem_mapping(folio->mapping)) { 199 + try_to_unmap(folio, ttu_flags); 200 + WARN_ON(folio_mapped(folio)); 201 + } 202 + 203 + return ret; 204 + } 205 + 180 206 /* 181 207 * Handle partial folios. The folio may be entirely within the 182 208 * range if a split has raced with us. If not, we zero the part of the ··· 220 194 size_t size = folio_size(folio); 221 195 unsigned int offset, length; 222 196 struct page *split_at, *split_at2; 197 + unsigned int min_order; 223 198 224 199 if (pos < start) 225 200 offset = start - pos; ··· 250 223 if (!folio_test_large(folio)) 251 224 return true; 252 225 226 + min_order = mapping_min_folio_order(folio->mapping); 253 227 split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE); 254 - if (!try_folio_split(folio, split_at, NULL)) { 228 + if (!try_folio_split_or_unmap(folio, split_at, min_order)) { 255 229 /* 256 230 * try to split at offset + length to make sure folios within 257 231 * the range can be dropped, especially to avoid memory waste ··· 276 248 if (!folio_trylock(folio2)) 277 249 goto out; 278 250 279 - /* 280 - * make sure folio2 is large and does not change its mapping. 281 - * Its split result does not matter here. 282 - */ 251 + /* make sure folio2 is large and does not change its mapping */ 283 252 if (folio_test_large(folio2) && 284 253 folio2->mapping == folio->mapping) 285 - try_folio_split(folio2, split_at2, NULL); 254 + try_folio_split_or_unmap(folio2, split_at2, min_order); 286 255 287 256 folio_unlock(folio2); 288 257 out:
+8 -6
scripts/decode_stacktrace.sh
··· 277 277 fi 278 278 done 279 279 280 - if [[ ${words[$last]} =~ ^[0-9a-f]+\] ]]; then 281 - words[$last-1]="${words[$last-1]} ${words[$last]}" 282 - unset words[$last] spaces[$last] 283 - last=$(( $last - 1 )) 284 - fi 285 - 286 280 # Extract info after the symbol if present. E.g.: 287 281 # func_name+0x54/0x80 (P) 288 282 # ^^^ ··· 285 291 local info_str="" 286 292 if [[ ${words[$last]} =~ \([A-Z]*\) ]]; then 287 293 info_str=${words[$last]} 294 + unset words[$last] spaces[$last] 295 + last=$(( $last - 1 )) 296 + fi 297 + 298 + # Join module name with its build id if present, as these were 299 + # split during tokenization (e.g. "[module" and "modbuildid]"). 300 + if [[ ${words[$last]} =~ ^[0-9a-f]+\] ]]; then 301 + words[$last-1]="${words[$last-1]} ${words[$last]}" 288 302 unset words[$last] spaces[$last] 289 303 last=$(( $last - 1 )) 290 304 fi
+7 -8
tools/testing/selftests/mm/uffd-unit-tests.c
··· 1758 1758 uffd_test_ops = mem_type->mem_ops; 1759 1759 uffd_test_case_ops = test->test_case_ops; 1760 1760 1761 - if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) 1761 + if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) { 1762 1762 gopts.page_size = default_huge_page_size(); 1763 - else 1763 + if (gopts.page_size == 0) { 1764 + uffd_test_skip("huge page size is 0, feature missing?"); 1765 + continue; 1766 + } 1767 + } else { 1764 1768 gopts.page_size = psize(); 1769 + } 1765 1770 1766 1771 /* Ensure we have at least 2 pages */ 1767 1772 gopts.nr_pages = MAX(UFFD_TEST_MEM_SIZE, gopts.page_size * 2) ··· 1781 1776 continue; 1782 1777 1783 1778 uffd_test_start("%s on %s", test->name, mem_type->name); 1784 - if ((mem_type->mem_flag == MEM_HUGETLB || 1785 - mem_type->mem_flag == MEM_HUGETLB_PRIVATE) && 1786 - (default_huge_page_size() == 0)) { 1787 - uffd_test_skip("huge page size is 0, feature missing?"); 1788 - continue; 1789 - } 1790 1779 if (!uffd_feature_supported(test)) { 1791 1780 uffd_test_skip("feature missing"); 1792 1781 continue;
+1 -1
tools/testing/selftests/user_events/perf_test.c
··· 236 236 ASSERT_EQ(1 << reg.enable_bit, self->check); 237 237 238 238 /* Ensure write shows up at correct offset */ 239 - ASSERT_NE(-1, write(self->data_fd, &reg.write_index, 239 + ASSERT_NE(-1, write(self->data_fd, (void *)&reg.write_index, 240 240 sizeof(reg.write_index))); 241 241 val = (void *)(((char *)perf_page) + perf_page->data_offset); 242 242 ASSERT_EQ(PERF_RECORD_SAMPLE, *val);