Merge tag 'mm-hotfixes-stable-2025-05-10-14-23' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc hotfixes from Andrew Morton:
"22 hotfixes. 13 are cc:stable and the remainder address post-6.14
issues or aren't considered necessary for -stable kernels.

About half are for MM. Five OCFS2 fixes and a few MAINTAINERS updates"

* tag 'mm-hotfixes-stable-2025-05-10-14-23' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (22 commits)
mm: fix folio_pte_batch() on XEN PV
nilfs2: fix deadlock warnings caused by lock dependency in init_nilfs()
mm/hugetlb: copy the CMA flag when demoting
mm, swap: fix false warning for large allocation with !THP_SWAP
selftests/mm: fix a build failure on powerpc
selftests/mm: fix build break when compiling pkey_util.c
mm: vmalloc: support more granular vrealloc() sizing
tools/testing/selftests: fix guard region test tmpfs assumption
ocfs2: stop quota recovery before disabling quotas
ocfs2: implement handshaking with ocfs2 recovery thread
ocfs2: switch osb->disable_recovery to enum
mailmap: map Uwe's BayLibre addresses to a single one
MAINTAINERS: add mm THP section
mm/userfaultfd: fix uninitialized output field for -EAGAIN race
selftests/mm: compaction_test: support platform with huge mount of memory
MAINTAINERS: add core mm section
ocfs2: fix panic in failed foilio allocation
mm/huge_memory: fix dereferencing invalid pmd migration entry
MAINTAINERS: add reverse mapping section
x86: disable image size check for test builds
...

+314 -95
+3
.mailmap
···
447 447  Luca Weiss <luca@lucaweiss.eu> <luca@z3ntu.xyz>
448 448  Lukasz Luba <lukasz.luba@arm.com> <l.luba@partner.samsung.com>
449 449  Luo Jie <quic_luoj@quicinc.com> <luoj@codeaurora.org>
450 +    Lance Yang <lance.yang@linux.dev> <ioworker0@gmail.com>
451 +    Lance Yang <lance.yang@linux.dev> <mingzhe.yang@ly.com>
450 452  Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
451 453  Maciej W. Rozycki <macro@orcam.me.uk> <macro@linux-mips.org>
452 454  Maharaja Kennadyrajan <quic_mkenna@quicinc.com> <mkenna@codeaurora.org>
···
751 749  Tycho Andersen <tycho@tycho.pizza> <tycho@tycho.ws>
752 750  Tzung-Bi Shih <tzungbi@kernel.org> <tzungbi@google.com>
753 751  Uwe Kleine-König <ukleinek@informatik.uni-freiburg.de>
752 +    Uwe Kleine-König <u.kleine-koenig@baylibre.com> <ukleinek@baylibre.com>
754 753  Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
755 754  Uwe Kleine-König <ukleinek@strlen.de>
756 755  Uwe Kleine-König <ukl@pengutronix.de>
+62 -4
MAINTAINERS
···
15495 15495  F: include/linux/gfp.h
15496 15496  F: include/linux/gfp_types.h
15497 15497  F: include/linux/memfd.h
15498 -      F: include/linux/memory.h
15499 15498  F: include/linux/memory_hotplug.h
15500 15499  F: include/linux/memory-tiers.h
15501 15500  F: include/linux/mempolicy.h
15502 15501  F: include/linux/mempool.h
15503 15502  F: include/linux/memremap.h
15504 -      F: include/linux/mm.h
15505 -      F: include/linux/mm_*.h
15506 15503  F: include/linux/mmzone.h
15507 15504  F: include/linux/mmu_notifier.h
15508 15505  F: include/linux/pagewalk.h
15509 -      F: include/linux/rmap.h
15510 15506  F: include/trace/events/ksm.h
15511 15507  F: mm/
15512 15508  F: tools/mm/
15513 15509  F: tools/testing/selftests/mm/
15514 15510  N: include/linux/page[-_]*
15511 +
15512 +      MEMORY MANAGEMENT - CORE
15513 +      M: Andrew Morton <akpm@linux-foundation.org>
15514 +      M: David Hildenbrand <david@redhat.com>
15515 +      R: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
15516 +      R: Liam R. Howlett <Liam.Howlett@oracle.com>
15517 +      R: Vlastimil Babka <vbabka@suse.cz>
15518 +      R: Mike Rapoport <rppt@kernel.org>
15519 +      R: Suren Baghdasaryan <surenb@google.com>
15520 +      R: Michal Hocko <mhocko@suse.com>
15521 +      L: linux-mm@kvack.org
15522 +      S: Maintained
15523 +      W: http://www.linux-mm.org
15524 +      T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
15525 +      F: include/linux/memory.h
15526 +      F: include/linux/mm.h
15527 +      F: include/linux/mm_*.h
15528 +      F: include/linux/mmdebug.h
15529 +      F: include/linux/pagewalk.h
15530 +      F: mm/Kconfig
15531 +      F: mm/debug.c
15532 +      F: mm/init-mm.c
15533 +      F: mm/memory.c
15534 +      F: mm/pagewalk.c
15535 +      F: mm/util.c
15515 15536
15516 15537  MEMORY MANAGEMENT - EXECMEM
15517 15538  M: Andrew Morton <akpm@linux-foundation.org>
···
15567 15546  F: include/linux/gfp.h
15568 15547  F: include/linux/compaction.h
15569 15548
15549 +      MEMORY MANAGEMENT - RMAP (REVERSE MAPPING)
15550 +      M: Andrew Morton <akpm@linux-foundation.org>
15551 +      M: David Hildenbrand <david@redhat.com>
15552 +      M: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
15553 +      R: Rik van Riel <riel@surriel.com>
15554 +      R: Liam R. Howlett <Liam.Howlett@oracle.com>
15555 +      R: Vlastimil Babka <vbabka@suse.cz>
15556 +      R: Harry Yoo <harry.yoo@oracle.com>
15557 +      L: linux-mm@kvack.org
15558 +      S: Maintained
15559 +      F: include/linux/rmap.h
15560 +      F: mm/rmap.c
15561 +
15570 15562  MEMORY MANAGEMENT - SECRETMEM
15571 15563  M: Andrew Morton <akpm@linux-foundation.org>
15572 15564  M: Mike Rapoport <rppt@kernel.org>
···
15587 15553  S: Maintained
15588 15554  F: include/linux/secretmem.h
15589 15555  F: mm/secretmem.c
15556 +
15557 +      MEMORY MANAGEMENT - THP (TRANSPARENT HUGE PAGE)
15558 +      M: Andrew Morton <akpm@linux-foundation.org>
15559 +      M: David Hildenbrand <david@redhat.com>
15560 +      R: Zi Yan <ziy@nvidia.com>
15561 +      R: Baolin Wang <baolin.wang@linux.alibaba.com>
15562 +      R: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
15563 +      R: Liam R. Howlett <Liam.Howlett@oracle.com>
15564 +      R: Nico Pache <npache@redhat.com>
15565 +      R: Ryan Roberts <ryan.roberts@arm.com>
15566 +      R: Dev Jain <dev.jain@arm.com>
15567 +      L: linux-mm@kvack.org
15568 +      S: Maintained
15569 +      W: http://www.linux-mm.org
15570 +      T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
15571 +      F: Documentation/admin-guide/mm/transhuge.rst
15572 +      F: include/linux/huge_mm.h
15573 +      F: include/linux/khugepaged.h
15574 +      F: include/trace/events/huge_memory.h
15575 +      F: mm/huge_memory.c
15576 +      F: mm/khugepaged.c
15577 +      F: tools/testing/selftests/mm/khugepaged.c
15578 +      F: tools/testing/selftests/mm/split_huge_page_test.c
15579 +      F: tools/testing/selftests/mm/transhuge-stress.c
15590 15580
15591 15581  MEMORY MANAGEMENT - USERFAULTFD
15592 15582  M: Andrew Morton <akpm@linux-foundation.org>
+9 -1
arch/x86/kernel/vmlinux.lds.S
···
466 466  }
467 467
468 468  /*
469 -     * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
469 +    * COMPILE_TEST kernels can be large - CONFIG_KASAN, for example, can cause
470 +    * this. Let's assume that nobody will be running a COMPILE_TEST kernel and
471 +    * let's assert that fuller build coverage is more valuable than being able to
472 +    * run a COMPILE_TEST kernel.
473 +    */
474 +   #ifndef CONFIG_COMPILE_TEST
475 +   /*
476 +    * The ASSERT() sync to . is intentional, for binutils 2.14 compatibility:
470 477   */
471 478  . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
472 479      "kernel image bigger than KERNEL_IMAGE_SIZE");
480 +   #endif
473 481
474 482  /* needed for Clang - see arch/x86/entry/entry.S */
475 483  PROVIDE(__ref_stack_chk_guard = __stack_chk_guard);
-3
fs/nilfs2/the_nilfs.c
···
705 705      int blocksize;
706 706      int err;
707 707
708 -        down_write(&nilfs->ns_sem);
709 -
710 708      blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
711 709      if (!blocksize) {
712 710          nilfs_err(sb, "unable to set blocksize");
···
777 779      set_nilfs_init(nilfs);
778 780      err = 0;
779 781  out:
780 -        up_write(&nilfs->ns_sem);
781 782      return err;
782 783
783 784  failed_sbh:
+1
fs/ocfs2/alloc.c
···
6918 6918      if (IS_ERR(folios[numfolios])) {
6919 6919          ret = PTR_ERR(folios[numfolios]);
6920 6920          mlog_errno(ret);
6921 +            folios[numfolios] = NULL;
6921 6922          goto out;
6922 6923      }
6923 6924
+58 -22
fs/ocfs2/journal.c
···
174 174      struct ocfs2_recovery_map *rm;
175 175
176 176      mutex_init(&osb->recovery_lock);
177 -        osb->disable_recovery = 0;
177 +        osb->recovery_state = OCFS2_REC_ENABLED;
178 178      osb->recovery_thread_task = NULL;
179 179      init_waitqueue_head(&osb->recovery_event);
···
190 190      return 0;
191 191  }
192 192
193 -    /* we can't grab the goofy sem lock from inside wait_event, so we use
194 -     * memory barriers to make sure that we'll see the null task before
195 -     * being woken up */
196 193  static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
197 194  {
198 -        mb();
199 195      return osb->recovery_thread_task != NULL;
196 +    }
197 +
198 +    static void ocfs2_recovery_disable(struct ocfs2_super *osb,
199 +                                       enum ocfs2_recovery_state state)
200 +    {
201 +        mutex_lock(&osb->recovery_lock);
202 +        /*
203 +         * If recovery thread is not running, we can directly transition to
204 +         * final state.
205 +         */
206 +        if (!ocfs2_recovery_thread_running(osb)) {
207 +            osb->recovery_state = state + 1;
208 +            goto out_lock;
209 +        }
210 +        osb->recovery_state = state;
211 +        /* Wait for recovery thread to acknowledge state transition */
212 +        wait_event_cmd(osb->recovery_event,
213 +                       !ocfs2_recovery_thread_running(osb) ||
214 +                       osb->recovery_state >= state + 1,
215 +                       mutex_unlock(&osb->recovery_lock),
216 +                       mutex_lock(&osb->recovery_lock));
217 +    out_lock:
218 +        mutex_unlock(&osb->recovery_lock);
219 +
220 +        /*
221 +         * At this point we know that no more recovery work can be queued so
222 +         * wait for any recovery completion work to complete.
223 +         */
224 +        if (osb->ocfs2_wq)
225 +            flush_workqueue(osb->ocfs2_wq);
226 +    }
227 +
228 +    void ocfs2_recovery_disable_quota(struct ocfs2_super *osb)
229 +    {
230 +        ocfs2_recovery_disable(osb, OCFS2_REC_QUOTA_WANT_DISABLE);
200 231  }
201 232
202 233  void ocfs2_recovery_exit(struct ocfs2_super *osb)
···
236 205
237 206      /* disable any new recovery threads and wait for any currently
238 207       * running ones to exit. Do this before setting the vol_state. */
239 -        mutex_lock(&osb->recovery_lock);
240 -        osb->disable_recovery = 1;
241 -        mutex_unlock(&osb->recovery_lock);
242 -        wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
243 -
244 -        /* At this point, we know that no more recovery threads can be
245 -         * launched, so wait for any recovery completion work to
246 -         * complete. */
247 -        if (osb->ocfs2_wq)
248 -            flush_workqueue(osb->ocfs2_wq);
208 +        ocfs2_recovery_disable(osb, OCFS2_REC_WANT_DISABLE);
249 209
250 210      /*
251 211       * Now that recovery is shut down, and the osb is about to be
···
1494 1472      }
1495 1473  }
1496 1474  restart:
1475 +        if (quota_enabled) {
1476 +            mutex_lock(&osb->recovery_lock);
1477 +            /* Confirm that recovery thread will no longer recover quotas */
1478 +            if (osb->recovery_state == OCFS2_REC_QUOTA_WANT_DISABLE) {
1479 +                osb->recovery_state = OCFS2_REC_QUOTA_DISABLED;
1480 +                wake_up(&osb->recovery_event);
1481 +            }
1482 +            if (osb->recovery_state >= OCFS2_REC_QUOTA_DISABLED)
1483 +                quota_enabled = 0;
1484 +            mutex_unlock(&osb->recovery_lock);
1485 +        }
1486 +
1497 1487      status = ocfs2_super_lock(osb, 1);
1498 1488      if (status < 0) {
1499 1489          mlog_errno(status);
···
1603 1569
1604 1570      ocfs2_free_replay_slots(osb);
1605 1571      osb->recovery_thread_task = NULL;
1606 -        mb(); /* sync with ocfs2_recovery_thread_running */
1572 +        if (osb->recovery_state == OCFS2_REC_WANT_DISABLE)
1573 +            osb->recovery_state = OCFS2_REC_DISABLED;
1607 1574      wake_up(&osb->recovery_event);
1608 1575
1609 1576      mutex_unlock(&osb->recovery_lock);
1610 1577
1611 -        if (quota_enabled)
1612 -            kfree(rm_quota);
1578 +        kfree(rm_quota);
1613 1579
1614 1580      return status;
1615 1581  }
1616 1582
1617 1583  void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
1618 1584  {
1585 +        int was_set = -1;
1586 +
1619 1587      mutex_lock(&osb->recovery_lock);
1588 +        if (osb->recovery_state < OCFS2_REC_WANT_DISABLE)
1589 +            was_set = ocfs2_recovery_map_set(osb, node_num);
1620 1590
1621 1591      trace_ocfs2_recovery_thread(node_num, osb->node_num,
1622 -            osb->disable_recovery, osb->recovery_thread_task,
1623 -            osb->disable_recovery ?
1624 -            -1 : ocfs2_recovery_map_set(osb, node_num));
1592 +            osb->recovery_state, osb->recovery_thread_task, was_set);
1625 1593
1626 -        if (osb->disable_recovery)
1594 +        if (osb->recovery_state >= OCFS2_REC_WANT_DISABLE)
1627 1595          goto out;
1628 1596
1629 1597      if (osb->recovery_thread_task)
+1
fs/ocfs2/journal.h
···
148 148
149 149  int ocfs2_recovery_init(struct ocfs2_super *osb);
150 150  void ocfs2_recovery_exit(struct ocfs2_super *osb);
151 +    void ocfs2_recovery_disable_quota(struct ocfs2_super *osb);
151 152
152 153  int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
153 154  void ocfs2_free_replay_slots(struct ocfs2_super *osb);
+16 -1
fs/ocfs2/ocfs2.h
···
308 308  void ocfs2_initialize_journal_triggers(struct super_block *sb,
309 309                                         struct ocfs2_triggers triggers[]);
310 310
311 +    enum ocfs2_recovery_state {
312 +        OCFS2_REC_ENABLED = 0,
313 +        OCFS2_REC_QUOTA_WANT_DISABLE,
314 +        /*
315 +         * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for
316 +         * ocfs2_recovery_disable_quota() to work.
317 +         */
318 +        OCFS2_REC_QUOTA_DISABLED,
319 +        OCFS2_REC_WANT_DISABLE,
320 +        /*
321 +         * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work
322 +         */
323 +        OCFS2_REC_DISABLED,
324 +    };
325 +
311 326  struct ocfs2_journal;
312 327  struct ocfs2_slot_info;
313 328  struct ocfs2_recovery_map;
···
385 370      struct ocfs2_recovery_map *recovery_map;
386 371      struct ocfs2_replay_map *replay_map;
387 372      struct task_struct *recovery_thread_task;
388 -        int disable_recovery;
373 +        enum ocfs2_recovery_state recovery_state;
389 374      wait_queue_head_t checkpoint_event;
390 375      struct ocfs2_journal *journal;
391 376      unsigned long osb_commit_interval;
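An aside on the ordering contract spelled out in those enum comments: ocfs2_recovery_disable() (see the journal.c hunk above) finishes a handshake by advancing a *_WANT_DISABLE state to the state immediately after it, so the enum layout is load-bearing. A minimal illustration of how that assumption could be pinned down at compile time follows; it is not part of the merged patch, only a hedged sketch using the kernel's static_assert() from <linux/build_bug.h>:

  /* Illustrative only, not in the patch: assert the "+ 1" layout the
   * recovery shutdown paths depend on. */
  static_assert(OCFS2_REC_QUOTA_DISABLED == OCFS2_REC_QUOTA_WANT_DISABLE + 1,
                "ocfs2_recovery_disable_quota() advances the state by one");
  static_assert(OCFS2_REC_DISABLED == OCFS2_REC_WANT_DISABLE + 1,
                "ocfs2_recovery_exit() advances the state by one");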
+2 -7
fs/ocfs2/quota_local.c
···
453 453
454 454  /* Sync changes in local quota file into global quota file and
455 455   * reinitialize local quota file.
456 -     * The function expects local quota file to be already locked and
457 -     * s_umount locked in shared mode. */
456 +     * The function expects local quota file to be already locked. */
458 457  static int ocfs2_recover_local_quota_file(struct inode *lqinode,
459 458                                            int type,
460 459                                            struct ocfs2_quota_recovery *rec)
···
587 588  {
588 589      unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
589 590                                            LOCAL_GROUP_QUOTA_SYSTEM_INODE };
590 -        struct super_block *sb = osb->sb;
591 591      struct ocfs2_local_disk_dqinfo *ldinfo;
592 592      struct buffer_head *bh;
593 593      handle_t *handle;
···
598 600      printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
599 601             "slot %u\n", osb->dev_str, slot_num);
600 602
601 -        down_read(&sb->s_umount);
602 603      for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
603 604          if (list_empty(&(rec->r_list[type])))
604 605              continue;
···
674 677          break;
675 678      }
676 679  out:
677 -        up_read(&sb->s_umount);
678 680      kfree(rec);
679 681      return status;
680 682  }
···
839 843      ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
840 844
841 845      /*
842 -         * s_umount held in exclusive mode protects us against racing with
843 -         * recovery thread...
846 +         * ocfs2_dismount_volume() has already aborted quota recovery...
844 847       */
845 848      if (oinfo->dqi_rec) {
846 849          ocfs2_free_quota_recovery(oinfo->dqi_rec);
+32 -6
fs/ocfs2/suballoc.c
···
698 698
699 699      bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
700 700                                             ac, cl);
701 -        if (PTR_ERR(bg_bh) == -ENOSPC)
701 +        if (PTR_ERR(bg_bh) == -ENOSPC) {
702 +            ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
702 703          bg_bh = ocfs2_block_group_alloc_discontig(handle,
703 704                                                    alloc_inode,
704 705                                                    ac, cl);
706 +        }
705 707      if (IS_ERR(bg_bh)) {
706 708          status = PTR_ERR(bg_bh);
707 709          bg_bh = NULL;
···
1796 1794  {
1797 1795      int status;
1798 1796      u16 chain;
1797 +        u32 contig_bits;
1799 1798      u64 next_group;
1800 1799      struct inode *alloc_inode = ac->ac_inode;
1801 1800      struct buffer_head *group_bh = NULL;
···
1822 1819      status = -ENOSPC;
1823 1820      /* for now, the chain search is a bit simplistic. We just use
1824 1821       * the 1st group with any empty bits. */
1825 -        while ((status = ac->ac_group_search(alloc_inode, group_bh,
1826 -                                             bits_wanted, min_bits,
1827 -                                             ac->ac_max_block,
1828 -                                             res)) == -ENOSPC) {
1822 +        while (1) {
1823 +            if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
1824 +                contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
1825 +                if (!contig_bits)
1826 +                    contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
1827 +                                        le16_to_cpu(bg->bg_bits), 0);
1828 +                if (bits_wanted > contig_bits && contig_bits >= min_bits)
1829 +                    bits_wanted = contig_bits;
1830 +            }
1831 +
1832 +            status = ac->ac_group_search(alloc_inode, group_bh,
1833 +                                         bits_wanted, min_bits,
1834 +                                         ac->ac_max_block, res);
1835 +            if (status != -ENOSPC)
1836 +                break;
1829 1837          if (!bg->bg_next_group)
1830 1838              break;
1831 1839
···
1996 1982      victim = ocfs2_find_victim_chain(cl);
1997 1983      ac->ac_chain = victim;
1998 1984
1985 +    search:
1999 1986      status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
2000 1987                                  res, &bits_left);
2001 1988      if (!status) {
···
2035 2020          mlog_errno(status);
2036 2021          goto bail;
2037 2022      }
2023 +        }
2024 +
2025 +        /* Chains can't supply the bits_wanted contiguous space.
2026 +         * We should switch to using every single bit when allocating
2027 +         * from the global bitmap. */
2028 +        if (i == le16_to_cpu(cl->cl_next_free_rec) &&
2029 +            status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) {
2030 +            ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
2031 +            ac->ac_chain = victim;
2032 +            goto search;
2038 2033      }
2039 2034
2040 2035  set_hint:
···
2390 2365      BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2391 2366
2392 2367      BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
2393 -               && ac->ac_which != OCFS2_AC_USE_MAIN);
2368 +               && ac->ac_which != OCFS2_AC_USE_MAIN
2369 +               && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG);
2394 2370
2395 2371      if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
2396 2372          WARN_ON(min_clusters > 1);
+1
fs/ocfs2/suballoc.h
···
29 29  #define OCFS2_AC_USE_MAIN  2
30 30  #define OCFS2_AC_USE_INODE 3
31 31  #define OCFS2_AC_USE_META  4
32 +   #define OCFS2_AC_USE_MAIN_DISCONTIG 5
32 33      u32 ac_which;
33 34
34 35      /* these are used by the chain search */
+3
fs/ocfs2/super.c
···
1812 1812      /* Orphan scan should be stopped as early as possible */
1813 1813      ocfs2_orphan_scan_stop(osb);
1814 1814
1815 +            /* Stop quota recovery so that we can disable quotas */
1816 +            ocfs2_recovery_disable_quota(osb);
1817 +
1815 1818      ocfs2_disable_quotas(osb);
1816 1819
1817 1820      /* All dquots should be freed by now */
+22 -6
fs/userfaultfd.c
···
1585 1585      user_uffdio_copy = (struct uffdio_copy __user *) arg;
1586 1586
1587 1587      ret = -EAGAIN;
1588 -        if (atomic_read(&ctx->mmap_changing))
1588 +        if (unlikely(atomic_read(&ctx->mmap_changing))) {
1589 +            if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
1590 +                return -EFAULT;
1589 1591          goto out;
1592 +        }
1590 1593
1591 1594      ret = -EFAULT;
1592 1595      if (copy_from_user(&uffdio_copy, user_uffdio_copy,
···
1644 1641      user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
1645 1642
1646 1643      ret = -EAGAIN;
1647 -        if (atomic_read(&ctx->mmap_changing))
1644 +        if (unlikely(atomic_read(&ctx->mmap_changing))) {
1645 +            if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
1646 +                return -EFAULT;
1648 1647          goto out;
1648 +        }
1649 1649
1650 1650      ret = -EFAULT;
1651 1651      if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
···
1750 1744      user_uffdio_continue = (struct uffdio_continue __user *)arg;
1751 1745
1752 1746      ret = -EAGAIN;
1753 -        if (atomic_read(&ctx->mmap_changing))
1747 +        if (unlikely(atomic_read(&ctx->mmap_changing))) {
1748 +            if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1749 +                return -EFAULT;
1754 1750          goto out;
1751 +        }
1755 1752
1756 1753      ret = -EFAULT;
1757 1754      if (copy_from_user(&uffdio_continue, user_uffdio_continue,
···
1810 1801      user_uffdio_poison = (struct uffdio_poison __user *)arg;
1811 1802
1812 1803      ret = -EAGAIN;
1813 -        if (atomic_read(&ctx->mmap_changing))
1804 +        if (unlikely(atomic_read(&ctx->mmap_changing))) {
1805 +            if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
1806 +                return -EFAULT;
1814 1807          goto out;
1808 +        }
1815 1809
1816 1810      ret = -EFAULT;
1817 1811      if (copy_from_user(&uffdio_poison, user_uffdio_poison,
···
1882 1870
1883 1871      user_uffdio_move = (struct uffdio_move __user *) arg;
1884 1872
1885 -        if (atomic_read(&ctx->mmap_changing))
1886 -            return -EAGAIN;
1873 +        ret = -EAGAIN;
1874 +        if (unlikely(atomic_read(&ctx->mmap_changing))) {
1875 +            if (unlikely(put_user(ret, &user_uffdio_move->move)))
1876 +                return -EFAULT;
1877 +            goto out;
1878 +        }
1887 1879
1888 1880      if (copy_from_user(&uffdio_move, user_uffdio_move,
1889 1881                         /* don't copy "move" last field */
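For context on what the added put_user() calls buy userspace: the UFFDIO_* requests report -EAGAIN both as the ioctl error and through the request's output field (copy, zeropage, mapped, updated, move), and callers are expected to inspect that field. The sketch below is a hedged illustration of the retry pattern this fix makes reliable; it is not taken from the patch, and it assumes an already created and registered userfaultfd descriptor `uffd`:

  #include <linux/userfaultfd.h>
  #include <sys/ioctl.h>
  #include <errno.h>
  #include <stdint.h>
  #include <stddef.h>

  /* Resolve a fault by copying len bytes into the faulting range, retrying
   * when the address space changed underneath the kernel. */
  static int uffd_copy_retry(int uffd, void *dst, void *src, size_t len)
  {
          for (;;) {
                  struct uffdio_copy copy = {
                          .dst  = (uintptr_t)dst,
                          .src  = (uintptr_t)src,
                          .len  = len,
                          .mode = 0,
                  };

                  if (ioctl(uffd, UFFDIO_COPY, &copy) == 0)
                          return 0;
                  /* With the fix above, copy.copy is written even on the
                   * early -EAGAIN exit, so this check no longer reads an
                   * uninitialized field. */
                  if (errno == EAGAIN && copy.copy == -EAGAIN)
                          continue;
                  return -1;
          }
  }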
+1
include/linux/vmalloc.h
···
61 61      unsigned int nr_pages;
62 62      phys_addr_t phys_addr;
63 63      const void *caller;
64 +        unsigned long requested_size;
64 65  };
65 66
66 67  struct vmap_area {
+8 -3
mm/huge_memory.c
···
3075 3075  void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
3076 3076                             pmd_t *pmd, bool freeze, struct folio *folio)
3077 3077  {
3078 +        bool pmd_migration = is_pmd_migration_entry(*pmd);
3079 +
3078 3080      VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
3079 3081      VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
3080 3082      VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
···
3087 3085       * require a folio to check the PMD against. Otherwise, there
3088 3086       * is a risk of replacing the wrong folio.
3089 3087       */
3090 -        if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
3091 -            is_pmd_migration_entry(*pmd)) {
3092 -            if (folio && folio != pmd_folio(*pmd))
3088 +        if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || pmd_migration) {
3089 +            /*
3090 +             * Do not apply pmd_folio() to a migration entry; and folio lock
3091 +             * guarantees that it must be of the wrong folio anyway.
3092 +             */
3093 +            if (folio && (pmd_migration || folio != pmd_folio(*pmd)))
3093 3094              return;
3094 3095          __split_huge_pmd_locked(vma, pmd, address, freeze);
3095 3096      }
+6
mm/hugetlb.c
···
4034 4034
4035 4035      list_for_each_entry_safe(folio, next, src_list, lru) {
4036 4036          int i;
4037 +            bool cma;
4037 4038
4038 4039          if (folio_test_hugetlb_vmemmap_optimized(folio))
4039 4040              continue;
4041 +
4042 +            cma = folio_test_hugetlb_cma(folio);
4040 4043
4041 4044          list_del(&folio->lru);
4042 4045
···
4056 4053
4057 4054          new_folio->mapping = NULL;
4058 4055          init_new_hugetlb_folio(dst, new_folio);
4056 +            /* Copy the CMA flag so that it is freed correctly */
4057 +            if (cma)
4058 +                folio_set_hugetlb_cma(new_folio);
4059 4059          list_add(&new_folio->lru, &dst_list);
4060 4060      }
4061 4061  }
+11 -16
mm/internal.h
···
248 248                          pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
249 249                          bool *any_writable, bool *any_young, bool *any_dirty)
250 250  {
251 -        unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
252 -        const pte_t *end_ptep = start_ptep + max_nr;
253 251      pte_t expected_pte, *ptep;
254 252      bool writable, young, dirty;
255 -        int nr;
253 +        int nr, cur_nr;
256 254
257 255      if (any_writable)
258 256          *any_writable = false;
···
263 265      VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
264 266      VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);
265 267
268 +        /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */
269 +        max_nr = min_t(unsigned long, max_nr,
270 +                       folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte));
271 +
266 272      nr = pte_batch_hint(start_ptep, pte);
267 273      expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
268 274      ptep = start_ptep + nr;
269 275
270 -        while (ptep < end_ptep) {
276 +        while (nr < max_nr) {
271 277          pte = ptep_get(ptep);
272 278          if (any_writable)
273 279              writable = !!pte_write(pte);
···
284 282          if (!pte_same(pte, expected_pte))
285 283              break;
286 284
287 -            /*
288 -             * Stop immediately once we reached the end of the folio. In
289 -             * corner cases the next PFN might fall into a different
290 -             * folio.
291 -             */
292 -            if (pte_pfn(pte) >= folio_end_pfn)
293 -                break;
294 -
295 285          if (any_writable)
296 286              *any_writable |= writable;
297 287          if (any_young)
···
291 297          if (any_dirty)
292 298              *any_dirty |= dirty;
293 299
294 -            nr = pte_batch_hint(ptep, pte);
295 -            expected_pte = pte_advance_pfn(expected_pte, nr);
296 -            ptep += nr;
300 +            cur_nr = pte_batch_hint(ptep, pte);
301 +            expected_pte = pte_advance_pfn(expected_pte, cur_nr);
302 +            ptep += cur_nr;
303 +            nr += cur_nr;
297 304      }
298 305
299 -        return min(ptep - start_ptep, max_nr);
306 +        return min(nr, max_nr);
300 307  }
301 308
302 309  /**
+16 -7
mm/swapfile.c
···
1272 1272      VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
1273 1273      VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
1274 1274
1275 -        /*
1276 -         * Should not even be attempting large allocations when huge
1277 -         * page swap is disabled. Warn and fail the allocation.
1278 -         */
1279 -        if (order && (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER)) {
1280 -            VM_WARN_ON_ONCE(1);
1281 -            return -EINVAL;
1275 +        if (order) {
1276 +            /*
1277 +             * Reject large allocation when THP_SWAP is disabled,
1278 +             * the caller should split the folio and try again.
1279 +             */
1280 +            if (!IS_ENABLED(CONFIG_THP_SWAP))
1281 +                return -EAGAIN;
1282 +
1283 +            /*
1284 +             * Allocation size should never exceed cluster size
1285 +             * (HPAGE_PMD_SIZE).
1286 +             */
1287 +            if (size > SWAPFILE_CLUSTER) {
1288 +                VM_WARN_ON_ONCE(1);
1289 +                return -EINVAL;
1290 +            }
1282 1291      }
1283 1292
1284 1293      local_lock(&percpu_swap_cluster.lock);
+24 -7
mm/vmalloc.c
···
1940 1940  {
1941 1941      vm->flags = flags;
1942 1942      vm->addr = (void *)va->va_start;
1943 -        vm->size = va_size(va);
1943 +        vm->size = vm->requested_size = va_size(va);
1944 1944      vm->caller = caller;
1945 1945      va->vm = vm;
1946 1946  }
···
3133 3133
3134 3134      area->flags = flags;
3135 3135      area->caller = caller;
3136 +        area->requested_size = requested_size;
3136 3137
3137 3138      va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
3138 3139      if (IS_ERR(va)) {
···
4064 4063   */
4065 4064  void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
4066 4065  {
4066 +        struct vm_struct *vm = NULL;
4067 +        size_t alloced_size = 0;
4067 4068      size_t old_size = 0;
4068 4069      void *n;
4069 4070
···
4075 4072      }
4076 4073
4077 4074      if (p) {
4078 -            struct vm_struct *vm;
4079 -
4080 4075          vm = find_vm_area(p);
4081 4076          if (unlikely(!vm)) {
4082 4077              WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
4083 4078              return NULL;
4084 4079          }
4085 4080
4086 -            old_size = get_vm_area_size(vm);
4081 +            alloced_size = get_vm_area_size(vm);
4082 +            old_size = vm->requested_size;
4083 +            if (WARN(alloced_size < old_size,
4084 +                     "vrealloc() has mismatched area vs requested sizes (%p)\n", p))
4085 +                return NULL;
4087 4086      }
4088 4087
4089 4088      /*
···
4093 4088       * would be a good heuristic for when to shrink the vm_area?
4094 4089       */
4095 4090      if (size <= old_size) {
4096 -            /* Zero out spare memory. */
4097 -            if (want_init_on_alloc(flags))
4091 +            /* Zero out "freed" memory. */
4092 +            if (want_init_on_free())
4098 4093              memset((void *)p + size, 0, old_size - size);
4094 +            vm->requested_size = size;
4099 4095          kasan_poison_vmalloc(p + size, old_size - size);
4100 -            kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL);
4101 4096          return (void *)p;
4097 +        }
4098 +
4099 +        /*
4100 +         * We already have the bytes available in the allocation; use them.
4101 +         */
4102 +        if (size <= alloced_size) {
4103 +            kasan_unpoison_vmalloc(p + old_size, size - old_size,
4104 +                                   KASAN_VMALLOC_PROT_NORMAL);
4105 +            /* Zero out "alloced" memory. */
4106 +            if (want_init_on_alloc(flags))
4107 +                memset((void *)p + old_size, 0, size - old_size);
4108 +            vm->requested_size = size;
4102 4109      }
4103 4110
4104 4111      /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
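Sketched from a caller's point of view, the point of tracking requested_size is that shrinking keeps the poisoned tail accounted for, and growing back within the space that was originally allocated is now served in place with the re-exposed bytes unpoisoned (and zeroed when the gfp flags ask for it). The snippet below is a hedged illustration only, not code from the patch; the sizes are made up and error handling is omitted:

  /* Illustrative only; vmalloc()/vrealloc()/vfree() are the real APIs. */
  void *buf = vmalloc(2 * PAGE_SIZE);            /* area backs two pages      */

  buf = vrealloc(buf, PAGE_SIZE, GFP_KERNEL);    /* shrink: tail is poisoned  */
  buf = vrealloc(buf, 2 * PAGE_SIZE,             /* grow back in place: tail  */
                 GFP_KERNEL | __GFP_ZERO);       /* is unpoisoned and zeroed  */
  vfree(buf);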
+14 -5
tools/testing/selftests/mm/compaction_test.c
···
90 90       int compaction_index = 0;
91 91       char nr_hugepages[20] = {0};
92 92       char init_nr_hugepages[24] = {0};
93 +         char target_nr_hugepages[24] = {0};
94 +         int slen;
93 95
94 96       snprintf(init_nr_hugepages, sizeof(init_nr_hugepages),
95 97                "%lu", initial_nr_hugepages);
···
108 106          goto out;
109 107      }
110 108
111 -        /* Request a large number of huge pages. The Kernel will allocate
112 -           as much as it can */
113 -        if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
114 -            ksft_print_msg("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n",
115 -                           strerror(errno));
109 +        /*
110 +         * Request huge pages for about half of the free memory. The Kernel
111 +         * will allocate as much as it can, and we expect it will get at least 1/3
112 +         */
113 +        nr_hugepages_ul = mem_free / hugepage_size / 2;
114 +        snprintf(target_nr_hugepages, sizeof(target_nr_hugepages),
115 +                 "%lu", nr_hugepages_ul);
116 +
117 +        slen = strlen(target_nr_hugepages);
118 +        if (write(fd, target_nr_hugepages, slen) != slen) {
119 +            ksft_print_msg("Failed to write %lu to /proc/sys/vm/nr_hugepages: %s\n",
120 +                           nr_hugepages_ul, strerror(errno));
116 121          goto close_fd;
117 122      }
118 123
+10 -6
tools/testing/selftests/mm/guard-regions.c
···
271 271      self->page_size = (unsigned long)sysconf(_SC_PAGESIZE);
272 272      setup_sighandler();
273 273
274 -        if (variant->backing == ANON_BACKED)
274 +        switch (variant->backing) {
275 +        case ANON_BACKED:
275 276          return;
276 -
277 -        self->fd = open_file(
278 -            variant->backing == SHMEM_BACKED ? "/tmp/" : "",
279 -            self->path);
277 +        case LOCAL_FILE_BACKED:
278 +            self->fd = open_file("", self->path);
279 +            break;
280 +        case SHMEM_BACKED:
281 +            self->fd = memfd_create(self->path, 0);
282 +            break;
283 +        }
280 284
281 285      /* We truncate file to at least 100 pages, tests can modify as needed. */
282 286      ASSERT_EQ(ftruncate(self->fd, 100 * self->page_size), 0);
···
1700 1696      char *ptr;
1701 1697      int i;
1702 1698
1703 -        if (variant->backing == ANON_BACKED)
1699 +        if (variant->backing != LOCAL_FILE_BACKED)
1704 1700          SKIP(return, "Read-only test specific to file-backed");
1705 1701
1706 1702      /* Map shared so we can populate with pattern, populate it, unmap. */
+13 -1
tools/testing/selftests/mm/pkey-powerpc.h
···
3 3    #ifndef _PKEYS_POWERPC_H
4 4    #define _PKEYS_POWERPC_H
5 5
6 +    #include <sys/stat.h>
7 +
6 8    #ifndef SYS_pkey_alloc
7 9    # define SYS_pkey_alloc  384
8 10   # define SYS_pkey_free   385
···
104 102      return;
105 103  }
106 104
105 +    #define REPEAT_8(s) s s s s s s s s
106 +    #define REPEAT_64(s)    REPEAT_8(s) REPEAT_8(s) REPEAT_8(s) REPEAT_8(s) \
107 +                            REPEAT_8(s) REPEAT_8(s) REPEAT_8(s) REPEAT_8(s)
108 +    #define REPEAT_512(s)   REPEAT_64(s) REPEAT_64(s) REPEAT_64(s) REPEAT_64(s) \
109 +                            REPEAT_64(s) REPEAT_64(s) REPEAT_64(s) REPEAT_64(s)
110 +    #define REPEAT_4096(s)  REPEAT_512(s) REPEAT_512(s) REPEAT_512(s) REPEAT_512(s) \
111 +                            REPEAT_512(s) REPEAT_512(s) REPEAT_512(s) REPEAT_512(s)
112 +    #define REPEAT_16384(s) REPEAT_4096(s) REPEAT_4096(s) \
113 +                            REPEAT_4096(s) REPEAT_4096(s)
114 +
107 115  /* 4-byte instructions * 16384 = 64K page */
108 -    #define __page_o_noops() asm(".rept 16384 ; nop; .endr")
116 +    #define __page_o_noops() asm(REPEAT_16384("nop\n"))
109 117
110 118  static inline void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
111 119  {
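The REPEAT_16384() chain above builds the page of nops through plain preprocessor expansion instead of the assembler-level .rept loop, so the construct no longer depends on directive handling inside the inline-asm string. A tiny standalone illustration of the same trick follows; the macro names here are hypothetical and it is not taken from the selftest:

  /* Expands to sixteen nop instructions at preprocessing time; the adjacent
   * string literals are concatenated into a single asm template. */
  #define REP4(s)  s s s s
  #define REP16(s) REP4(s) REP4(s) REP4(s) REP4(s)

  static inline void sixteen_nops(void)
  {
          asm volatile(REP16("nop\n"));
  }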
+1
tools/testing/selftests/mm/pkey_util.c
···
1 1  // SPDX-License-Identifier: GPL-2.0-only
2 +  #define __SANE_USERSPACE_TYPES__
2 3  #include <sys/syscall.h>
3 4  #include <unistd.h>
4 5
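On why a single define is enough here: without __SANE_USERSPACE_TYPES__, 64-bit powerpc userspace gets __u64 defined as unsigned long rather than unsigned long long, which is what typically makes "%ll"-style printf formats in the shared pkey selftest headers fail to compile on that architecture. Defining the macro before any include switches every architecture to the long-long flavour. A hedged standalone illustration, not taken from the selftest:

  /* Illustrative only. With the define, __u64 is unsigned long long on all
   * architectures, so the "%llx" format is portable; without it, ppc64
   * userspace would see __u64 as unsigned long and warn under -Wformat. */
  #define __SANE_USERSPACE_TYPES__
  #include <linux/types.h>
  #include <stdio.h>

  int main(void)
  {
          __u64 val = 0x1234;

          printf("%llx\n", val);
          return 0;
  }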