Merge tag 'bpf_try_alloc_pages' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Pull bpf try_alloc_pages() support from Alexei Starovoitov:
"The pull includes work from Sebastian, Vlastimil and myself with a lot
of help from Michal and Shakeel.

This is a first step towards making kmalloc reentrant to get rid of
slab wrappers: bpf_mem_alloc, kretprobe's objpool, etc. These patches
make the page allocator safe to call from any context.

Vlastimil kicked off this effort at LSFMM 2024:

https://lwn.net/Articles/974138/

and we continued at LSFMM 2025:

https://lore.kernel.org/all/CAADnVQKfkGxudNUkcPJgwe3nTZ=xohnRshx9kLZBTmR_E1DFEg@mail.gmail.com/

Why:

SLAB wrappers bind memory to a particular subsystem, making it
unavailable to the rest of the kernel. Some BPF maps in production
consume gigabytes of preallocated memory. The top 5 at Meta: 1.5G,
1.2G, 1.1G, 300M, 200M. Once we have a kmalloc that works in any
context, BPF map preallocation won't be necessary.

How:

The synchronous kmalloc/page allocation stack has multiple stages,
going from fast to slow: cmpxchg16 -> slab_alloc -> new_slab ->
alloc_pages -> rmqueue_pcplist -> __rmqueue, where rmqueue_pcplist
was already relying on a trylock.

This set changes rmqueue_bulk/rmqueue_buddy to attempt a trylock and
return -ENOMEM if alloc_flags & ALLOC_TRYLOCK is set. It then wraps
this functionality into the try_alloc_pages() helper and makes sure
that the logic is sane under PREEMPT_RT.
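
For orientation, the core of that change (simplified from the
mm/page_alloc.c hunk at the end of this log; not a complete
function) looks roughly like this:

    if (!spin_trylock_irqsave(&zone->lock, flags)) {
            /* Opportunistic callers must never spin on zone->lock. */
            if (unlikely(alloc_flags & ALLOC_TRYLOCK))
                    return 0;  /* rmqueue_buddy() returns NULL instead */
            /* Regular allocations keep the old spinning behaviour. */
            spin_lock_irqsave(&zone->lock, flags);
    }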

End result: try_alloc_pages()/free_pages_nolock() are safe to call
from any context.
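
As an illustration only, a caller in an arbitrary context could use
the pair roughly like this (grab_scratch_page()/release_scratch_page()
are hypothetical names, not part of this series):

    /* Opportunistically grab one zeroed page; may be called from NMI,
     * hard IRQ or while holding a raw_spin_lock. Can easily fail, so
     * callers must tolerate NULL. */
    static struct page *grab_scratch_page(int nid)
    {
            return try_alloc_pages(nid, 0);
    }

    static void release_scratch_page(struct page *page)
    {
            if (page)
                    /* Never spins; defers the page to a per-zone llist
                     * when zone->lock cannot be taken right now. */
                    free_pages_nolock(page, 0);
    }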

A try_kmalloc() for any context, based on a similar trylock approach,
will follow. It will use try_alloc_pages() when the slab needs a new
page. Though such a try_kmalloc()/page_alloc() is an opportunistic
allocator, this design ensures that the probability of successfully
allocating small objects (up to one page in size) remains high.

Even before we have try_kmalloc(), we already use try_alloc_pages()
in the BPF arena implementation, and it's going to be used more
extensively in BPF"

* tag 'bpf_try_alloc_pages' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
mm: Fix the flipped condition in gfpflags_allow_spinning()
bpf: Use try_alloc_pages() to allocate pages for bpf needs.
mm, bpf: Use memcg in try_alloc_pages().
memcg: Use trylock to access memcg stock_lock.
mm, bpf: Introduce free_pages_nolock()
mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation
locking/local_lock: Introduce localtry_lock_t

+511 -44
+1 -1
include/linux/bpf.h
··· 2385 2385 struct bpf_map *bpf_map_get_curr_or_next(u32 *id); 2386 2386 struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); 2387 2387 2388 - int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, 2388 + int bpf_map_alloc_pages(const struct bpf_map *map, int nid, 2389 2389 unsigned long nr_pages, struct page **page_array); 2390 2390 #ifdef CONFIG_MEMCG 2391 2391 void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
+23
include/linux/gfp.h
··· 39 39 return !!(gfp_flags & __GFP_DIRECT_RECLAIM); 40 40 } 41 41 42 + static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags) 43 + { 44 + /* 45 + * !__GFP_DIRECT_RECLAIM -> direct claim is not allowed. 46 + * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd. 47 + * All GFP_* flags including GFP_NOWAIT use one or both flags. 48 + * try_alloc_pages() is the only API that doesn't specify either flag. 49 + * 50 + * This is stronger than GFP_NOWAIT or GFP_ATOMIC because 51 + * those are guaranteed to never block on a sleeping lock. 52 + * Here we are enforcing that the allocation doesn't ever spin 53 + * on any locks (i.e. only trylocks). There is no high level 54 + * GFP_$FOO flag for this use in try_alloc_pages() as the 55 + * regular page allocator doesn't fully support this 56 + * allocation mode. 57 + */ 58 + return !!(gfp_flags & __GFP_RECLAIM); 59 + } 60 + 42 61 #ifdef CONFIG_HIGHMEM 43 62 #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM 44 63 #else ··· 354 335 } 355 336 #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) 356 337 338 + struct page *try_alloc_pages_noprof(int nid, unsigned int order); 339 + #define try_alloc_pages(...) alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__)) 340 + 357 341 extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); 358 342 #define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) 359 343 ··· 379 357 __get_free_pages((gfp_mask) | GFP_DMA, (order)) 380 358 381 359 extern void __free_pages(struct page *page, unsigned int order); 360 + extern void free_pages_nolock(struct page *page, unsigned int order); 382 361 extern void free_pages(unsigned long addr, unsigned int order); 383 362 384 363 #define __free_page(page) __free_pages((page), 0)
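
An editor's usage sketch of the new helper (the lock and function
names here are illustrative; the same shape appears in the
mm/memcontrol.c hunk further down): a lock-taking path on the
allocation side consults gfpflags_allow_spinning() to decide whether
falling back to a spinning lock is permitted.

    static DEFINE_SPINLOCK(example_lock);

    static bool example_fast_path(gfp_t gfp_mask)
    {
            unsigned long flags;

            if (!spin_trylock_irqsave(&example_lock, flags)) {
                    /* A trylock-only allocation must not spin here. */
                    if (!gfpflags_allow_spinning(gfp_mask))
                            return false;
                    spin_lock_irqsave(&example_lock, flags);
            }
            /* ... fast-path work under example_lock ... */
            spin_unlock_irqrestore(&example_lock, flags);
            return true;
    }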
+70
include/linux/local_lock.h
··· 51 51 #define local_unlock_irqrestore(lock, flags) \ 52 52 __local_unlock_irqrestore(lock, flags) 53 53 54 + /** 55 + * localtry_lock_init - Runtime initialize a lock instance 56 + */ 57 + #define localtry_lock_init(lock) __localtry_lock_init(lock) 58 + 59 + /** 60 + * localtry_lock - Acquire a per CPU local lock 61 + * @lock: The lock variable 62 + */ 63 + #define localtry_lock(lock) __localtry_lock(lock) 64 + 65 + /** 66 + * localtry_lock_irq - Acquire a per CPU local lock and disable interrupts 67 + * @lock: The lock variable 68 + */ 69 + #define localtry_lock_irq(lock) __localtry_lock_irq(lock) 70 + 71 + /** 72 + * localtry_lock_irqsave - Acquire a per CPU local lock, save and disable 73 + * interrupts 74 + * @lock: The lock variable 75 + * @flags: Storage for interrupt flags 76 + */ 77 + #define localtry_lock_irqsave(lock, flags) \ 78 + __localtry_lock_irqsave(lock, flags) 79 + 80 + /** 81 + * localtry_trylock - Try to acquire a per CPU local lock. 82 + * @lock: The lock variable 83 + * 84 + * The function can be used in any context such as NMI or HARDIRQ. Due to 85 + * locking constrains it will _always_ fail to acquire the lock in NMI or 86 + * HARDIRQ context on PREEMPT_RT. 87 + */ 88 + #define localtry_trylock(lock) __localtry_trylock(lock) 89 + 90 + /** 91 + * localtry_trylock_irqsave - Try to acquire a per CPU local lock, save and disable 92 + * interrupts if acquired 93 + * @lock: The lock variable 94 + * @flags: Storage for interrupt flags 95 + * 96 + * The function can be used in any context such as NMI or HARDIRQ. Due to 97 + * locking constrains it will _always_ fail to acquire the lock in NMI or 98 + * HARDIRQ context on PREEMPT_RT. 99 + */ 100 + #define localtry_trylock_irqsave(lock, flags) \ 101 + __localtry_trylock_irqsave(lock, flags) 102 + 103 + /** 104 + * local_unlock - Release a per CPU local lock 105 + * @lock: The lock variable 106 + */ 107 + #define localtry_unlock(lock) __localtry_unlock(lock) 108 + 109 + /** 110 + * local_unlock_irq - Release a per CPU local lock and enable interrupts 111 + * @lock: The lock variable 112 + */ 113 + #define localtry_unlock_irq(lock) __localtry_unlock_irq(lock) 114 + 115 + /** 116 + * localtry_unlock_irqrestore - Release a per CPU local lock and restore 117 + * interrupt flags 118 + * @lock: The lock variable 119 + * @flags: Interrupt flags to restore 120 + */ 121 + #define localtry_unlock_irqrestore(lock, flags) \ 122 + __localtry_unlock_irqrestore(lock, flags) 123 + 54 124 DEFINE_GUARD(local_lock, local_lock_t __percpu*, 55 125 local_lock(_T), 56 126 local_unlock(_T))
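
A minimal usage sketch of the new lock type (everything other than
the localtry_* API, DEFINE_PER_CPU and this_cpu_ptr() is a
hypothetical example):

    struct my_pcp_cache {
            localtry_lock_t lock;
            unsigned int cached;
    };

    static DEFINE_PER_CPU(struct my_pcp_cache, my_cache) = {
            .lock = INIT_LOCALTRY_LOCK(lock),
    };

    static bool my_cache_try_consume(void)
    {
            struct my_pcp_cache *c;
            unsigned long flags;
            bool ret = false;

            /* Usable from any context; on PREEMPT_RT it simply fails
             * in NMI/HARDIRQ instead of touching a sleeping lock. */
            if (!localtry_trylock_irqsave(&my_cache.lock, flags))
                    return false;

            c = this_cpu_ptr(&my_cache);
            if (c->cached) {
                    c->cached--;
                    ret = true;
            }
            localtry_unlock_irqrestore(&my_cache.lock, flags);
            return ret;
    }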
+146
include/linux/local_lock_internal.h
··· 15 15 #endif 16 16 } local_lock_t; 17 17 18 + typedef struct { 19 + local_lock_t llock; 20 + unsigned int acquired; 21 + } localtry_lock_t; 22 + 18 23 #ifdef CONFIG_DEBUG_LOCK_ALLOC 19 24 # define LOCAL_LOCK_DEBUG_INIT(lockname) \ 20 25 .dep_map = { \ ··· 32 27 static inline void local_lock_acquire(local_lock_t *l) 33 28 { 34 29 lock_map_acquire(&l->dep_map); 30 + DEBUG_LOCKS_WARN_ON(l->owner); 31 + l->owner = current; 32 + } 33 + 34 + static inline void local_trylock_acquire(local_lock_t *l) 35 + { 36 + lock_map_acquire_try(&l->dep_map); 35 37 DEBUG_LOCKS_WARN_ON(l->owner); 36 38 l->owner = current; 37 39 } ··· 57 45 #else /* CONFIG_DEBUG_LOCK_ALLOC */ 58 46 # define LOCAL_LOCK_DEBUG_INIT(lockname) 59 47 static inline void local_lock_acquire(local_lock_t *l) { } 48 + static inline void local_trylock_acquire(local_lock_t *l) { } 60 49 static inline void local_lock_release(local_lock_t *l) { } 61 50 static inline void local_lock_debug_init(local_lock_t *l) { } 62 51 #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ 63 52 64 53 #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } 54 + #define INIT_LOCALTRY_LOCK(lockname) { .llock = { LOCAL_LOCK_DEBUG_INIT(lockname.llock) }} 65 55 66 56 #define __local_lock_init(lock) \ 67 57 do { \ ··· 132 118 #define __local_unlock_nested_bh(lock) \ 133 119 local_lock_release(this_cpu_ptr(lock)) 134 120 121 + /* localtry_lock_t variants */ 122 + 123 + #define __localtry_lock_init(lock) \ 124 + do { \ 125 + __local_lock_init(&(lock)->llock); \ 126 + WRITE_ONCE((lock)->acquired, 0); \ 127 + } while (0) 128 + 129 + #define __localtry_lock(lock) \ 130 + do { \ 131 + localtry_lock_t *lt; \ 132 + preempt_disable(); \ 133 + lt = this_cpu_ptr(lock); \ 134 + local_lock_acquire(&lt->llock); \ 135 + WRITE_ONCE(lt->acquired, 1); \ 136 + } while (0) 137 + 138 + #define __localtry_lock_irq(lock) \ 139 + do { \ 140 + localtry_lock_t *lt; \ 141 + local_irq_disable(); \ 142 + lt = this_cpu_ptr(lock); \ 143 + local_lock_acquire(&lt->llock); \ 144 + WRITE_ONCE(lt->acquired, 1); \ 145 + } while (0) 146 + 147 + #define __localtry_lock_irqsave(lock, flags) \ 148 + do { \ 149 + localtry_lock_t *lt; \ 150 + local_irq_save(flags); \ 151 + lt = this_cpu_ptr(lock); \ 152 + local_lock_acquire(&lt->llock); \ 153 + WRITE_ONCE(lt->acquired, 1); \ 154 + } while (0) 155 + 156 + #define __localtry_trylock(lock) \ 157 + ({ \ 158 + localtry_lock_t *lt; \ 159 + bool _ret; \ 160 + \ 161 + preempt_disable(); \ 162 + lt = this_cpu_ptr(lock); \ 163 + if (!READ_ONCE(lt->acquired)) { \ 164 + WRITE_ONCE(lt->acquired, 1); \ 165 + local_trylock_acquire(&lt->llock); \ 166 + _ret = true; \ 167 + } else { \ 168 + _ret = false; \ 169 + preempt_enable(); \ 170 + } \ 171 + _ret; \ 172 + }) 173 + 174 + #define __localtry_trylock_irqsave(lock, flags) \ 175 + ({ \ 176 + localtry_lock_t *lt; \ 177 + bool _ret; \ 178 + \ 179 + local_irq_save(flags); \ 180 + lt = this_cpu_ptr(lock); \ 181 + if (!READ_ONCE(lt->acquired)) { \ 182 + WRITE_ONCE(lt->acquired, 1); \ 183 + local_trylock_acquire(&lt->llock); \ 184 + _ret = true; \ 185 + } else { \ 186 + _ret = false; \ 187 + local_irq_restore(flags); \ 188 + } \ 189 + _ret; \ 190 + }) 191 + 192 + #define __localtry_unlock(lock) \ 193 + do { \ 194 + localtry_lock_t *lt; \ 195 + lt = this_cpu_ptr(lock); \ 196 + WRITE_ONCE(lt->acquired, 0); \ 197 + local_lock_release(&lt->llock); \ 198 + preempt_enable(); \ 199 + } while (0) 200 + 201 + #define __localtry_unlock_irq(lock) \ 202 + do { \ 203 + localtry_lock_t *lt; \ 204 + lt = this_cpu_ptr(lock); \ 205 + 
WRITE_ONCE(lt->acquired, 0); \ 206 + local_lock_release(&lt->llock); \ 207 + local_irq_enable(); \ 208 + } while (0) 209 + 210 + #define __localtry_unlock_irqrestore(lock, flags) \ 211 + do { \ 212 + localtry_lock_t *lt; \ 213 + lt = this_cpu_ptr(lock); \ 214 + WRITE_ONCE(lt->acquired, 0); \ 215 + local_lock_release(&lt->llock); \ 216 + local_irq_restore(flags); \ 217 + } while (0) 218 + 135 219 #else /* !CONFIG_PREEMPT_RT */ 136 220 137 221 /* ··· 237 125 * critical section while staying preemptible. 238 126 */ 239 127 typedef spinlock_t local_lock_t; 128 + typedef spinlock_t localtry_lock_t; 240 129 241 130 #define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) 131 + #define INIT_LOCALTRY_LOCK(lockname) INIT_LOCAL_LOCK(lockname) 242 132 243 133 #define __local_lock_init(l) \ 244 134 do { \ ··· 282 168 do { \ 283 169 spin_unlock(this_cpu_ptr((lock))); \ 284 170 } while (0) 171 + 172 + /* localtry_lock_t variants */ 173 + 174 + #define __localtry_lock_init(lock) __local_lock_init(lock) 175 + #define __localtry_lock(lock) __local_lock(lock) 176 + #define __localtry_lock_irq(lock) __local_lock(lock) 177 + #define __localtry_lock_irqsave(lock, flags) __local_lock_irqsave(lock, flags) 178 + #define __localtry_unlock(lock) __local_unlock(lock) 179 + #define __localtry_unlock_irq(lock) __local_unlock(lock) 180 + #define __localtry_unlock_irqrestore(lock, flags) __local_unlock_irqrestore(lock, flags) 181 + 182 + #define __localtry_trylock(lock) \ 183 + ({ \ 184 + int __locked; \ 185 + \ 186 + if (in_nmi() | in_hardirq()) { \ 187 + __locked = 0; \ 188 + } else { \ 189 + migrate_disable(); \ 190 + __locked = spin_trylock(this_cpu_ptr((lock))); \ 191 + if (!__locked) \ 192 + migrate_enable(); \ 193 + } \ 194 + __locked; \ 195 + }) 196 + 197 + #define __localtry_trylock_irqsave(lock, flags) \ 198 + ({ \ 199 + typecheck(unsigned long, flags); \ 200 + flags = 0; \ 201 + __localtry_trylock(lock); \ 202 + }) 285 203 286 204 #endif /* CONFIG_PREEMPT_RT */
+4
include/linux/mm_types.h
··· 99 99 /* Or, free page */ 100 100 struct list_head buddy_list; 101 101 struct list_head pcp_list; 102 + struct { 103 + struct llist_node pcp_llist; 104 + unsigned int order; 105 + }; 102 106 }; 103 107 /* See page-flags.h for PAGE_MAPPING_FLAGS */ 104 108 struct address_space *mapping;
+3
include/linux/mmzone.h
··· 972 972 /* Primarily protects free_area */ 973 973 spinlock_t lock; 974 974 975 + /* Pages to be freed when next trylock succeeds */ 976 + struct llist_head trylock_free_pages; 977 + 975 978 /* Write-intensive fields used by compaction and vmstats. */ 976 979 CACHELINE_PADDING(_pad2_); 977 980
+2 -3
kernel/bpf/arena.c
··· 287 287 return VM_FAULT_SIGSEGV; 288 288 289 289 /* Account into memcg of the process that created bpf_arena */ 290 - ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); 290 + ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); 291 291 if (ret) { 292 292 range_tree_set(&arena->rt, vmf->pgoff, 1); 293 293 return VM_FAULT_SIGSEGV; ··· 465 465 if (ret) 466 466 goto out_free_pages; 467 467 468 - ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO, 469 - node_id, page_cnt, pages); 468 + ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages); 470 469 if (ret) 471 470 goto out; 472 471
+20 -3
kernel/bpf/syscall.c
··· 569 569 } 570 570 #endif 571 571 572 - int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, 572 + static bool can_alloc_pages(void) 573 + { 574 + return preempt_count() == 0 && !irqs_disabled() && 575 + !IS_ENABLED(CONFIG_PREEMPT_RT); 576 + } 577 + 578 + static struct page *__bpf_alloc_page(int nid) 579 + { 580 + if (!can_alloc_pages()) 581 + return try_alloc_pages(nid, 0); 582 + 583 + return alloc_pages_node(nid, 584 + GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT 585 + | __GFP_NOWARN, 586 + 0); 587 + } 588 + 589 + int bpf_map_alloc_pages(const struct bpf_map *map, int nid, 573 590 unsigned long nr_pages, struct page **pages) 574 591 { 575 592 unsigned long i, j; ··· 599 582 old_memcg = set_active_memcg(memcg); 600 583 #endif 601 584 for (i = 0; i < nr_pages; i++) { 602 - pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0); 585 + pg = __bpf_alloc_page(nid); 603 586 604 587 if (pg) { 605 588 pages[i] = pg; 606 589 continue; 607 590 } 608 591 for (j = 0; j < i; j++) 609 - __free_page(pages[j]); 592 + free_pages_nolock(pages[j], 0); 610 593 ret = -ENOMEM; 611 594 break; 612 595 }
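
Condensed from the kernel/bpf/syscall.c hunk above, with added
comments (the reasoning in the comments is the editor's reading of
the change):

    static bool can_alloc_pages(void)
    {
            /* GFP_KERNEL is only used when the context can sleep; on
             * PREEMPT_RT these checks don't tell the whole story, so
             * the opportunistic path is taken there unconditionally. */
            return preempt_count() == 0 && !irqs_disabled() &&
                   !IS_ENABLED(CONFIG_PREEMPT_RT);
    }

    static struct page *__bpf_alloc_page(int nid)
    {
            if (!can_alloc_pages())
                    return try_alloc_pages(nid, 0);

            return alloc_pages_node(nid,
                            GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
                            | __GFP_NOWARN,
                            0);
    }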
+7 -3
lib/stackdepot.c
··· 591 591 depot_stack_handle_t handle = 0; 592 592 struct page *page = NULL; 593 593 void *prealloc = NULL; 594 - bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC; 594 + bool allow_spin = gfpflags_allow_spinning(alloc_flags); 595 + bool can_alloc = (depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC) && allow_spin; 595 596 unsigned long flags; 596 597 u32 hash; 597 598 ··· 631 630 prealloc = page_address(page); 632 631 } 633 632 634 - if (in_nmi()) { 633 + if (in_nmi() || !allow_spin) { 635 634 /* We can never allocate in NMI context. */ 636 635 WARN_ON_ONCE(can_alloc); 637 636 /* Best effort; bail if we fail to take the lock. */ ··· 672 671 exit: 673 672 if (prealloc) { 674 673 /* Stack depot didn't use this memory, free it. */ 675 - free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER); 674 + if (!allow_spin) 675 + free_pages_nolock(virt_to_page(prealloc), DEPOT_POOL_ORDER); 676 + else 677 + free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER); 676 678 } 677 679 if (found) 678 680 handle = found->handle.handle;
+1
mm/internal.h
··· 1198 1198 #define ALLOC_NOFRAGMENT 0x0 1199 1199 #endif 1200 1200 #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ 1201 + #define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */ 1201 1202 #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ 1202 1203 1203 1204 /* Flags that allow allocations below the min watermark. */
+39 -18
mm/memcontrol.c
··· 1739 1739 } 1740 1740 1741 1741 struct memcg_stock_pcp { 1742 - local_lock_t stock_lock; 1742 + localtry_lock_t stock_lock; 1743 1743 struct mem_cgroup *cached; /* this never be root cgroup */ 1744 1744 unsigned int nr_pages; 1745 1745 ··· 1754 1754 #define FLUSHING_CACHED_CHARGE 0 1755 1755 }; 1756 1756 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { 1757 - .stock_lock = INIT_LOCAL_LOCK(stock_lock), 1757 + .stock_lock = INIT_LOCALTRY_LOCK(stock_lock), 1758 1758 }; 1759 1759 static DEFINE_MUTEX(percpu_charge_mutex); 1760 1760 ··· 1766 1766 * consume_stock: Try to consume stocked charge on this cpu. 1767 1767 * @memcg: memcg to consume from. 1768 1768 * @nr_pages: how many pages to charge. 1769 + * @gfp_mask: allocation mask. 1769 1770 * 1770 1771 * The charges will only happen if @memcg matches the current cpu's memcg 1771 1772 * stock, and at least @nr_pages are available in that stock. Failure to ··· 1774 1773 * 1775 1774 * returns true if successful, false otherwise. 1776 1775 */ 1777 - static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 1776 + static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages, 1777 + gfp_t gfp_mask) 1778 1778 { 1779 1779 struct memcg_stock_pcp *stock; 1780 1780 unsigned int stock_pages; ··· 1785 1783 if (nr_pages > MEMCG_CHARGE_BATCH) 1786 1784 return ret; 1787 1785 1788 - local_lock_irqsave(&memcg_stock.stock_lock, flags); 1786 + if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) { 1787 + if (!gfpflags_allow_spinning(gfp_mask)) 1788 + return ret; 1789 + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); 1790 + } 1789 1791 1790 1792 stock = this_cpu_ptr(&memcg_stock); 1791 1793 stock_pages = READ_ONCE(stock->nr_pages); ··· 1798 1792 ret = true; 1799 1793 } 1800 1794 1801 - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 1795 + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); 1802 1796 1803 1797 return ret; 1804 1798 } ··· 1837 1831 * drain_stock races is that we always operate on local CPU stock 1838 1832 * here with IRQ disabled 1839 1833 */ 1840 - local_lock_irqsave(&memcg_stock.stock_lock, flags); 1834 + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); 1841 1835 1842 1836 stock = this_cpu_ptr(&memcg_stock); 1843 1837 old = drain_obj_stock(stock); 1844 1838 drain_stock(stock); 1845 1839 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 1846 1840 1847 - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 1841 + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); 1848 1842 obj_cgroup_put(old); 1849 1843 } 1850 1844 ··· 1874 1868 { 1875 1869 unsigned long flags; 1876 1870 1877 - local_lock_irqsave(&memcg_stock.stock_lock, flags); 1871 + if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) { 1872 + /* 1873 + * In case of unlikely failure to lock percpu stock_lock 1874 + * uncharge memcg directly. 
1875 + */ 1876 + if (mem_cgroup_is_root(memcg)) 1877 + return; 1878 + page_counter_uncharge(&memcg->memory, nr_pages); 1879 + if (do_memsw_account()) 1880 + page_counter_uncharge(&memcg->memsw, nr_pages); 1881 + return; 1882 + } 1878 1883 __refill_stock(memcg, nr_pages); 1879 - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 1884 + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); 1880 1885 } 1881 1886 1882 1887 /* ··· 1944 1927 stock = &per_cpu(memcg_stock, cpu); 1945 1928 1946 1929 /* drain_obj_stock requires stock_lock */ 1947 - local_lock_irqsave(&memcg_stock.stock_lock, flags); 1930 + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); 1948 1931 old = drain_obj_stock(stock); 1949 - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 1932 + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); 1950 1933 1951 1934 drain_stock(stock); 1952 1935 obj_cgroup_put(old); ··· 2239 2222 unsigned long pflags; 2240 2223 2241 2224 retry: 2242 - if (consume_stock(memcg, nr_pages)) 2225 + if (consume_stock(memcg, nr_pages, gfp_mask)) 2243 2226 return 0; 2227 + 2228 + if (!gfpflags_allow_spinning(gfp_mask)) 2229 + /* Avoid the refill and flush of the older stock */ 2230 + batch = nr_pages; 2244 2231 2245 2232 if (!do_memsw_account() || 2246 2233 page_counter_try_charge(&memcg->memsw, batch, &counter)) { ··· 2729 2708 unsigned long flags; 2730 2709 int *bytes; 2731 2710 2732 - local_lock_irqsave(&memcg_stock.stock_lock, flags); 2711 + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); 2733 2712 stock = this_cpu_ptr(&memcg_stock); 2734 2713 2735 2714 /* ··· 2782 2761 if (nr) 2783 2762 __mod_objcg_mlstate(objcg, pgdat, idx, nr); 2784 2763 2785 - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2764 + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2786 2765 obj_cgroup_put(old); 2787 2766 } 2788 2767 ··· 2792 2771 unsigned long flags; 2793 2772 bool ret = false; 2794 2773 2795 - local_lock_irqsave(&memcg_stock.stock_lock, flags); 2774 + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); 2796 2775 2797 2776 stock = this_cpu_ptr(&memcg_stock); 2798 2777 if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { ··· 2800 2779 ret = true; 2801 2780 } 2802 2781 2803 - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2782 + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2804 2783 2805 2784 return ret; 2806 2785 } ··· 2892 2871 unsigned long flags; 2893 2872 unsigned int nr_pages = 0; 2894 2873 2895 - local_lock_irqsave(&memcg_stock.stock_lock, flags); 2874 + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); 2896 2875 2897 2876 stock = this_cpu_ptr(&memcg_stock); 2898 2877 if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ ··· 2910 2889 stock->nr_bytes &= (PAGE_SIZE - 1); 2911 2890 } 2912 2891 2913 - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2892 + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2914 2893 obj_cgroup_put(old); 2915 2894 2916 2895 if (nr_pages)
+188 -15
mm/page_alloc.c
··· 88 88 */ 89 89 #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) 90 90 91 + /* Free the page without taking locks. Rely on trylock only. */ 92 + #define FPI_TRYLOCK ((__force fpi_t)BIT(2)) 93 + 91 94 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ 92 95 static DEFINE_MUTEX(pcp_batch_high_lock); 93 96 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) ··· 1252 1249 } while (1); 1253 1250 } 1254 1251 1252 + static void add_page_to_zone_llist(struct zone *zone, struct page *page, 1253 + unsigned int order) 1254 + { 1255 + /* Remember the order */ 1256 + page->order = order; 1257 + /* Add the page to the free list */ 1258 + llist_add(&page->pcp_llist, &zone->trylock_free_pages); 1259 + } 1260 + 1255 1261 static void free_one_page(struct zone *zone, struct page *page, 1256 1262 unsigned long pfn, unsigned int order, 1257 1263 fpi_t fpi_flags) 1258 1264 { 1265 + struct llist_head *llhead; 1259 1266 unsigned long flags; 1260 1267 1261 - spin_lock_irqsave(&zone->lock, flags); 1268 + if (!spin_trylock_irqsave(&zone->lock, flags)) { 1269 + if (unlikely(fpi_flags & FPI_TRYLOCK)) { 1270 + add_page_to_zone_llist(zone, page, order); 1271 + return; 1272 + } 1273 + spin_lock_irqsave(&zone->lock, flags); 1274 + } 1275 + 1276 + /* The lock succeeded. Process deferred pages. */ 1277 + llhead = &zone->trylock_free_pages; 1278 + if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) { 1279 + struct llist_node *llnode; 1280 + struct page *p, *tmp; 1281 + 1282 + llnode = llist_del_all(llhead); 1283 + llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) { 1284 + unsigned int p_order = p->order; 1285 + 1286 + split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags); 1287 + __count_vm_events(PGFREE, 1 << p_order); 1288 + } 1289 + } 1262 1290 split_large_buddy(zone, page, pfn, order, fpi_flags); 1263 1291 spin_unlock_irqrestore(&zone->lock, flags); 1264 1292 ··· 2341 2307 unsigned long flags; 2342 2308 int i; 2343 2309 2344 - spin_lock_irqsave(&zone->lock, flags); 2310 + if (!spin_trylock_irqsave(&zone->lock, flags)) { 2311 + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) 2312 + return 0; 2313 + spin_lock_irqsave(&zone->lock, flags); 2314 + } 2345 2315 for (i = 0; i < count; ++i) { 2346 2316 struct page *page = __rmqueue(zone, order, migratetype, 2347 2317 alloc_flags); ··· 2633 2595 2634 2596 static void free_frozen_page_commit(struct zone *zone, 2635 2597 struct per_cpu_pages *pcp, struct page *page, int migratetype, 2636 - unsigned int order) 2598 + unsigned int order, fpi_t fpi_flags) 2637 2599 { 2638 2600 int high, batch; 2639 2601 int pindex; ··· 2668 2630 } 2669 2631 if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX)) 2670 2632 pcp->free_count += (1 << order); 2633 + 2634 + if (unlikely(fpi_flags & FPI_TRYLOCK)) { 2635 + /* 2636 + * Do not attempt to take a zone lock. Let pcp->count get 2637 + * over high mark temporarily. 
2638 + */ 2639 + return; 2640 + } 2671 2641 high = nr_pcp_high(pcp, zone, batch, free_high); 2672 2642 if (pcp->count >= high) { 2673 2643 free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), ··· 2690 2644 /* 2691 2645 * Free a pcp page 2692 2646 */ 2693 - void free_frozen_pages(struct page *page, unsigned int order) 2647 + static void __free_frozen_pages(struct page *page, unsigned int order, 2648 + fpi_t fpi_flags) 2694 2649 { 2695 2650 unsigned long __maybe_unused UP_flags; 2696 2651 struct per_cpu_pages *pcp; ··· 2700 2653 int migratetype; 2701 2654 2702 2655 if (!pcp_allowed_order(order)) { 2703 - __free_pages_ok(page, order, FPI_NONE); 2656 + __free_pages_ok(page, order, fpi_flags); 2704 2657 return; 2705 2658 } 2706 2659 ··· 2718 2671 migratetype = get_pfnblock_migratetype(page, pfn); 2719 2672 if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { 2720 2673 if (unlikely(is_migrate_isolate(migratetype))) { 2721 - free_one_page(zone, page, pfn, order, FPI_NONE); 2674 + free_one_page(zone, page, pfn, order, fpi_flags); 2722 2675 return; 2723 2676 } 2724 2677 migratetype = MIGRATE_MOVABLE; 2725 2678 } 2726 2679 2680 + if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT) 2681 + && (in_nmi() || in_hardirq()))) { 2682 + add_page_to_zone_llist(zone, page, order); 2683 + return; 2684 + } 2727 2685 pcp_trylock_prepare(UP_flags); 2728 2686 pcp = pcp_spin_trylock(zone->per_cpu_pageset); 2729 2687 if (pcp) { 2730 - free_frozen_page_commit(zone, pcp, page, migratetype, order); 2688 + free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags); 2731 2689 pcp_spin_unlock(pcp); 2732 2690 } else { 2733 - free_one_page(zone, page, pfn, order, FPI_NONE); 2691 + free_one_page(zone, page, pfn, order, fpi_flags); 2734 2692 } 2735 2693 pcp_trylock_finish(UP_flags); 2694 + } 2695 + 2696 + void free_frozen_pages(struct page *page, unsigned int order) 2697 + { 2698 + __free_frozen_pages(page, order, FPI_NONE); 2736 2699 } 2737 2700 2738 2701 /* ··· 2833 2776 2834 2777 trace_mm_page_free_batched(&folio->page); 2835 2778 free_frozen_page_commit(zone, pcp, &folio->page, migratetype, 2836 - order); 2779 + order, FPI_NONE); 2837 2780 } 2838 2781 2839 2782 if (pcp) { ··· 2964 2907 2965 2908 do { 2966 2909 page = NULL; 2967 - spin_lock_irqsave(&zone->lock, flags); 2910 + if (!spin_trylock_irqsave(&zone->lock, flags)) { 2911 + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) 2912 + return NULL; 2913 + spin_lock_irqsave(&zone->lock, flags); 2914 + } 2968 2915 if (alloc_flags & ALLOC_HIGHATOMIC) 2969 2916 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 2970 2917 if (!page) { ··· 4573 4512 4574 4513 might_alloc(gfp_mask); 4575 4514 4576 - if (should_fail_alloc_page(gfp_mask, order)) 4515 + /* 4516 + * Don't invoke should_fail logic, since it may call 4517 + * get_random_u32() and printk() which need to spin_lock. 4518 + */ 4519 + if (!(*alloc_flags & ALLOC_TRYLOCK) && 4520 + should_fail_alloc_page(gfp_mask, order)) 4577 4521 return false; 4578 4522 4579 4523 *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); ··· 4876 4810 EXPORT_SYMBOL(get_zeroed_page_noprof); 4877 4811 4878 4812 /** 4879 - * __free_pages - Free pages allocated with alloc_pages(). 4813 + * ___free_pages - Free pages allocated with alloc_pages(). 4880 4814 * @page: The page pointer returned from alloc_pages(). 4881 4815 * @order: The order of the allocation. 4816 + * @fpi_flags: Free Page Internal flags. 
4882 4817 * 4883 4818 * This function can free multi-page allocations that are not compound 4884 4819 * pages. It does not check that the @order passed in matches that of ··· 4896 4829 * Context: May be called in interrupt context or while holding a normal 4897 4830 * spinlock, but not in NMI context or while holding a raw spinlock. 4898 4831 */ 4899 - void __free_pages(struct page *page, unsigned int order) 4832 + static void ___free_pages(struct page *page, unsigned int order, 4833 + fpi_t fpi_flags) 4900 4834 { 4901 4835 /* get PageHead before we drop reference */ 4902 4836 int head = PageHead(page); 4903 4837 struct alloc_tag *tag = pgalloc_tag_get(page); 4904 4838 4905 4839 if (put_page_testzero(page)) 4906 - free_frozen_pages(page, order); 4840 + __free_frozen_pages(page, order, fpi_flags); 4907 4841 else if (!head) { 4908 4842 pgalloc_tag_sub_pages(tag, (1 << order) - 1); 4909 4843 while (order-- > 0) 4910 - free_frozen_pages(page + (1 << order), order); 4844 + __free_frozen_pages(page + (1 << order), order, 4845 + fpi_flags); 4911 4846 } 4912 4847 } 4848 + void __free_pages(struct page *page, unsigned int order) 4849 + { 4850 + ___free_pages(page, order, FPI_NONE); 4851 + } 4913 4852 EXPORT_SYMBOL(__free_pages); 4853 + 4854 + /* 4855 + * Can be called while holding raw_spin_lock or from IRQ and NMI for any 4856 + * page type (not only those that came from try_alloc_pages) 4857 + */ 4858 + void free_pages_nolock(struct page *page, unsigned int order) 4859 + { 4860 + ___free_pages(page, order, FPI_TRYLOCK); 4861 + } 4914 4862 4915 4863 void free_pages(unsigned long addr, unsigned int order) 4916 4864 { ··· 7163 7081 } 7164 7082 7165 7083 #endif /* CONFIG_UNACCEPTED_MEMORY */ 7084 + 7085 + /** 7086 + * try_alloc_pages - opportunistic reentrant allocation from any context 7087 + * @nid: node to allocate from 7088 + * @order: allocation order size 7089 + * 7090 + * Allocates pages of a given order from the given node. This is safe to 7091 + * call from any context (from atomic, NMI, and also reentrant 7092 + * allocator -> tracepoint -> try_alloc_pages_noprof). 7093 + * Allocation is best effort and to be expected to fail easily so nobody should 7094 + * rely on the success. Failures are not reported via warn_alloc(). 7095 + * See always fail conditions below. 7096 + * 7097 + * Return: allocated page or NULL on failure. 7098 + */ 7099 + struct page *try_alloc_pages_noprof(int nid, unsigned int order) 7100 + { 7101 + /* 7102 + * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. 7103 + * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd 7104 + * is not safe in arbitrary context. 7105 + * 7106 + * These two are the conditions for gfpflags_allow_spinning() being true. 7107 + * 7108 + * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason 7109 + * to warn. Also warn would trigger printk() which is unsafe from 7110 + * various contexts. We cannot use printk_deferred_enter() to mitigate, 7111 + * since the running context is unknown. 7112 + * 7113 + * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below 7114 + * is safe in any context. Also zeroing the page is mandatory for 7115 + * BPF use cases. 7116 + * 7117 + * Though __GFP_NOMEMALLOC is not checked in the code path below, 7118 + * specify it here to highlight that try_alloc_pages() 7119 + * doesn't want to deplete reserves. 
7120 + */ 7121 + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC 7122 + | __GFP_ACCOUNT; 7123 + unsigned int alloc_flags = ALLOC_TRYLOCK; 7124 + struct alloc_context ac = { }; 7125 + struct page *page; 7126 + 7127 + /* 7128 + * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is 7129 + * unsafe in NMI. If spin_trylock() is called from hard IRQ the current 7130 + * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will 7131 + * mark the task as the owner of another rt_spin_lock which will 7132 + * confuse PI logic, so return immediately if called form hard IRQ or 7133 + * NMI. 7134 + * 7135 + * Note, irqs_disabled() case is ok. This function can be called 7136 + * from raw_spin_lock_irqsave region. 7137 + */ 7138 + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) 7139 + return NULL; 7140 + if (!pcp_allowed_order(order)) 7141 + return NULL; 7142 + 7143 + #ifdef CONFIG_UNACCEPTED_MEMORY 7144 + /* Bailout, since try_to_accept_memory_one() needs to take a lock */ 7145 + if (has_unaccepted_memory()) 7146 + return NULL; 7147 + #endif 7148 + /* Bailout, since _deferred_grow_zone() needs to take a lock */ 7149 + if (deferred_pages_enabled()) 7150 + return NULL; 7151 + 7152 + if (nid == NUMA_NO_NODE) 7153 + nid = numa_node_id(); 7154 + 7155 + prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac, 7156 + &alloc_gfp, &alloc_flags); 7157 + 7158 + /* 7159 + * Best effort allocation from percpu free list. 7160 + * If it's empty attempt to spin_trylock zone->lock. 7161 + */ 7162 + page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); 7163 + 7164 + /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */ 7165 + 7166 + if (memcg_kmem_online() && page && 7167 + unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) { 7168 + free_pages_nolock(page, order); 7169 + page = NULL; 7170 + } 7171 + trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); 7172 + kmsan_alloc_page(page, order, alloc_gfp); 7173 + return page; 7174 + }
+7 -1
mm/page_owner.c
··· 294 294 page_owner = get_page_owner(page_ext); 295 295 alloc_handle = page_owner->handle; 296 296 297 - handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); 297 + /* 298 + * Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false 299 + * to prevent issues in stack_depot_save(). 300 + * This is similar to try_alloc_pages() gfp flags, but only used 301 + * to signal stack_depot to avoid spin_locks. 302 + */ 303 + handle = save_stack(__GFP_NOWARN); 298 304 __update_page_owner_free_handle(page_ext, handle, order, current->pid, 299 305 current->tgid, free_ts_nsec); 300 306 page_ext_put(page_ext);