slab: fix barn NULL pointer dereference on memoryless nodes

Phil reported a boot failure once percpu sheaves became used, due to commits
59faa4da7cd4 ("maple_tree: use percpu sheaves for maple_node_cache") and
3accabda4da1 ("mm, vma: use percpu sheaves for vm_area_struct cache"):

BUG: kernel NULL pointer dereference, address: 0000000000000040
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: Oops: 0000 [#1] SMP NOPTI
CPU: 21 UID: 0 PID: 818 Comm: kworker/u398:0 Not tainted 6.17.0-rc3.slab+ #5 PREEMPT(voluntary)
Hardware name: Dell Inc. PowerEdge R7425/02MJ3T, BIOS 1.26.0 07/30/2025
RIP: 0010:__pcs_replace_empty_main+0x44/0x1d0
Code: ec 08 48 8b 46 10 48 8b 76 08 48 85 c0 74 0b 8b 48 18 85 c9 0f 85 e5 00 00 00 65 48 63 05 e4 ee 50 02 49 8b 84 c6 e0 00 00 00 <4c> 8b 68 40 4c 89 ef e8 b0 81 ff ff 48 89 c5 48 85 c0 74 1d 48 89
RSP: 0018:ffffd2d10950bdb0 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff8a775dab74b0 RCX: 00000000ffffffff
RDX: 0000000000000cc0 RSI: ffff8a6800804000 RDI: ffff8a680004e300
RBP: ffffd2d10950be40 R08: 0000000000000060 R09: ffffffffb9367388
R10: 00000000000149e8 R11: ffff8a6f87a38000 R12: 0000000000000cc0
R13: 0000000000000cc0 R14: ffff8a680004e300 R15: 00000000000000c0
FS: 0000000000000000(0000) GS:ffff8a77a3541000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000040 CR3: 0000000e1aa24000 CR4: 00000000003506f0
Call Trace:
<TASK>
? srso_return_thunk+0x5/0x5f
? vm_area_alloc+0x1e/0x60
kmem_cache_alloc_noprof+0x4ec/0x5b0
vm_area_alloc+0x1e/0x60
create_init_stack_vma+0x26/0x210
alloc_bprm+0x139/0x200
kernel_execve+0x4a/0x140
call_usermodehelper_exec_async+0xd0/0x190
? __pfx_call_usermodehelper_exec_async+0x10/0x10
ret_from_fork+0xf0/0x110
? __pfx_call_usermodehelper_exec_async+0x10/0x10
ret_from_fork_asm+0x1a/0x30
</TASK>
Modules linked in:
CR2: 0000000000000040
---[ end trace 0000000000000000 ]---
RIP: 0010:__pcs_replace_empty_main+0x44/0x1d0
Code: ec 08 48 8b 46 10 48 8b 76 08 48 85 c0 74 0b 8b 48 18 85 c9 0f 85 e5 00 00 00 65 48 63 05 e4 ee 50 02 49 8b 84 c6 e0 00 00 00 <4c> 8b 68 40 4c 89 ef e8 b0 81 ff ff 48 89 c5 48 85 c0 74 1d 48 89
RSP: 0018:ffffd2d10950bdb0 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff8a775dab74b0 RCX: 00000000ffffffff
RDX: 0000000000000cc0 RSI: ffff8a6800804000 RDI: ffff8a680004e300
RBP: ffffd2d10950be40 R08: 0000000000000060 R09: ffffffffb9367388
R10: 00000000000149e8 R11: ffff8a6f87a38000 R12: 0000000000000cc0
R13: 0000000000000cc0 R14: ffff8a680004e300 R15: 00000000000000c0
FS: 0000000000000000(0000) GS:ffff8a77a3541000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000040 CR3: 0000000e1aa24000 CR4: 00000000003506f0
Kernel panic - not syncing: Fatal exception
Kernel Offset: 0x36a00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
---[ end Kernel panic - not syncing: Fatal exception ]---

And noted "this is an AMD EPYC 7401 with 8 NUMA nodes configured such
that memory is only on 2 of them."

# numactl --hardware
available: 8 nodes (0-7)
node 0 cpus: 0 8 16 24 32 40 48 56 64 72 80 88
node 0 size: 0 MB
node 0 free: 0 MB
node 1 cpus: 2 10 18 26 34 42 50 58 66 74 82 90
node 1 size: 31584 MB
node 1 free: 30397 MB
node 2 cpus: 4 12 20 28 36 44 52 60 68 76 84 92
node 2 size: 0 MB
node 2 free: 0 MB
node 3 cpus: 6 14 22 30 38 46 54 62 70 78 86 94
node 3 size: 0 MB
node 3 free: 0 MB
node 4 cpus: 1 9 17 25 33 41 49 57 65 73 81 89
node 4 size: 0 MB
node 4 free: 0 MB
node 5 cpus: 3 11 19 27 35 43 51 59 67 75 83 91
node 5 size: 32214 MB
node 5 free: 31625 MB
node 6 cpus: 5 13 21 29 37 45 53 61 69 77 85 93
node 6 size: 0 MB
node 6 free: 0 MB
node 7 cpus: 7 15 23 31 39 47 55 63 71 79 87 95
node 7 size: 0 MB
node 7 free: 0 MB

Linus decoded the stacktrace to get_barn() and get_node() and determined
that kmem_cache->node[numa_mem_id()] is NULL.
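For reference, get_barn() before this patch dereferenced the per-node
structure unconditionally (see the diff below), so a NULL s->node[] entry
faults at a small offset, presumably the ->barn member, which matches the
CR2 value of 0x40 above:

  /* Get the barn of the current cpu's memory node */
  static inline struct node_barn *get_barn(struct kmem_cache *s)
  {
          /* oopses when s->node[numa_mem_id()] is NULL */
          return get_node(s, numa_mem_id())->barn;
  }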

The problem is due to a wrong assumption that memoryless nodes only
exist on systems with CONFIG_HAVE_MEMORYLESS_NODES, where numa_mem_id()
points to the nearest node that has memory. SLUB has been allocating its
kmem_cache_node structures only on nodes with memory, and it does the
same for struct node_barn.
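
Roughly, the two variants of numa_mem_id() are (paraphrased from
include/linux/topology.h):

  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
  /* per-cpu _numa_mem_ is kept pointing at the nearest node with memory */
  static inline int numa_mem_id(void)
  {
          return __this_cpu_read(_numa_mem_);
  }
  #else
  /* returns the local node, which may be memoryless and have no barn */
  static inline int numa_mem_id(void)
  {
          return numa_node_id();
  }
  #endif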

For kmem_cache_node, get_partial_node() checks whether the get_node()
result is NULL. I assumed that check was only protection against a bogus
node id passed to kmalloc_node(), but apparently it is also there for
systems where numa_mem_id() (used when no specific node is given) can
return a memoryless node.
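
That existing check boils down to (paraphrased from mm/slub.c):

  /* in get_partial_node() */
  if (!n || !n->nr_partial)
          return NULL;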

Fix the sheaves code the same way, by checking the result of get_node()
and bailing out if it's NULL. Note that CPUs on such memoryless nodes
will have degraded sheaves performance, which can be improved later,
preferably by making numa_mem_id() work properly on such systems.

Fixes: 2d517aa09bbc ("slab: add opt-in caching layer of percpu sheaves")
Reported-and-tested-by: Phil Auld <pauld@redhat.com>
Closes: https://lore.kernel.org/all/20251010151116.GA436967@pauld.westford.csb/
Analyzed-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/all/CAHk-%3Dwg1xK%2BBr%3DFJ5QipVhzCvq7uQVPt5Prze6HDhQQ%3DQD_BcQ@mail.gmail.com/
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

---
 mm/slub.c | 65 ++++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 51 insertions(+), 14 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -504,10 +504,18 @@
 	return s->node[node];
 }
 
-/* Get the barn of the current cpu's memory node */
+/*
+ * Get the barn of the current cpu's closest memory node. It may not exist on
+ * systems with memoryless nodes but without CONFIG_HAVE_MEMORYLESS_NODES
+ */
 static inline struct node_barn *get_barn(struct kmem_cache *s)
 {
-	return get_node(s, numa_mem_id())->barn;
+	struct kmem_cache_node *n = get_node(s, numa_mem_id());
+
+	if (!n)
+		return NULL;
+
+	return n->barn;
 }
 
 /*
@@ -4990,6 +4982,10 @@
 	}
 
 	barn = get_barn(s);
+	if (!barn) {
+		local_unlock(&s->cpu_sheaves->lock);
+		return NULL;
+	}
 
 	full = barn_replace_empty_sheaf(barn, pcs->main);
 
@@ -5165,13 +5153,20 @@
 	if (unlikely(pcs->main->size == 0)) {
 
 		struct slab_sheaf *full;
+		struct node_barn *barn;
 
 		if (pcs->spare && pcs->spare->size > 0) {
 			swap(pcs->main, pcs->spare);
 			goto do_alloc;
 		}
 
-		full = barn_replace_empty_sheaf(get_barn(s), pcs->main);
+		barn = get_barn(s);
+		if (!barn) {
+			local_unlock(&s->cpu_sheaves->lock);
+			return allocated;
+		}
+
+		full = barn_replace_empty_sheaf(barn, pcs->main);
 
 		if (full) {
 			stat(s, BARN_GET);
@@ -5333,6 +5314,7 @@
 {
 	struct slub_percpu_sheaves *pcs;
 	struct slab_sheaf *sheaf = NULL;
+	struct node_barn *barn;
 
 	if (unlikely(size > s->sheaf_capacity)) {
 
@@ -5375,8 +5355,11 @@
 		pcs->spare = NULL;
 		stat(s, SHEAF_PREFILL_FAST);
 	} else {
+		barn = get_barn(s);
+
 		stat(s, SHEAF_PREFILL_SLOW);
-		sheaf = barn_get_full_or_empty_sheaf(get_barn(s));
+		if (barn)
+			sheaf = barn_get_full_or_empty_sheaf(barn);
 		if (sheaf && sheaf->size)
 			stat(s, BARN_GET);
 		else
@@ -5449,7 +5426,7 @@
 	 * If the barn has too many full sheaves or we fail to refill the sheaf,
 	 * simply flush and free it.
 	 */
-	if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES ||
+	if (!barn || data_race(barn->nr_full) >= MAX_FULL_SHEAVES ||
 	    refill_sheaf(s, sheaf, gfp)) {
 		sheaf_flush_unused(s, sheaf);
 		free_empty_sheaf(s, sheaf);
@@ -5966,10 +5943,9 @@
  * put the full sheaf there.
  */
 static void __pcs_install_empty_sheaf(struct kmem_cache *s,
-		struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty)
+		struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty,
+		struct node_barn *barn)
 {
-	struct node_barn *barn;
-
 	lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
 
 	/* This is what we expect to find if nobody interrupted us. */
@@ -5977,8 +5955,6 @@
 		pcs->main = empty;
 		return;
 	}
-
-	barn = get_barn(s);
 
 	/*
 	 * Unlikely because if the main sheaf had space, we would have just
@@ -6022,6 +6002,11 @@
 	lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
 
 	barn = get_barn(s);
+	if (!barn) {
+		local_unlock(&s->cpu_sheaves->lock);
+		return NULL;
+	}
+
 	put_fail = false;
 
 	if (!pcs->spare) {
@@ -6109,7 +6084,7 @@
 	}
 
 	pcs = this_cpu_ptr(s->cpu_sheaves);
-	__pcs_install_empty_sheaf(s, pcs, empty);
+	__pcs_install_empty_sheaf(s, pcs, empty, barn);
 
 	return pcs;
 }
@@ -6146,8 +6121,9 @@
 
 static void rcu_free_sheaf(struct rcu_head *head)
 {
+	struct kmem_cache_node *n;
 	struct slab_sheaf *sheaf;
-	struct node_barn *barn;
+	struct node_barn *barn = NULL;
 	struct kmem_cache *s;
 
 	sheaf = container_of(head, struct slab_sheaf, rcu_head);
@@ -6165,7 +6139,11 @@
 	 */
 	__rcu_free_sheaf_prepare(s, sheaf);
 
-	barn = get_node(s, sheaf->node)->barn;
+	n = get_node(s, sheaf->node);
+	if (!n)
+		goto flush;
+
+	barn = n->barn;
 
 	/* due to slab_free_hook() */
 	if (unlikely(sheaf->size == 0))
@@ -6187,11 +6157,12 @@
 		return;
 	}
 
+flush:
 	stat(s, BARN_PUT_FAIL);
 	sheaf_flush_unused(s, sheaf);
 
 empty:
-	if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
+	if (barn && data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
 		barn_put_empty_sheaf(barn, sheaf);
 		return;
 	}
@@ -6222,6 +6191,10 @@
 	}
 
 	barn = get_barn(s);
+	if (!barn) {
+		local_unlock(&s->cpu_sheaves->lock);
+		goto fail;
+	}
 
 	empty = barn_get_empty_sheaf(barn);
 
@@ -6339,6 +6304,8 @@
 		goto do_free;
 
 	barn = get_barn(s);
+	if (!barn)
+		goto no_empty;
 
 	if (!pcs->spare) {
 		empty = barn_get_empty_sheaf(barn);