Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'slab-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab

Pull slab updates from Vlastimil Babka:

- mempool_alloc_bulk() support for upcoming users in the block layer
that need to allocate multiple objects at once with the mempool's
guaranteed progress semantics, which is not achievable by allocating
single objects in a loop. Along with refactoring and
various improvements (Christoph Hellwig)

- Preparations for the upcoming separation of struct slab from struct
page, mostly by removing the struct folio layer, as the purpose of
struct folio has shifted since it became used in slab code (Matthew
Wilcox)

- Modernisation of slab's boot param API usage, which removes some
unexpected parsing corner cases (Petr Tesarik)

- Refactoring of freelist_aba_t (now struct freelist_counters) and
associated functions for double cmpxchg, enabled by -fms-extensions
(Vlastimil Babka)

- Cleanups and improvements related to the sheaves caching layer, which
were part of the full conversion to sheaves that is planned for the
next release (Vlastimil Babka)

* tag 'slab-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: (42 commits)
slab: Remove unnecessary call to compound_head() in alloc_from_pcs()
mempool: clarify behavior of mempool_alloc_preallocated()
mempool: drop the file name in the top of file comment
mempool: de-typedef
mempool: remove mempool_{init,create}_kvmalloc_pool
mempool: legitimize the io_schedule_timeout in mempool_alloc_from_pool
mempool: add mempool_{alloc,free}_bulk
mempool: factor out a mempool_alloc_from_pool helper
slab: Remove references to folios from virt_to_slab()
kasan: Remove references to folio in __kasan_mempool_poison_object()
memcg: Convert mem_cgroup_from_obj_folio() to mem_cgroup_from_obj_slab()
mempool: factor out a mempool_adjust_gfp helper
mempool: add error injection support
mempool: improve kerneldoc comments
mm: improve kerneldoc comments for __alloc_pages_bulk
fault-inject: make enum fault_flags available unconditionally
usercopy: Remove folio references from check_heap_object()
slab: Remove folio references from kfree_nolock()
slab: Remove folio references from kfree_rcu_sheaf()
slab: Remove folio references from build_detached_freelist()
...

+767 -688
+4 -4
include/linux/fault-inject.h
··· 8 8 struct dentry; 9 9 struct kmem_cache; 10 10 11 + enum fault_flags { 12 + FAULT_NOWARN = 1 << 0, 13 + }; 14 + 11 15 #ifdef CONFIG_FAULT_INJECTION 12 16 13 17 #include <linux/atomic.h> ··· 38 34 unsigned long count; 39 35 struct ratelimit_state ratelimit_state; 40 36 struct dentry *dname; 41 - }; 42 - 43 - enum fault_flags { 44 - FAULT_NOWARN = 1 << 0, 45 37 }; 46 38 47 39 #define FAULT_ATTR_INITIALIZER { \
-6
include/linux/gfp_types.h
··· 55 55 #ifdef CONFIG_LOCKDEP 56 56 ___GFP_NOLOCKDEP_BIT, 57 57 #endif 58 - #ifdef CONFIG_SLAB_OBJ_EXT 59 58 ___GFP_NO_OBJ_EXT_BIT, 60 - #endif 61 59 ___GFP_LAST_BIT 62 60 }; 63 61 ··· 96 98 #else 97 99 #define ___GFP_NOLOCKDEP 0 98 100 #endif 99 - #ifdef CONFIG_SLAB_OBJ_EXT 100 101 #define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT) 101 - #else 102 - #define ___GFP_NO_OBJ_EXT 0 103 - #endif 104 102 105 103 /* 106 104 * Physical address zone modifiers (see linux/mmzone.h - low four bits)
+25 -33
include/linux/mempool.h
··· 27 27 wait_queue_head_t wait; 28 28 } mempool_t; 29 29 30 - static inline bool mempool_initialized(mempool_t *pool) 30 + static inline bool mempool_initialized(struct mempool *pool) 31 31 { 32 32 return pool->elements != NULL; 33 33 } 34 34 35 - static inline bool mempool_is_saturated(mempool_t *pool) 35 + static inline bool mempool_is_saturated(struct mempool *pool) 36 36 { 37 37 return READ_ONCE(pool->curr_nr) >= pool->min_nr; 38 38 } 39 39 40 - void mempool_exit(mempool_t *pool); 41 - int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, 42 - mempool_free_t *free_fn, void *pool_data, 43 - gfp_t gfp_mask, int node_id); 44 - 45 - int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, 46 - mempool_free_t *free_fn, void *pool_data); 40 + void mempool_exit(struct mempool *pool); 41 + int mempool_init_node(struct mempool *pool, int min_nr, 42 + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, 43 + void *pool_data, gfp_t gfp_mask, int node_id); 44 + int mempool_init_noprof(struct mempool *pool, int min_nr, 45 + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, 46 + void *pool_data); 47 47 #define mempool_init(...) \ 48 48 alloc_hooks(mempool_init_noprof(__VA_ARGS__)) 49 49 50 - extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 51 - mempool_free_t *free_fn, void *pool_data); 52 - 53 - extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, 54 - mempool_free_t *free_fn, void *pool_data, 55 - gfp_t gfp_mask, int nid); 50 + struct mempool *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 51 + mempool_free_t *free_fn, void *pool_data); 52 + struct mempool *mempool_create_node_noprof(int min_nr, 53 + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, 54 + void *pool_data, gfp_t gfp_mask, int nid); 56 55 #define mempool_create_node(...) 
\ 57 56 alloc_hooks(mempool_create_node_noprof(__VA_ARGS__)) 58 57 ··· 59 60 mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data, \ 60 61 GFP_KERNEL, NUMA_NO_NODE) 61 62 62 - extern int mempool_resize(mempool_t *pool, int new_min_nr); 63 - extern void mempool_destroy(mempool_t *pool); 63 + int mempool_resize(struct mempool *pool, int new_min_nr); 64 + void mempool_destroy(struct mempool *pool); 64 65 65 - extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc; 66 + void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) __malloc; 66 67 #define mempool_alloc(...) \ 67 68 alloc_hooks(mempool_alloc_noprof(__VA_ARGS__)) 69 + int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem, 70 + unsigned int count, unsigned int allocated); 71 + #define mempool_alloc_bulk(...) \ 72 + alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__)) 68 73 69 - extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc; 70 - extern void mempool_free(void *element, mempool_t *pool); 74 + void *mempool_alloc_preallocated(struct mempool *pool) __malloc; 75 + void mempool_free(void *element, struct mempool *pool); 76 + unsigned int mempool_free_bulk(struct mempool *pool, void **elem, 77 + unsigned int count); 71 78 72 79 /* 73 80 * A mempool_alloc_t and mempool_free_t that get the memory from ··· 101 96 #define mempool_create_kmalloc_pool(_min_nr, _size) \ 102 97 mempool_create((_min_nr), mempool_kmalloc, mempool_kfree, \ 103 98 (void *)(unsigned long)(_size)) 104 - 105 - void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data); 106 - void mempool_kvfree(void *element, void *pool_data); 107 - 108 - static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size) 109 - { 110 - return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); 111 - } 112 - 113 - static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size) 114 - { 115 - return mempool_create(min_nr, mempool_kvmalloc, 
mempool_kvfree, (void *) size); 116 - } 117 99 118 100 /* 119 101 * A mempool_alloc_t and mempool_free_t for a simple page allocator that
+2 -14
include/linux/page-flags.h
··· 1048 1048 */ 1049 1049 PAGE_TYPE_OPS(Guard, guard, guard) 1050 1050 1051 - FOLIO_TYPE_OPS(slab, slab) 1052 - 1053 - /** 1054 - * PageSlab - Determine if the page belongs to the slab allocator 1055 - * @page: The page to test. 1056 - * 1057 - * Context: Any context. 1058 - * Return: True for slab pages, false for any other kind of page. 1059 - */ 1060 - static inline bool PageSlab(const struct page *page) 1061 - { 1062 - return folio_test_slab(page_folio(page)); 1063 - } 1051 + PAGE_TYPE_OPS(Slab, slab, slab) 1064 1052 1065 1053 #ifdef CONFIG_HUGETLB_PAGE 1066 1054 FOLIO_TYPE_OPS(hugetlb, hugetlb) ··· 1064 1076 * Serialized with zone lock. 1065 1077 */ 1066 1078 PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) 1067 - FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc) 1079 + PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) 1068 1080 1069 1081 /** 1070 1082 * PageHuge - Determine if the page belongs to hugetlbfs
+4 -8
mm/kasan/common.c
··· 520 520 521 521 bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) 522 522 { 523 - struct folio *folio = virt_to_folio(ptr); 523 + struct page *page = virt_to_page(ptr); 524 524 struct slab *slab; 525 525 526 - /* 527 - * This function can be called for large kmalloc allocation that get 528 - * their memory from page_alloc. Thus, the folio might not be a slab. 529 - */ 530 - if (unlikely(!folio_test_slab(folio))) { 526 + if (unlikely(PageLargeKmalloc(page))) { 531 527 if (check_page_allocation(ptr, ip)) 532 528 return false; 533 - kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false); 529 + kasan_poison(ptr, page_size(page), KASAN_PAGE_FREE, false); 534 530 return true; 535 531 } 536 532 537 533 if (is_kfence_address(ptr)) 538 534 return true; 539 535 540 - slab = folio_slab(folio); 536 + slab = page_slab(page); 541 537 542 538 if (check_slab_allocation(slab->slab_cache, ptr, ip)) 543 539 return false;
+8 -6
mm/kfence/core.c
··· 612 612 * enters __slab_free() slow-path. 613 613 */ 614 614 for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { 615 - struct slab *slab; 615 + struct page *page; 616 616 617 617 if (!i || (i % 2)) 618 618 continue; 619 619 620 - slab = page_slab(pfn_to_page(start_pfn + i)); 621 - __folio_set_slab(slab_folio(slab)); 620 + page = pfn_to_page(start_pfn + i); 621 + __SetPageSlab(page); 622 622 #ifdef CONFIG_MEMCG 623 + struct slab *slab = page_slab(page); 623 624 slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | 624 625 MEMCG_DATA_OBJEXTS; 625 626 #endif ··· 666 665 667 666 reset_slab: 668 667 for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { 669 - struct slab *slab; 668 + struct page *page; 670 669 671 670 if (!i || (i % 2)) 672 671 continue; 673 672 674 - slab = page_slab(pfn_to_page(start_pfn + i)); 673 + page = pfn_to_page(start_pfn + i); 675 674 #ifdef CONFIG_MEMCG 675 + struct slab *slab = page_slab(page); 676 676 slab->obj_exts = 0; 677 677 #endif 678 - __folio_clear_slab(slab_folio(slab)); 678 + __ClearPageSlab(page); 679 679 } 680 680 681 681 return addr;
+16 -24
mm/memcontrol.c
··· 2557 2557 } 2558 2558 2559 2559 static __always_inline 2560 - struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) 2560 + struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p) 2561 2561 { 2562 2562 /* 2563 2563 * Slab objects are accounted individually, not per-page. 2564 2564 * Memcg membership data for each individual object is saved in 2565 2565 * slab->obj_exts. 2566 2566 */ 2567 - if (folio_test_slab(folio)) { 2568 - struct slabobj_ext *obj_exts; 2569 - struct slab *slab; 2570 - unsigned int off; 2567 + struct slabobj_ext *obj_exts; 2568 + unsigned int off; 2571 2569 2572 - slab = folio_slab(folio); 2573 - obj_exts = slab_obj_exts(slab); 2574 - if (!obj_exts) 2575 - return NULL; 2576 - 2577 - off = obj_to_index(slab->slab_cache, slab, p); 2578 - if (obj_exts[off].objcg) 2579 - return obj_cgroup_memcg(obj_exts[off].objcg); 2580 - 2570 + obj_exts = slab_obj_exts(slab); 2571 + if (!obj_exts) 2581 2572 return NULL; 2582 - } 2583 2573 2584 - /* 2585 - * folio_memcg_check() is used here, because in theory we can encounter 2586 - * a folio where the slab flag has been cleared already, but 2587 - * slab->obj_exts has not been freed yet 2588 - * folio_memcg_check() will guarantee that a proper memory 2589 - * cgroup pointer or NULL will be returned. 
2590 - */ 2591 - return folio_memcg_check(folio); 2574 + off = obj_to_index(slab->slab_cache, slab, p); 2575 + if (obj_exts[off].objcg) 2576 + return obj_cgroup_memcg(obj_exts[off].objcg); 2577 + 2578 + return NULL; 2592 2579 } 2593 2580 2594 2581 /* ··· 2589 2602 */ 2590 2603 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) 2591 2604 { 2605 + struct slab *slab; 2606 + 2592 2607 if (mem_cgroup_disabled()) 2593 2608 return NULL; 2594 2609 2595 - return mem_cgroup_from_obj_folio(virt_to_folio(p), p); 2610 + slab = virt_to_slab(p); 2611 + if (slab) 2612 + return mem_cgroup_from_obj_slab(slab, p); 2613 + return folio_memcg_check(virt_to_folio(p)); 2596 2614 } 2597 2615 2598 2616 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
+265 -162
mm/mempool.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 /* 3 - * linux/mm/mempool.c 4 - * 5 3 * memory buffer pool support. Such pools are mostly used 6 4 * for guaranteed, deadlock-free memory allocations during 7 5 * extreme VM load. ··· 7 9 * started by Ingo Molnar, Copyright (C) 2001 8 10 * debugging by David Rientjes, Copyright (C) 2015 9 11 */ 10 - 12 + #include <linux/fault-inject.h> 11 13 #include <linux/mm.h> 12 14 #include <linux/slab.h> 13 15 #include <linux/highmem.h> ··· 18 20 #include <linux/writeback.h> 19 21 #include "slab.h" 20 22 23 + static DECLARE_FAULT_ATTR(fail_mempool_alloc); 24 + static DECLARE_FAULT_ATTR(fail_mempool_alloc_bulk); 25 + 26 + static int __init mempool_faul_inject_init(void) 27 + { 28 + int error; 29 + 30 + error = PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc", 31 + NULL, &fail_mempool_alloc)); 32 + if (error) 33 + return error; 34 + 35 + /* booting will fail on error return here, don't bother to cleanup */ 36 + return PTR_ERR_OR_ZERO( 37 + fault_create_debugfs_attr("fail_mempool_alloc_bulk", NULL, 38 + &fail_mempool_alloc_bulk)); 39 + } 40 + late_initcall(mempool_faul_inject_init); 41 + 21 42 #ifdef CONFIG_SLUB_DEBUG_ON 22 - static void poison_error(mempool_t *pool, void *element, size_t size, 43 + static void poison_error(struct mempool *pool, void *element, size_t size, 23 44 size_t byte) 24 45 { 25 46 const int nr = pool->curr_nr; ··· 55 38 dump_stack(); 56 39 } 57 40 58 - static void __check_element(mempool_t *pool, void *element, size_t size) 41 + static void __check_element(struct mempool *pool, void *element, size_t size) 59 42 { 60 43 u8 *obj = element; 61 44 size_t i; ··· 71 54 memset(obj, POISON_INUSE, size); 72 55 } 73 56 74 - static void check_element(mempool_t *pool, void *element) 57 + static void check_element(struct mempool *pool, void *element) 75 58 { 76 59 /* Skip checking: KASAN might save its metadata in the element. 
*/ 77 60 if (kasan_enabled()) ··· 110 93 obj[size - 1] = POISON_END; 111 94 } 112 95 113 - static void poison_element(mempool_t *pool, void *element) 96 + static void poison_element(struct mempool *pool, void *element) 114 97 { 115 98 /* Skip poisoning: KASAN might save its metadata in the element. */ 116 99 if (kasan_enabled()) ··· 141 124 } 142 125 } 143 126 #else /* CONFIG_SLUB_DEBUG_ON */ 144 - static inline void check_element(mempool_t *pool, void *element) 127 + static inline void check_element(struct mempool *pool, void *element) 145 128 { 146 129 } 147 - static inline void poison_element(mempool_t *pool, void *element) 130 + static inline void poison_element(struct mempool *pool, void *element) 148 131 { 149 132 } 150 133 #endif /* CONFIG_SLUB_DEBUG_ON */ 151 134 152 - static __always_inline bool kasan_poison_element(mempool_t *pool, void *element) 135 + static __always_inline bool kasan_poison_element(struct mempool *pool, 136 + void *element) 153 137 { 154 138 if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) 155 139 return kasan_mempool_poison_object(element); ··· 160 142 return true; 161 143 } 162 144 163 - static void kasan_unpoison_element(mempool_t *pool, void *element) 145 + static void kasan_unpoison_element(struct mempool *pool, void *element) 164 146 { 165 147 if (pool->alloc == mempool_kmalloc) 166 148 kasan_mempool_unpoison_object(element, (size_t)pool->pool_data); ··· 172 154 (unsigned long)pool->pool_data); 173 155 } 174 156 175 - static __always_inline void add_element(mempool_t *pool, void *element) 157 + static __always_inline void add_element(struct mempool *pool, void *element) 176 158 { 177 159 BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr); 178 160 poison_element(pool, element); ··· 180 162 pool->elements[pool->curr_nr++] = element; 181 163 } 182 164 183 - static void *remove_element(mempool_t *pool) 165 + static void *remove_element(struct mempool *pool) 184 166 { 185 167 void *element = 
pool->elements[--pool->curr_nr]; 186 168 ··· 201 183 * May be called on a zeroed but uninitialized mempool (i.e. allocated with 202 184 * kzalloc()). 203 185 */ 204 - void mempool_exit(mempool_t *pool) 186 + void mempool_exit(struct mempool *pool) 205 187 { 206 188 while (pool->curr_nr) { 207 189 void *element = remove_element(pool); ··· 220 202 * Free all reserved elements in @pool and @pool itself. This function 221 203 * only sleeps if the free_fn() function sleeps. 222 204 */ 223 - void mempool_destroy(mempool_t *pool) 205 + void mempool_destroy(struct mempool *pool) 224 206 { 225 207 if (unlikely(!pool)) 226 208 return; ··· 230 212 } 231 213 EXPORT_SYMBOL(mempool_destroy); 232 214 233 - int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, 234 - mempool_free_t *free_fn, void *pool_data, 235 - gfp_t gfp_mask, int node_id) 215 + int mempool_init_node(struct mempool *pool, int min_nr, 216 + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, 217 + void *pool_data, gfp_t gfp_mask, int node_id) 236 218 { 237 219 spin_lock_init(&pool->lock); 238 220 pool->min_nr = min_nr; ··· 282 264 * 283 265 * Return: %0 on success, negative error code otherwise. 284 266 */ 285 - int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, 286 - mempool_free_t *free_fn, void *pool_data) 267 + int mempool_init_noprof(struct mempool *pool, int min_nr, 268 + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, 269 + void *pool_data) 287 270 { 288 271 return mempool_init_node(pool, min_nr, alloc_fn, free_fn, 289 272 pool_data, GFP_KERNEL, NUMA_NO_NODE); ··· 310 291 * 311 292 * Return: pointer to the created memory pool object or %NULL on error. 
312 293 */ 313 - mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, 314 - mempool_free_t *free_fn, void *pool_data, 315 - gfp_t gfp_mask, int node_id) 294 + struct mempool *mempool_create_node_noprof(int min_nr, 295 + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, 296 + void *pool_data, gfp_t gfp_mask, int node_id) 316 297 { 317 - mempool_t *pool; 298 + struct mempool *pool; 318 299 319 300 pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); 320 301 if (!pool) ··· 348 329 * 349 330 * Return: %0 on success, negative error code otherwise. 350 331 */ 351 - int mempool_resize(mempool_t *pool, int new_min_nr) 332 + int mempool_resize(struct mempool *pool, int new_min_nr) 352 333 { 353 334 void *element; 354 335 void **new_elements; ··· 410 391 } 411 392 EXPORT_SYMBOL(mempool_resize); 412 393 413 - /** 414 - * mempool_alloc - allocate an element from a specific memory pool 415 - * @pool: pointer to the memory pool which was allocated via 416 - * mempool_create(). 417 - * @gfp_mask: the usual allocation bitmask. 418 - * 419 - * this function only sleeps if the alloc_fn() function sleeps or 420 - * returns NULL. Note that due to preallocation, this function 421 - * *never* fails when called from process contexts. (it might 422 - * fail if called from an IRQ context.) 423 - * Note: using __GFP_ZERO is not supported. 424 - * 425 - * Return: pointer to the allocated element or %NULL on error. 
426 - */ 427 - void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) 394 + static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems, 395 + unsigned int count, unsigned int allocated, 396 + gfp_t gfp_mask) 428 397 { 429 - void *element; 430 398 unsigned long flags; 431 - wait_queue_entry_t wait; 432 - gfp_t gfp_temp; 399 + unsigned int i; 400 + 401 + spin_lock_irqsave(&pool->lock, flags); 402 + if (unlikely(pool->curr_nr < count - allocated)) 403 + goto fail; 404 + for (i = 0; i < count; i++) { 405 + if (!elems[i]) { 406 + elems[i] = remove_element(pool); 407 + allocated++; 408 + } 409 + } 410 + spin_unlock_irqrestore(&pool->lock, flags); 411 + 412 + /* Paired with rmb in mempool_free(), read comment there. */ 413 + smp_wmb(); 414 + 415 + /* 416 + * Update the allocation stack trace as this is more useful for 417 + * debugging. 418 + */ 419 + for (i = 0; i < count; i++) 420 + kmemleak_update_trace(elems[i]); 421 + return allocated; 422 + 423 + fail: 424 + if (gfp_mask & __GFP_DIRECT_RECLAIM) { 425 + DEFINE_WAIT(wait); 426 + 427 + prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); 428 + spin_unlock_irqrestore(&pool->lock, flags); 429 + 430 + /* 431 + * Wait for someone else to return an element to @pool, but wake 432 + * up occasionally as memory pressure might have reduced even 433 + * and the normal allocation in alloc_fn could succeed even if 434 + * no element was returned. 435 + */ 436 + io_schedule_timeout(5 * HZ); 437 + finish_wait(&pool->wait, &wait); 438 + } else { 439 + /* We must not sleep if __GFP_DIRECT_RECLAIM is not set. */ 440 + spin_unlock_irqrestore(&pool->lock, flags); 441 + } 442 + 443 + return allocated; 444 + } 445 + 446 + /* 447 + * Adjust the gfp flags for mempool allocations, as we never want to dip into 448 + * the global emergency reserves or retry in the page allocator. 
449 + * 450 + * The first pass also doesn't want to go reclaim, but the next passes do, so 451 + * return a separate subset for that first iteration. 452 + */ 453 + static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask) 454 + { 455 + *gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; 456 + return *gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); 457 + } 458 + 459 + /** 460 + * mempool_alloc_bulk - allocate multiple elements from a memory pool 461 + * @pool: pointer to the memory pool 462 + * @elems: partially or fully populated elements array 463 + * @count: number of entries in @elem that need to be allocated 464 + * @allocated: number of entries in @elem already allocated 465 + * 466 + * Allocate elements for each slot in @elem that is non-%NULL. This is done by 467 + * first calling into the alloc_fn supplied at pool initialization time, and 468 + * dipping into the reserved pool when alloc_fn fails to allocate an element. 469 + * 470 + * On return all @count elements in @elems will be populated. 471 + * 472 + * Return: Always 0. If it wasn't for %$#^$ alloc tags, it would return void. 473 + */ 474 + int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems, 475 + unsigned int count, unsigned int allocated) 476 + { 477 + gfp_t gfp_mask = GFP_KERNEL; 478 + gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); 479 + unsigned int i = 0; 480 + 481 + VM_WARN_ON_ONCE(count > pool->min_nr); 482 + might_alloc(gfp_mask); 483 + 484 + /* 485 + * If an error is injected, fail all elements in a bulk allocation so 486 + * that we stress the multiple elements missing path. 487 + */ 488 + if (should_fail_ex(&fail_mempool_alloc_bulk, 1, FAULT_NOWARN)) { 489 + pr_info("forcing mempool usage for %pS\n", 490 + (void *)_RET_IP_); 491 + goto use_pool; 492 + } 493 + 494 + repeat_alloc: 495 + /* 496 + * Try to allocate the elements using the allocation callback first as 497 + * that might succeed even when the caller's bulk allocation did not. 
498 + */ 499 + for (i = 0; i < count; i++) { 500 + if (elems[i]) 501 + continue; 502 + elems[i] = pool->alloc(gfp_temp, pool->pool_data); 503 + if (unlikely(!elems[i])) 504 + goto use_pool; 505 + allocated++; 506 + } 507 + 508 + return 0; 509 + 510 + use_pool: 511 + allocated = mempool_alloc_from_pool(pool, elems, count, allocated, 512 + gfp_temp); 513 + gfp_temp = gfp_mask; 514 + goto repeat_alloc; 515 + } 516 + EXPORT_SYMBOL_GPL(mempool_alloc_bulk_noprof); 517 + 518 + /** 519 + * mempool_alloc - allocate an element from a memory pool 520 + * @pool: pointer to the memory pool 521 + * @gfp_mask: GFP_* flags. %__GFP_ZERO is not supported. 522 + * 523 + * Allocate an element from @pool. This is done by first calling into the 524 + * alloc_fn supplied at pool initialization time, and dipping into the reserved 525 + * pool when alloc_fn fails to allocate an element. 526 + * 527 + * This function only sleeps if the alloc_fn callback sleeps, or when waiting 528 + * for elements to become available in the pool. 529 + * 530 + * Return: pointer to the allocated element or %NULL when failing to allocate 531 + * an element. Allocation failure can only happen when @gfp_mask does not 532 + * include %__GFP_DIRECT_RECLAIM. 
533 + */ 534 + void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) 535 + { 536 + gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); 537 + void *element; 433 538 434 539 VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); 435 540 might_alloc(gfp_mask); 436 541 437 - gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ 438 - gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ 439 - gfp_mask |= __GFP_NOWARN; /* failures are OK */ 440 - 441 - gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); 442 - 443 542 repeat_alloc: 543 + if (should_fail_ex(&fail_mempool_alloc, 1, FAULT_NOWARN)) { 544 + pr_info("forcing mempool usage for %pS\n", 545 + (void *)_RET_IP_); 546 + element = NULL; 547 + } else { 548 + element = pool->alloc(gfp_temp, pool->pool_data); 549 + } 444 550 445 - element = pool->alloc(gfp_temp, pool->pool_data); 446 - if (likely(element != NULL)) 447 - return element; 448 - 449 - spin_lock_irqsave(&pool->lock, flags); 450 - if (likely(pool->curr_nr)) { 451 - element = remove_element(pool); 452 - spin_unlock_irqrestore(&pool->lock, flags); 453 - /* paired with rmb in mempool_free(), read comment there */ 454 - smp_wmb(); 551 + if (unlikely(!element)) { 455 552 /* 456 - * Update the allocation stack trace as this is more useful 457 - * for debugging. 553 + * Try to allocate an element from the pool. 554 + * 555 + * The first pass won't have __GFP_DIRECT_RECLAIM and won't 556 + * sleep in mempool_alloc_from_pool. Retry the allocation 557 + * with all flags set in that case. 458 558 */ 459 - kmemleak_update_trace(element); 460 - return element; 559 + if (!mempool_alloc_from_pool(pool, &element, 1, 0, gfp_temp)) { 560 + if (gfp_temp != gfp_mask) { 561 + gfp_temp = gfp_mask; 562 + goto repeat_alloc; 563 + } 564 + if (gfp_mask & __GFP_DIRECT_RECLAIM) { 565 + goto repeat_alloc; 566 + } 567 + } 461 568 } 462 569 463 - /* 464 - * We use gfp mask w/o direct reclaim or IO for the first round. 
If 465 - * alloc failed with that and @pool was empty, retry immediately. 466 - */ 467 - if (gfp_temp != gfp_mask) { 468 - spin_unlock_irqrestore(&pool->lock, flags); 469 - gfp_temp = gfp_mask; 470 - goto repeat_alloc; 471 - } 472 - 473 - /* We must not sleep if !__GFP_DIRECT_RECLAIM */ 474 - if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { 475 - spin_unlock_irqrestore(&pool->lock, flags); 476 - return NULL; 477 - } 478 - 479 - /* Let's wait for someone else to return an element to @pool */ 480 - init_wait(&wait); 481 - prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); 482 - 483 - spin_unlock_irqrestore(&pool->lock, flags); 484 - 485 - /* 486 - * FIXME: this should be io_schedule(). The timeout is there as a 487 - * workaround for some DM problems in 2.6.18. 488 - */ 489 - io_schedule_timeout(5*HZ); 490 - 491 - finish_wait(&pool->wait, &wait); 492 - goto repeat_alloc; 570 + return element; 493 571 } 494 572 EXPORT_SYMBOL(mempool_alloc_noprof); 495 573 496 574 /** 497 575 * mempool_alloc_preallocated - allocate an element from preallocated elements 498 - * belonging to a specific memory pool 499 - * @pool: pointer to the memory pool which was allocated via 500 - * mempool_create(). 576 + * belonging to a memory pool 577 + * @pool: pointer to the memory pool 501 578 * 502 - * This function is similar to mempool_alloc, but it only attempts allocating 503 - * an element from the preallocated elements. It does not sleep and immediately 504 - * returns if no preallocated elements are available. 579 + * This function is similar to mempool_alloc(), but it only attempts allocating 580 + * an element from the preallocated elements. It only takes a single spinlock_t 581 + * and immediately returns if no preallocated elements are available. 505 582 * 506 583 * Return: pointer to the allocated element or %NULL if no elements are 507 584 * available. 
508 585 */ 509 - void *mempool_alloc_preallocated(mempool_t *pool) 586 + void *mempool_alloc_preallocated(struct mempool *pool) 510 587 { 511 - void *element; 512 - unsigned long flags; 588 + void *element = NULL; 513 589 514 - spin_lock_irqsave(&pool->lock, flags); 515 - if (likely(pool->curr_nr)) { 516 - element = remove_element(pool); 517 - spin_unlock_irqrestore(&pool->lock, flags); 518 - /* paired with rmb in mempool_free(), read comment there */ 519 - smp_wmb(); 520 - /* 521 - * Update the allocation stack trace as this is more useful 522 - * for debugging. 523 - */ 524 - kmemleak_update_trace(element); 525 - return element; 526 - } 527 - spin_unlock_irqrestore(&pool->lock, flags); 528 - 529 - return NULL; 590 + mempool_alloc_from_pool(pool, &element, 1, 0, GFP_NOWAIT); 591 + return element; 530 592 } 531 593 EXPORT_SYMBOL(mempool_alloc_preallocated); 532 594 533 595 /** 534 - * mempool_free - return an element to the pool. 535 - * @element: pool element pointer. 536 - * @pool: pointer to the memory pool which was allocated via 537 - * mempool_create(). 596 + * mempool_free_bulk - return elements to a mempool 597 + * @pool: pointer to the memory pool 598 + * @elems: elements to return 599 + * @count: number of elements to return 538 600 * 539 - * this function only sleeps if the free_fn() function sleeps. 601 + * Returns a number of elements from the start of @elem to @pool if @pool needs 602 + * replenishing and sets their slots in @elem to NULL. Other elements are left 603 + * in @elem. 604 + * 605 + * Return: number of elements transferred to @pool. Elements are always 606 + * transferred from the beginning of @elem, so the return value can be used as 607 + * an offset into @elem for the freeing the remaining elements in the caller. 
540 608 */ 541 - void mempool_free(void *element, mempool_t *pool) 609 + unsigned int mempool_free_bulk(struct mempool *pool, void **elems, 610 + unsigned int count) 542 611 { 543 612 unsigned long flags; 544 - 545 - if (unlikely(element == NULL)) 546 - return; 613 + unsigned int freed = 0; 614 + bool added = false; 547 615 548 616 /* 549 617 * Paired with the wmb in mempool_alloc(). The preceding read is ··· 664 558 * Waiters happen iff curr_nr is 0 and the above guarantee also 665 559 * ensures that there will be frees which return elements to the 666 560 * pool waking up the waiters. 667 - */ 668 - if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { 669 - spin_lock_irqsave(&pool->lock, flags); 670 - if (likely(pool->curr_nr < pool->min_nr)) { 671 - add_element(pool, element); 672 - spin_unlock_irqrestore(&pool->lock, flags); 673 - if (wq_has_sleeper(&pool->wait)) 674 - wake_up(&pool->wait); 675 - return; 676 - } 677 - spin_unlock_irqrestore(&pool->lock, flags); 678 - } 679 - 680 - /* 681 - * Handle the min_nr = 0 edge case: 682 561 * 683 562 * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds, 684 563 * so waiters sleeping on pool->wait would never be woken by the ··· 671 580 * allocation of element when both min_nr and curr_nr are 0, and 672 581 * any active waiters are properly awakened. 
673 582 */ 674 - if (unlikely(pool->min_nr == 0 && 583 + if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { 584 + spin_lock_irqsave(&pool->lock, flags); 585 + while (pool->curr_nr < pool->min_nr && freed < count) { 586 + add_element(pool, elems[freed++]); 587 + added = true; 588 + } 589 + spin_unlock_irqrestore(&pool->lock, flags); 590 + } else if (unlikely(pool->min_nr == 0 && 675 591 READ_ONCE(pool->curr_nr) == 0)) { 592 + /* Handle the min_nr = 0 edge case: */ 676 593 spin_lock_irqsave(&pool->lock, flags); 677 594 if (likely(pool->curr_nr == 0)) { 678 - add_element(pool, element); 679 - spin_unlock_irqrestore(&pool->lock, flags); 680 - if (wq_has_sleeper(&pool->wait)) 681 - wake_up(&pool->wait); 682 - return; 595 + add_element(pool, elems[freed++]); 596 + added = true; 683 597 } 684 598 spin_unlock_irqrestore(&pool->lock, flags); 685 599 } 686 600 687 - pool->free(element, pool->pool_data); 601 + if (unlikely(added) && wq_has_sleeper(&pool->wait)) 602 + wake_up(&pool->wait); 603 + 604 + return freed; 605 + } 606 + EXPORT_SYMBOL_GPL(mempool_free_bulk); 607 + 608 + /** 609 + * mempool_free - return an element to the pool. 610 + * @element: element to return 611 + * @pool: pointer to the memory pool 612 + * 613 + * Returns @element to @pool if it needs replenishing, else frees it using 614 + * the free_fn callback in @pool. 615 + * 616 + * This function only sleeps if the free_fn callback sleeps. 
617 + */ 618 + void mempool_free(void *element, struct mempool *pool) 619 + { 620 + if (likely(element) && !mempool_free_bulk(pool, &element, 1)) 621 + pool->free(element, pool->pool_data); 688 622 } 689 623 EXPORT_SYMBOL(mempool_free); 690 624 ··· 747 631 kfree(element); 748 632 } 749 633 EXPORT_SYMBOL(mempool_kfree); 750 - 751 - void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) 752 - { 753 - size_t size = (size_t)pool_data; 754 - return kvmalloc(size, gfp_mask); 755 - } 756 - EXPORT_SYMBOL(mempool_kvmalloc); 757 - 758 - void mempool_kvfree(void *element, void *pool_data) 759 - { 760 - kvfree(element); 761 - } 762 - EXPORT_SYMBOL(mempool_kvfree); 763 634 764 635 /* 765 636 * A simple mempool-backed page allocator that allocates pages
+10 -5
mm/page_alloc.c
··· 4977 4977 * @nr_pages: The number of pages desired in the array 4978 4978 * @page_array: Array to store the pages 4979 4979 * 4980 - * This is a batched version of the page allocator that attempts to 4981 - * allocate nr_pages quickly. Pages are added to the page_array. 4980 + * This is a batched version of the page allocator that attempts to allocate 4981 + * @nr_pages quickly. Pages are added to @page_array. 4982 4982 * 4983 - * Note that only NULL elements are populated with pages and nr_pages 4984 - * is the maximum number of pages that will be stored in the array. 4983 + * Note that only the elements in @page_array that were cleared to %NULL on 4984 + * entry are populated with newly allocated pages. @nr_pages is the maximum 4985 + * number of pages that will be stored in the array. 4985 4986 * 4986 - * Returns the number of pages in the array. 4987 + * Returns the number of pages in @page_array, including ones already 4988 + * allocated on entry. This can be less than the number requested in @nr_pages, 4989 + * but all empty slots are filled from the beginning. I.e., if all slots in 4990 + * @page_array were set to %NULL on entry, the slots from 0 to the return value 4991 + * - 1 will be filled. 4987 4992 */ 4988 4993 unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, 4989 4994 nodemask_t *nodemask, int nr_pages,
+51 -61
mm/slab.h
··· 40 40 * Freelist pointer and counter to cmpxchg together, avoids the typical ABA 41 41 * problems with cmpxchg of just a pointer. 42 42 */ 43 - typedef union { 44 - struct { 45 - void *freelist; 46 - unsigned long counter; 43 + struct freelist_counters { 44 + union { 45 + struct { 46 + void *freelist; 47 + union { 48 + unsigned long counters; 49 + struct { 50 + unsigned inuse:16; 51 + unsigned objects:15; 52 + /* 53 + * If slab debugging is enabled then the 54 + * frozen bit can be reused to indicate 55 + * that the slab was corrupted 56 + */ 57 + unsigned frozen:1; 58 + }; 59 + }; 60 + }; 61 + #ifdef system_has_freelist_aba 62 + freelist_full_t freelist_counters; 63 + #endif 47 64 }; 48 - freelist_full_t full; 49 - } freelist_aba_t; 65 + }; 50 66 51 67 /* Reuses the bits in struct page */ 52 68 struct slab { ··· 85 69 #endif 86 70 }; 87 71 /* Double-word boundary */ 88 - union { 89 - struct { 90 - void *freelist; /* first free object */ 91 - union { 92 - unsigned long counters; 93 - struct { 94 - unsigned inuse:16; 95 - unsigned objects:15; 96 - /* 97 - * If slab debugging is enabled then the 98 - * frozen bit can be reused to indicate 99 - * that the slab was corrupted 100 - */ 101 - unsigned frozen:1; 102 - }; 103 - }; 104 - }; 105 - #ifdef system_has_freelist_aba 106 - freelist_aba_t freelist_counter; 107 - #endif 108 - }; 72 + struct freelist_counters; 109 73 }; 110 74 struct rcu_head rcu_head; 111 75 }; ··· 110 114 #undef SLAB_MATCH 111 115 static_assert(sizeof(struct slab) <= sizeof(struct page)); 112 116 #if defined(system_has_freelist_aba) 113 - static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t))); 117 + static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(struct freelist_counters))); 114 118 #endif 115 - 116 - /** 117 - * folio_slab - Converts from folio to slab. 118 - * @folio: The folio. 119 - * 120 - * Currently struct slab is a different representation of a folio where 121 - * folio_test_slab() is true. 
122 - * 123 - * Return: The slab which contains this folio. 124 - */ 125 - #define folio_slab(folio) (_Generic((folio), \ 126 - const struct folio *: (const struct slab *)(folio), \ 127 - struct folio *: (struct slab *)(folio))) 128 119 129 120 /** 130 121 * slab_folio - The folio allocated for a slab ··· 129 146 struct slab *: (struct folio *)s)) 130 147 131 148 /** 132 - * page_slab - Converts from first struct page to slab. 133 - * @p: The first (either head of compound or single) page of slab. 149 + * page_slab - Converts from struct page to its slab. 150 + * @page: A page which may or may not belong to a slab. 134 151 * 135 - * A temporary wrapper to convert struct page to struct slab in situations where 136 - * we know the page is the compound head, or single order-0 page. 137 - * 138 - * Long-term ideally everything would work with struct slab directly or go 139 - * through folio to struct slab. 140 - * 141 - * Return: The slab which contains this page 152 + * Return: The slab which contains this page or NULL if the page does 153 + * not belong to a slab. This includes pages returned from large kmalloc. 
142 154 */ 143 - #define page_slab(p) (_Generic((p), \ 144 - const struct page *: (const struct slab *)(p), \ 145 - struct page *: (struct slab *)(p))) 155 + static inline struct slab *page_slab(const struct page *page) 156 + { 157 + unsigned long head; 158 + 159 + head = READ_ONCE(page->compound_head); 160 + if (head & 1) 161 + page = (struct page *)(head - 1); 162 + if (data_race(page->page_type >> 24) != PGTY_slab) 163 + page = NULL; 164 + 165 + return (struct slab *)page; 166 + } 146 167 147 168 /** 148 169 * slab_page - The first struct page allocated for a slab ··· 175 188 176 189 static inline struct slab *virt_to_slab(const void *addr) 177 190 { 178 - struct folio *folio = virt_to_folio(addr); 179 - 180 - if (!folio_test_slab(folio)) 181 - return NULL; 182 - 183 - return folio_slab(folio); 191 + return page_slab(virt_to_page(addr)); 184 192 } 185 193 186 194 static inline int slab_order(const struct slab *slab) ··· 218 236 * Slab cache management. 219 237 */ 220 238 struct kmem_cache { 221 - #ifndef CONFIG_SLUB_TINY 222 239 struct kmem_cache_cpu __percpu *cpu_slab; 223 240 struct lock_class_key lock_key; 224 - #endif 225 241 struct slub_percpu_sheaves __percpu *cpu_sheaves; 226 242 /* Used for retrieving partial slabs, etc. */ 227 243 slab_flags_t flags; ··· 579 599 * Else we can use all the padding etc for the allocation 580 600 */ 581 601 return s->size; 602 + } 603 + 604 + static inline unsigned int large_kmalloc_order(const struct page *page) 605 + { 606 + return page[1].flags.f & 0xff; 607 + } 608 + 609 + static inline size_t large_kmalloc_size(const struct page *page) 610 + { 611 + return PAGE_SIZE << large_kmalloc_order(page); 582 612 } 583 613 584 614 #ifdef CONFIG_SLUB_DEBUG
+14 -15
mm/slab_common.c
··· 997 997 */ 998 998 size_t __ksize(const void *object) 999 999 { 1000 - struct folio *folio; 1000 + const struct page *page; 1001 + const struct slab *slab; 1001 1002 1002 1003 if (unlikely(object == ZERO_SIZE_PTR)) 1003 1004 return 0; 1004 1005 1005 - folio = virt_to_folio(object); 1006 + page = virt_to_page(object); 1006 1007 1007 - if (unlikely(!folio_test_slab(folio))) { 1008 - if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE)) 1009 - return 0; 1010 - if (WARN_ON(object != folio_address(folio))) 1011 - return 0; 1012 - return folio_size(folio); 1013 - } 1008 + if (unlikely(PageLargeKmalloc(page))) 1009 + return large_kmalloc_size(page); 1010 + 1011 + slab = page_slab(page); 1012 + /* Delete this after we're sure there are no users */ 1013 + if (WARN_ON(!slab)) 1014 + return page_size(page); 1014 1015 1015 1016 #ifdef CONFIG_SLUB_DEBUG 1016 - skip_orig_size_check(folio_slab(folio)->slab_cache, object); 1017 + skip_orig_size_check(slab->slab_cache, object); 1017 1018 #endif 1018 1019 1019 - return slab_ksize(folio_slab(folio)->slab_cache); 1020 + return slab_ksize(slab->slab_cache); 1020 1021 } 1021 1022 1022 1023 gfp_t kmalloc_fix_flags(gfp_t flags) ··· 1615 1614 static bool kfree_rcu_sheaf(void *obj) 1616 1615 { 1617 1616 struct kmem_cache *s; 1618 - struct folio *folio; 1619 1617 struct slab *slab; 1620 1618 1621 1619 if (is_vmalloc_addr(obj)) 1622 1620 return false; 1623 1621 1624 - folio = virt_to_folio(obj); 1625 - if (unlikely(!folio_test_slab(folio))) 1622 + slab = virt_to_slab(obj); 1623 + if (unlikely(!slab)) 1626 1624 return false; 1627 1625 1628 - slab = folio_slab(folio); 1629 1626 s = slab->slab_cache; 1630 1627 if (s->cpu_sheaves) { 1631 1628 if (likely(!IS_ENABLED(CONFIG_NUMA) ||
+352 -342
mm/slub.c
··· 410 410 NR_SLUB_STAT_ITEMS 411 411 }; 412 412 413 - #ifndef CONFIG_SLUB_TINY 413 + struct freelist_tid { 414 + union { 415 + struct { 416 + void *freelist; /* Pointer to next available object */ 417 + unsigned long tid; /* Globally unique transaction id */ 418 + }; 419 + freelist_full_t freelist_tid; 420 + }; 421 + }; 422 + 414 423 /* 415 424 * When changing the layout, make sure freelist and tid are still compatible 416 425 * with this_cpu_cmpxchg_double() alignment requirements. 417 426 */ 418 427 struct kmem_cache_cpu { 419 - union { 420 - struct { 421 - void **freelist; /* Pointer to next available object */ 422 - unsigned long tid; /* Globally unique transaction id */ 423 - }; 424 - freelist_aba_t freelist_tid; 425 - }; 428 + struct freelist_tid; 426 429 struct slab *slab; /* The slab from which we are allocating */ 427 430 #ifdef CONFIG_SLUB_CPU_PARTIAL 428 431 struct slab *partial; /* Partially allocated slabs */ ··· 435 432 unsigned int stat[NR_SLUB_STAT_ITEMS]; 436 433 #endif 437 434 }; 438 - #endif /* CONFIG_SLUB_TINY */ 439 435 440 436 static inline void stat(const struct kmem_cache *s, enum stat_item si) 441 437 { ··· 471 469 struct rcu_head rcu_head; 472 470 struct list_head barn_list; 473 471 /* only used for prefilled sheafs */ 474 - unsigned int capacity; 472 + struct { 473 + unsigned int capacity; 474 + bool pfmemalloc; 475 + }; 475 476 }; 476 477 struct kmem_cache *cache; 477 478 unsigned int size; ··· 599 594 return freelist_ptr_decode(s, p, ptr_addr); 600 595 } 601 596 602 - #ifndef CONFIG_SLUB_TINY 603 597 static void prefetch_freepointer(const struct kmem_cache *s, void *object) 604 598 { 605 599 prefetchw(object + s->offset); 606 600 } 607 - #endif 608 601 609 602 /* 610 603 * When running under KMSAN, get_freepointer_safe() may return an uninitialized ··· 714 711 return s->cpu_partial_slabs; 715 712 } 716 713 #else 714 + #ifdef SLAB_SUPPORTS_SYSFS 717 715 static inline void 718 716 slub_set_cpu_partial(struct kmem_cache *s, unsigned int 
nr_objects) 719 717 { 720 718 } 719 + #endif 721 720 722 721 static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) 723 722 { ··· 760 755 } 761 756 762 757 static inline bool 763 - __update_freelist_fast(struct slab *slab, 764 - void *freelist_old, unsigned long counters_old, 765 - void *freelist_new, unsigned long counters_new) 758 + __update_freelist_fast(struct slab *slab, struct freelist_counters *old, 759 + struct freelist_counters *new) 766 760 { 767 761 #ifdef system_has_freelist_aba 768 - freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old }; 769 - freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new }; 770 - 771 - return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full); 762 + return try_cmpxchg_freelist(&slab->freelist_counters, 763 + &old->freelist_counters, 764 + new->freelist_counters); 772 765 #else 773 766 return false; 774 767 #endif 775 768 } 776 769 777 770 static inline bool 778 - __update_freelist_slow(struct slab *slab, 779 - void *freelist_old, unsigned long counters_old, 780 - void *freelist_new, unsigned long counters_new) 771 + __update_freelist_slow(struct slab *slab, struct freelist_counters *old, 772 + struct freelist_counters *new) 781 773 { 782 774 bool ret = false; 783 775 784 776 slab_lock(slab); 785 - if (slab->freelist == freelist_old && 786 - slab->counters == counters_old) { 787 - slab->freelist = freelist_new; 788 - slab->counters = counters_new; 777 + if (slab->freelist == old->freelist && 778 + slab->counters == old->counters) { 779 + slab->freelist = new->freelist; 780 + slab->counters = new->counters; 789 781 ret = true; 790 782 } 791 783 slab_unlock(slab); ··· 798 796 * interrupt the operation. 
799 797 */ 800 798 static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab, 801 - void *freelist_old, unsigned long counters_old, 802 - void *freelist_new, unsigned long counters_new, 803 - const char *n) 799 + struct freelist_counters *old, struct freelist_counters *new, const char *n) 804 800 { 805 801 bool ret; 806 802 807 803 if (USE_LOCKLESS_FAST_PATH()) 808 804 lockdep_assert_irqs_disabled(); 809 805 810 - if (s->flags & __CMPXCHG_DOUBLE) { 811 - ret = __update_freelist_fast(slab, freelist_old, counters_old, 812 - freelist_new, counters_new); 813 - } else { 814 - ret = __update_freelist_slow(slab, freelist_old, counters_old, 815 - freelist_new, counters_new); 816 - } 806 + if (s->flags & __CMPXCHG_DOUBLE) 807 + ret = __update_freelist_fast(slab, old, new); 808 + else 809 + ret = __update_freelist_slow(slab, old, new); 810 + 817 811 if (likely(ret)) 818 812 return true; 819 813 ··· 824 826 } 825 827 826 828 static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, 827 - void *freelist_old, unsigned long counters_old, 828 - void *freelist_new, unsigned long counters_new, 829 - const char *n) 829 + struct freelist_counters *old, struct freelist_counters *new, const char *n) 830 830 { 831 831 bool ret; 832 832 833 833 if (s->flags & __CMPXCHG_DOUBLE) { 834 - ret = __update_freelist_fast(slab, freelist_old, counters_old, 835 - freelist_new, counters_new); 834 + ret = __update_freelist_fast(slab, old, new); 836 835 } else { 837 836 unsigned long flags; 838 837 839 838 local_irq_save(flags); 840 - ret = __update_freelist_slow(slab, freelist_old, counters_old, 841 - freelist_new, counters_new); 839 + ret = __update_freelist_slow(slab, old, new); 842 840 local_irq_restore(flags); 843 841 } 844 842 if (likely(ret)) ··· 972 978 static slab_flags_t slub_debug; 973 979 #endif 974 980 975 - static char *slub_debug_string; 981 + static const char *slub_debug_string __ro_after_init; 976 982 static int 
disable_higher_order_debug; 977 983 978 984 /* ··· 1779 1785 * 1780 1786 * returns the start of next block if there's any, or NULL 1781 1787 */ 1782 - static char * 1783 - parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) 1788 + static const char * 1789 + parse_slub_debug_flags(const char *str, slab_flags_t *flags, const char **slabs, bool init) 1784 1790 { 1785 1791 bool higher_order_disable = false; 1786 1792 ··· 1857 1863 return NULL; 1858 1864 } 1859 1865 1860 - static int __init setup_slub_debug(char *str) 1866 + static int __init setup_slub_debug(const char *str, const struct kernel_param *kp) 1861 1867 { 1862 1868 slab_flags_t flags; 1863 1869 slab_flags_t global_flags; 1864 - char *saved_str; 1865 - char *slab_list; 1870 + const char *saved_str; 1871 + const char *slab_list; 1866 1872 bool global_slub_debug_changed = false; 1867 1873 bool slab_list_specified = false; 1868 1874 1869 1875 global_flags = DEBUG_DEFAULT_FLAGS; 1870 - if (*str++ != '=' || !*str) 1876 + if (!str || !*str) 1871 1877 /* 1872 1878 * No options specified. Switch on full debugging. 
1873 1879 */ ··· 1911 1917 static_branch_unlikely(&init_on_free)) && 1912 1918 (slub_debug & SLAB_POISON)) 1913 1919 pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n"); 1914 - return 1; 1920 + return 0; 1915 1921 } 1916 1922 1917 - __setup("slab_debug", setup_slub_debug); 1918 - __setup_param("slub_debug", slub_debug, setup_slub_debug, 0); 1923 + static const struct kernel_param_ops param_ops_slab_debug __initconst = { 1924 + .flags = KERNEL_PARAM_OPS_FL_NOARG, 1925 + .set = setup_slub_debug, 1926 + }; 1927 + __core_param_cb(slab_debug, &param_ops_slab_debug, NULL, 0); 1928 + __core_param_cb(slub_debug, &param_ops_slab_debug, NULL, 0); 1919 1929 1920 1930 /* 1921 1931 * kmem_cache_flags - apply debugging options to the cache ··· 1933 1935 */ 1934 1936 slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) 1935 1937 { 1936 - char *iter; 1938 + const char *iter; 1937 1939 size_t len; 1938 - char *next_block; 1940 + const char *next_block; 1939 1941 slab_flags_t block_flags; 1940 1942 slab_flags_t slub_debug_local = slub_debug; 1941 1943 ··· 1959 1961 continue; 1960 1962 /* Found a block that has a slab list, search it */ 1961 1963 while (*iter) { 1962 - char *end, *glob; 1964 + const char *end, *glob; 1963 1965 size_t cmplen; 1964 1966 1965 1967 end = strchrnul(iter, ','); ··· 2021 2023 int objects) {} 2022 2024 static inline void dec_slabs_node(struct kmem_cache *s, int node, 2023 2025 int objects) {} 2024 - #ifndef CONFIG_SLUB_TINY 2025 2026 static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, 2026 2027 void **freelist, void *nextfree) 2027 2028 { 2028 2029 return false; 2029 2030 } 2030 - #endif 2031 2031 #endif /* CONFIG_SLUB_DEBUG */ 2032 + 2033 + /* 2034 + * The allocated objcg pointers array is not accounted directly. 2035 + * Moreover, it should not come from DMA buffer and is not readily 2036 + * reclaimable. So those GFP bits should be masked off. 
2037 + */ 2038 + #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ 2039 + __GFP_ACCOUNT | __GFP_NOFAIL) 2032 2040 2033 2041 #ifdef CONFIG_SLAB_OBJ_EXT 2034 2042 ··· 2089 2085 struct slabobj_ext *vec, unsigned int objects) {} 2090 2086 2091 2087 #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ 2092 - 2093 - /* 2094 - * The allocated objcg pointers array is not accounted directly. 2095 - * Moreover, it should not come from DMA buffer and is not readily 2096 - * reclaimable. So those GFP bits should be masked off. 2097 - */ 2098 - #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ 2099 - __GFP_ACCOUNT | __GFP_NOFAIL) 2100 2088 2101 2089 static inline void init_slab_obj_exts(struct slab *slab) 2102 2090 { ··· 2369 2373 { 2370 2374 struct slabobj_ext *slab_exts; 2371 2375 struct kmem_cache *s; 2372 - struct folio *folio; 2376 + struct page *page; 2373 2377 struct slab *slab; 2374 2378 unsigned long off; 2375 2379 2376 - folio = virt_to_folio(p); 2377 - if (!folio_test_slab(folio)) { 2380 + page = virt_to_page(p); 2381 + if (PageLargeKmalloc(page)) { 2382 + unsigned int order; 2378 2383 int size; 2379 2384 2380 - if (folio_memcg_kmem(folio)) 2385 + if (PageMemcgKmem(page)) 2381 2386 return true; 2382 2387 2383 - if (__memcg_kmem_charge_page(folio_page(folio, 0), flags, 2384 - folio_order(folio))) 2388 + order = large_kmalloc_order(page); 2389 + if (__memcg_kmem_charge_page(page, flags, order)) 2385 2390 return false; 2386 2391 2387 2392 /* 2388 - * This folio has already been accounted in the global stats but 2393 + * This page has already been accounted in the global stats but 2389 2394 * not in the memcg stats. So, subtract from the global and use 2390 2395 * the interface which adds to both global and memcg stats. 
2391 2396 */ 2392 - size = folio_size(folio); 2393 - node_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -size); 2394 - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, size); 2397 + size = PAGE_SIZE << order; 2398 + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, -size); 2399 + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, size); 2395 2400 return true; 2396 2401 } 2397 2402 2398 - slab = folio_slab(folio); 2403 + slab = page_slab(page); 2399 2404 s = slab->slab_cache; 2400 2405 2401 2406 /* ··· 2598 2601 2599 2602 static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) 2600 2603 { 2601 - struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects, 2602 - s->sheaf_capacity), gfp); 2604 + struct slab_sheaf *sheaf; 2605 + size_t sheaf_size; 2606 + 2607 + if (gfp & __GFP_NO_OBJ_EXT) 2608 + return NULL; 2609 + 2610 + gfp &= ~OBJCGS_CLEAR_MASK; 2611 + 2612 + /* 2613 + * Prevent recursion to the same cache, or a deep stack of kmallocs of 2614 + * varying sizes (sheaf capacity might differ for each kmalloc size 2615 + * bucket) 2616 + */ 2617 + if (s->flags & SLAB_KMALLOC) 2618 + gfp |= __GFP_NO_OBJ_EXT; 2619 + 2620 + sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); 2621 + sheaf = kzalloc(sheaf_size, gfp); 2603 2622 2604 2623 if (unlikely(!sheaf)) 2605 2624 return NULL; ··· 2668 2655 if (!sheaf) 2669 2656 return NULL; 2670 2657 2671 - if (refill_sheaf(s, sheaf, gfp)) { 2658 + if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC)) { 2672 2659 free_empty_sheaf(s, sheaf); 2673 2660 return NULL; 2674 2661 } ··· 2746 2733 sheaf->size = 0; 2747 2734 } 2748 2735 2749 - static void __rcu_free_sheaf_prepare(struct kmem_cache *s, 2736 + static bool __rcu_free_sheaf_prepare(struct kmem_cache *s, 2750 2737 struct slab_sheaf *sheaf) 2751 2738 { 2752 2739 bool init = slab_want_init_on_free(s); 2753 2740 void **p = &sheaf->objects[0]; 2754 2741 unsigned int i = 0; 2742 + bool pfmemalloc = false; 2755 2743 2756 2744 while (i < 
sheaf->size) { 2757 2745 struct slab *slab = virt_to_slab(p[i]); ··· 2765 2751 continue; 2766 2752 } 2767 2753 2754 + if (slab_test_pfmemalloc(slab)) 2755 + pfmemalloc = true; 2756 + 2768 2757 i++; 2769 2758 } 2759 + 2760 + return pfmemalloc; 2770 2761 } 2771 2762 2772 2763 static void rcu_free_sheaf_nobarn(struct rcu_head *head) ··· 3034 3015 3035 3016 static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) 3036 3017 { 3037 - struct list_head empty_list; 3038 - struct list_head full_list; 3018 + LIST_HEAD(empty_list); 3019 + LIST_HEAD(full_list); 3039 3020 struct slab_sheaf *sheaf, *sheaf2; 3040 3021 unsigned long flags; 3041 - 3042 - INIT_LIST_HEAD(&empty_list); 3043 - INIT_LIST_HEAD(&full_list); 3044 3022 3045 3023 spin_lock_irqsave(&barn->lock, flags); 3046 3024 ··· 3064 3048 struct kmem_cache_order_objects oo, 3065 3049 bool allow_spin) 3066 3050 { 3067 - struct folio *folio; 3051 + struct page *page; 3068 3052 struct slab *slab; 3069 3053 unsigned int order = oo_order(oo); 3070 3054 3071 3055 if (unlikely(!allow_spin)) 3072 - folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, 3056 + page = alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, 3073 3057 node, order); 3074 3058 else if (node == NUMA_NO_NODE) 3075 - folio = (struct folio *)alloc_frozen_pages(flags, order); 3059 + page = alloc_frozen_pages(flags, order); 3076 3060 else 3077 - folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); 3061 + page = __alloc_frozen_pages(flags, order, node, NULL); 3078 3062 3079 - if (!folio) 3063 + if (!page) 3080 3064 return NULL; 3081 3065 3082 - slab = folio_slab(folio); 3083 - __folio_set_slab(folio); 3084 - if (folio_is_pfmemalloc(folio)) 3066 + __SetPageSlab(page); 3067 + slab = page_slab(page); 3068 + if (page_is_pfmemalloc(page)) 3085 3069 slab_set_pfmemalloc(slab); 3086 3070 3087 3071 return slab; ··· 3305 3289 3306 3290 static void __free_slab(struct kmem_cache *s, struct slab *slab) 3307 3291 { 
3308 - struct folio *folio = slab_folio(slab); 3309 - int order = folio_order(folio); 3292 + struct page *page = slab_page(slab); 3293 + int order = compound_order(page); 3310 3294 int pages = 1 << order; 3311 3295 3312 3296 __slab_clear_pfmemalloc(slab); 3313 - folio->mapping = NULL; 3314 - __folio_clear_slab(folio); 3297 + page->mapping = NULL; 3298 + __ClearPageSlab(page); 3315 3299 mm_account_reclaimed_pages(pages); 3316 3300 unaccount_slab(slab, order, s); 3317 - free_frozen_pages(&folio->page, order); 3301 + free_frozen_pages(page, order); 3318 3302 } 3319 3303 3320 3304 static void rcu_free_slab(struct rcu_head *h) ··· 3634 3618 return get_any_partial(s, pc); 3635 3619 } 3636 3620 3637 - #ifndef CONFIG_SLUB_TINY 3638 - 3639 3621 #ifdef CONFIG_PREEMPTION 3640 3622 /* 3641 3623 * Calculate the next globally unique transaction for disambiguation ··· 3737 3723 void *nextfree, *freelist_iter, *freelist_tail; 3738 3724 int tail = DEACTIVATE_TO_HEAD; 3739 3725 unsigned long flags = 0; 3740 - struct slab new; 3741 - struct slab old; 3726 + struct freelist_counters old, new; 3742 3727 3743 3728 if (READ_ONCE(slab->freelist)) { 3744 3729 stat(s, DEACTIVATE_REMOTE_FREES); ··· 3786 3773 } else { 3787 3774 new.freelist = old.freelist; 3788 3775 } 3789 - } while (!slab_update_freelist(s, slab, 3790 - old.freelist, old.counters, 3791 - new.freelist, new.counters, 3792 - "unfreezing slab")); 3776 + } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); 3793 3777 3794 3778 /* 3795 3779 * Stage three: Manipulate the slab list based on the updated state. 
··· 4028 4018 4029 4019 return c->slab || slub_percpu_partial(c); 4030 4020 } 4031 - 4032 - #else /* CONFIG_SLUB_TINY */ 4033 - static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } 4034 - static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; } 4035 - static inline void flush_this_cpu_slab(struct kmem_cache *s) { } 4036 - #endif /* CONFIG_SLUB_TINY */ 4037 4021 4038 4022 static bool has_pcs_used(int cpu, struct kmem_cache *s) 4039 4023 { ··· 4369 4365 return true; 4370 4366 } 4371 4367 4372 - #ifndef CONFIG_SLUB_TINY 4373 4368 static inline bool 4374 4369 __update_cpu_freelist_fast(struct kmem_cache *s, 4375 4370 void *freelist_old, void *freelist_new, 4376 4371 unsigned long tid) 4377 4372 { 4378 - freelist_aba_t old = { .freelist = freelist_old, .counter = tid }; 4379 - freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) }; 4373 + struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; 4374 + struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; 4380 4375 4381 - return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full, 4382 - &old.full, new.full); 4376 + return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, 4377 + &old.freelist_tid, new.freelist_tid); 4383 4378 } 4384 4379 4385 4380 /* ··· 4391 4388 */ 4392 4389 static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) 4393 4390 { 4394 - struct slab new; 4395 - unsigned long counters; 4396 - void *freelist; 4391 + struct freelist_counters old, new; 4397 4392 4398 4393 lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); 4399 4394 4400 4395 do { 4401 - freelist = slab->freelist; 4402 - counters = slab->counters; 4396 + old.freelist = slab->freelist; 4397 + old.counters = slab->counters; 4403 4398 4404 - new.counters = counters; 4399 + new.freelist = NULL; 4400 + new.counters = old.counters; 4405 4401 4406 - new.inuse = slab->objects; 4407 - new.frozen = freelist != NULL; 4402 + 
new.inuse = old.objects; 4403 + new.frozen = old.freelist != NULL; 4408 4404 4409 - } while (!__slab_update_freelist(s, slab, 4410 - freelist, counters, 4411 - NULL, new.counters, 4412 - "get_freelist")); 4413 4405 4414 - return freelist; 4406 + } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); 4407 + 4408 + return old.freelist; 4415 4409 } 4416 4410 4417 4411 /* ··· 4416 4416 */ 4417 4417 static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) 4418 4418 { 4419 - struct slab new; 4420 - unsigned long counters; 4421 - void *freelist; 4419 + struct freelist_counters old, new; 4422 4420 4423 4421 do { 4424 - freelist = slab->freelist; 4425 - counters = slab->counters; 4422 + old.freelist = slab->freelist; 4423 + old.counters = slab->counters; 4426 4424 4427 - new.counters = counters; 4425 + new.freelist = NULL; 4426 + new.counters = old.counters; 4428 4427 VM_BUG_ON(new.frozen); 4429 4428 4430 - new.inuse = slab->objects; 4429 + new.inuse = old.objects; 4431 4430 new.frozen = 1; 4432 4431 4433 - } while (!slab_update_freelist(s, slab, 4434 - freelist, counters, 4435 - NULL, new.counters, 4436 - "freeze_slab")); 4432 + } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); 4437 4433 4438 - return freelist; 4434 + return old.freelist; 4439 4435 } 4440 4436 4441 4437 /* ··· 4625 4629 pc.orig_size = orig_size; 4626 4630 slab = get_partial(s, node, &pc); 4627 4631 if (slab) { 4628 - if (kmem_cache_debug(s)) { 4632 + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4629 4633 freelist = pc.object; 4630 4634 /* 4631 4635 * For debug caches here we had to go through ··· 4663 4667 4664 4668 stat(s, ALLOC_SLAB); 4665 4669 4666 - if (kmem_cache_debug(s)) { 4670 + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4667 4671 freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 4668 4672 4669 4673 if (unlikely(!freelist)) { ··· 4875 4879 4876 4880 return object; 4877 4881 } 4878 - #else /* 
CONFIG_SLUB_TINY */ 4879 - static void *__slab_alloc_node(struct kmem_cache *s, 4880 - gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) 4881 - { 4882 - struct partial_context pc; 4883 - struct slab *slab; 4884 - void *object; 4885 - 4886 - pc.flags = gfpflags; 4887 - pc.orig_size = orig_size; 4888 - slab = get_partial(s, node, &pc); 4889 - 4890 - if (slab) 4891 - return pc.object; 4892 - 4893 - slab = new_slab(s, gfpflags, node); 4894 - if (unlikely(!slab)) { 4895 - slab_out_of_memory(s, gfpflags, node); 4896 - return NULL; 4897 - } 4898 - 4899 - object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 4900 - 4901 - return object; 4902 - } 4903 - #endif /* CONFIG_SLUB_TINY */ 4904 4882 4905 4883 /* 4906 4884 * If the object has been wiped upon free, make sure it's fully initialized by ··· 5015 5045 return NULL; 5016 5046 5017 5047 if (empty) { 5018 - if (!refill_sheaf(s, empty, gfp)) { 5048 + if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC)) { 5019 5049 full = empty; 5020 5050 } else { 5021 5051 /* ··· 5126 5156 * be false because of cpu migration during an unlocked part of 5127 5157 * the current allocation or previous freeing process. 
5128 5158 */ 5129 - if (folio_nid(virt_to_folio(object)) != node) { 5159 + if (page_to_nid(virt_to_page(object)) != node) { 5130 5160 local_unlock(&s->cpu_sheaves->lock); 5131 5161 return NULL; 5132 5162 } ··· 5315 5345 } 5316 5346 EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); 5317 5347 5348 + static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, 5349 + struct slab_sheaf *sheaf, gfp_t gfp) 5350 + { 5351 + int ret = 0; 5352 + 5353 + ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC); 5354 + 5355 + if (likely(!ret || !gfp_pfmemalloc_allowed(gfp))) 5356 + return ret; 5357 + 5358 + /* 5359 + * if we are allowed to, refill sheaf with pfmemalloc but then remember 5360 + * it for when it's returned 5361 + */ 5362 + ret = refill_sheaf(s, sheaf, gfp); 5363 + sheaf->pfmemalloc = true; 5364 + 5365 + return ret; 5366 + } 5367 + 5318 5368 /* 5319 5369 * returns a sheaf that has at least the requested size 5320 5370 * when prefilling is needed, do so with given gfp flags ··· 5369 5379 sheaf->cache = s; 5370 5380 sheaf->capacity = size; 5371 5381 5382 + /* 5383 + * we do not need to care about pfmemalloc here because oversize 5384 + * sheaves area always flushed and freed when returned 5385 + */ 5372 5386 if (!__kmem_cache_alloc_bulk(s, gfp, size, 5373 5387 &sheaf->objects[0])) { 5374 5388 kfree(sheaf); ··· 5409 5415 if (!sheaf) 5410 5416 sheaf = alloc_empty_sheaf(s, gfp); 5411 5417 5412 - if (sheaf && sheaf->size < size) { 5413 - if (refill_sheaf(s, sheaf, gfp)) { 5418 + if (sheaf) { 5419 + sheaf->capacity = s->sheaf_capacity; 5420 + sheaf->pfmemalloc = false; 5421 + 5422 + if (sheaf->size < size && 5423 + __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) { 5414 5424 sheaf_flush_unused(s, sheaf); 5415 5425 free_empty_sheaf(s, sheaf); 5416 5426 sheaf = NULL; 5417 5427 } 5418 5428 } 5419 - 5420 - if (sheaf) 5421 - sheaf->capacity = s->sheaf_capacity; 5422 5429 5423 5430 return sheaf; 5424 5431 } ··· 5440 5445 struct slub_percpu_sheaves *pcs; 5441 5446 struct node_barn *barn; 5442 
5447 5443 - if (unlikely(sheaf->capacity != s->sheaf_capacity)) { 5448 + if (unlikely((sheaf->capacity != s->sheaf_capacity) 5449 + || sheaf->pfmemalloc)) { 5444 5450 sheaf_flush_unused(s, sheaf); 5445 5451 kfree(sheaf); 5446 5452 return; ··· 5507 5511 5508 5512 if (likely(sheaf->capacity >= size)) { 5509 5513 if (likely(sheaf->capacity == s->sheaf_capacity)) 5510 - return refill_sheaf(s, sheaf, gfp); 5514 + return __prefill_sheaf_pfmemalloc(s, sheaf, gfp); 5511 5515 5512 5516 if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, 5513 5517 &sheaf->objects[sheaf->size])) { ··· 5540 5544 * 5541 5545 * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT 5542 5546 * memcg charging is forced over limit if necessary, to avoid failure. 5547 + * 5548 + * It is possible that the allocation comes from kfence and then the sheaf 5549 + * size is not decreased. 5543 5550 */ 5544 5551 void * 5545 5552 kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, ··· 5554 5555 if (sheaf->size == 0) 5555 5556 goto out; 5556 5557 5557 - ret = sheaf->objects[--sheaf->size]; 5558 + ret = kfence_alloc(s, s->object_size, gfp); 5559 + 5560 + if (likely(!ret)) 5561 + ret = sheaf->objects[--sheaf->size]; 5558 5562 5559 5563 init = slab_want_init_on_alloc(gfp, s); 5560 5564 ··· 5580 5578 */ 5581 5579 static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) 5582 5580 { 5583 - struct folio *folio; 5581 + struct page *page; 5584 5582 void *ptr = NULL; 5585 5583 unsigned int order = get_order(size); 5586 5584 ··· 5590 5588 flags |= __GFP_COMP; 5591 5589 5592 5590 if (node == NUMA_NO_NODE) 5593 - folio = (struct folio *)alloc_frozen_pages_noprof(flags, order); 5591 + page = alloc_frozen_pages_noprof(flags, order); 5594 5592 else 5595 - folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, node, NULL); 5593 + page = __alloc_frozen_pages_noprof(flags, order, node, NULL); 5596 5594 5597 - if (folio) { 5598 - ptr = 
folio_address(folio); 5599 - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, 5595 + if (page) { 5596 + ptr = page_address(page); 5597 + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, 5600 5598 PAGE_SIZE << order); 5601 - __folio_set_large_kmalloc(folio); 5599 + __SetPageLargeKmalloc(page); 5602 5600 } 5603 5601 5604 5602 ptr = kasan_kmalloc_large(ptr, size, flags); ··· 5725 5723 * it did local_lock_irqsave(&s->cpu_slab->lock, flags). 5726 5724 * In this case fast path with __update_cpu_freelist_fast() is not safe. 5727 5725 */ 5728 - #ifndef CONFIG_SLUB_TINY 5729 5726 if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) 5730 - #endif 5731 5727 ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); 5732 5728 5733 5729 if (PTR_ERR(ret) == -EBUSY) { ··· 5863 5863 unsigned long addr) 5864 5864 5865 5865 { 5866 - void *prior; 5867 - int was_frozen; 5868 - struct slab new; 5869 - unsigned long counters; 5866 + bool was_frozen, was_full; 5867 + struct freelist_counters old, new; 5870 5868 struct kmem_cache_node *n = NULL; 5871 5869 unsigned long flags; 5872 5870 bool on_node_partial; ··· 5876 5878 return; 5877 5879 } 5878 5880 5881 + /* 5882 + * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below 5883 + * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) 5884 + * is the only other reason it can be false, and it is already handled 5885 + * above. 
5886 + */ 5887 + 5879 5888 do { 5880 5889 if (unlikely(n)) { 5881 5890 spin_unlock_irqrestore(&n->list_lock, flags); 5882 5891 n = NULL; 5883 5892 } 5884 - prior = slab->freelist; 5885 - counters = slab->counters; 5886 - set_freepointer(s, tail, prior); 5887 - new.counters = counters; 5888 - was_frozen = new.frozen; 5893 + 5894 + old.freelist = slab->freelist; 5895 + old.counters = slab->counters; 5896 + 5897 + was_full = (old.freelist == NULL); 5898 + was_frozen = old.frozen; 5899 + 5900 + set_freepointer(s, tail, old.freelist); 5901 + 5902 + new.freelist = head; 5903 + new.counters = old.counters; 5889 5904 new.inuse -= cnt; 5890 - if ((!new.inuse || !prior) && !was_frozen) { 5891 - /* Needs to be taken off a list */ 5892 - if (!kmem_cache_has_cpu_partial(s) || prior) { 5905 + 5906 + /* 5907 + * Might need to be taken off (due to becoming empty) or added 5908 + * to (due to not being full anymore) the partial list. 5909 + * Unless it's frozen. 5910 + */ 5911 + if ((!new.inuse || was_full) && !was_frozen) { 5912 + /* 5913 + * If slab becomes non-full and we have cpu partial 5914 + * lists, we put it there unconditionally to avoid 5915 + * taking the list_lock. Otherwise we need it. 5916 + */ 5917 + if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { 5893 5918 5894 5919 n = get_node(s, slab_nid(slab)); 5895 5920 /* ··· 5929 5908 } 5930 5909 } 5931 5910 5932 - } while (!slab_update_freelist(s, slab, 5933 - prior, counters, 5934 - head, new.counters, 5935 - "__slab_free")); 5911 + } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); 5936 5912 5937 5913 if (likely(!n)) { 5938 5914 ··· 5939 5921 * activity can be necessary. 5940 5922 */ 5941 5923 stat(s, FREE_FROZEN); 5942 - } else if (kmem_cache_has_cpu_partial(s) && !prior) { 5924 + } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { 5943 5925 /* 5944 5926 * If we started with a full slab then put it onto the 5945 5927 * per cpu partial list. 
··· 5948 5930 stat(s, CPU_PARTIAL_FREE); 5949 5931 } 5950 5932 5933 + /* 5934 + * In other cases we didn't take the list_lock because the slab 5935 + * was already on the partial list and will remain there. 5936 + */ 5937 + 5951 5938 return; 5952 5939 } 5953 5940 ··· 5960 5937 * This slab was partially empty but not on the per-node partial list, 5961 5938 * in which case we shouldn't manipulate its list, just return. 5962 5939 */ 5963 - if (prior && !on_node_partial) { 5940 + if (!was_full && !on_node_partial) { 5964 5941 spin_unlock_irqrestore(&n->list_lock, flags); 5965 5942 return; 5966 5943 } 5967 5944 5945 + /* 5946 + * If slab became empty, should we add/keep it on the partial list or we 5947 + * have enough? 5948 + */ 5968 5949 if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) 5969 5950 goto slab_empty; 5970 5951 5971 5952 /* 5972 5953 * Objects left in the slab. If it was not on the partial list before 5973 - * then add it. 5954 + * then add it. This can only happen when cache has no per cpu partial 5955 + * list otherwise we would have put it there. 5974 5956 */ 5975 - if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { 5957 + if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { 5976 5958 add_partial(n, slab, DEACTIVATE_TO_TAIL); 5977 5959 stat(s, FREE_ADD_PARTIAL); 5978 5960 } ··· 5985 5957 return; 5986 5958 5987 5959 slab_empty: 5988 - if (prior) { 5989 - /* 5990 - * Slab on the partial list. 5991 - */ 5960 + /* 5961 + * The slab could have a single object and thus go from full to empty in 5962 + * a single free, but more likely it was on the partial list. Remove it. 5963 + */ 5964 + if (likely(!was_full)) { 5992 5965 remove_partial(n, slab); 5993 5966 stat(s, FREE_REMOVE_PARTIAL); 5994 5967 } ··· 6214 6185 * handles it fine. The only downside is that sheaf will serve fewer 6215 6186 * allocations when reused. It only happens due to debugging, which is a 6216 6187 * performance hit anyway. 
6188 + * 6189 + * If it returns true, there was at least one object from pfmemalloc 6190 + * slab so simply flush everything. 6217 6191 */ 6218 - __rcu_free_sheaf_prepare(s, sheaf); 6192 + if (__rcu_free_sheaf_prepare(s, sheaf)) 6193 + goto flush; 6219 6194 6220 6195 n = get_node(s, sheaf->node); 6221 6196 if (!n) ··· 6372 6339 continue; 6373 6340 } 6374 6341 6375 - if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) { 6342 + if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node) 6343 + || slab_test_pfmemalloc(slab))) { 6376 6344 remote_objects[remote_nr] = p[i]; 6377 6345 p[i] = p[--size]; 6378 6346 if (++remote_nr >= PCS_BATCH_MAX) ··· 6521 6487 llist_for_each_safe(pos, t, llnode) { 6522 6488 struct slab *slab = container_of(pos, struct slab, llnode); 6523 6489 6524 - #ifdef CONFIG_SLUB_TINY 6525 - free_slab(slab->slab_cache, slab); 6526 - #else 6527 6490 if (slab->frozen) 6528 6491 deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); 6529 6492 else 6530 6493 free_slab(slab->slab_cache, slab); 6531 - #endif 6532 6494 } 6533 6495 } 6534 6496 ··· 6560 6530 irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); 6561 6531 } 6562 6532 6563 - #ifndef CONFIG_SLUB_TINY 6564 6533 /* 6565 6534 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that 6566 6535 * can perform fastpath freeing without additional function calls. 
··· 6652 6623 } 6653 6624 stat_add(s, FREE_FASTPATH, cnt); 6654 6625 } 6655 - #else /* CONFIG_SLUB_TINY */ 6656 - static void do_slab_free(struct kmem_cache *s, 6657 - struct slab *slab, void *head, void *tail, 6658 - int cnt, unsigned long addr) 6659 - { 6660 - __slab_free(s, slab, head, tail, cnt, addr); 6661 - } 6662 - #endif /* CONFIG_SLUB_TINY */ 6663 6626 6664 6627 static __fastpath_inline 6665 6628 void slab_free(struct kmem_cache *s, struct slab *slab, void *object, ··· 6664 6643 return; 6665 6644 6666 6645 if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || 6667 - slab_nid(slab) == numa_mem_id())) { 6646 + slab_nid(slab) == numa_mem_id()) 6647 + && likely(!slab_test_pfmemalloc(slab))) { 6668 6648 if (likely(free_to_pcs(s, object))) 6669 6649 return; 6670 6650 } ··· 6775 6753 } 6776 6754 EXPORT_SYMBOL(kmem_cache_free); 6777 6755 6778 - static void free_large_kmalloc(struct folio *folio, void *object) 6756 + static void free_large_kmalloc(struct page *page, void *object) 6779 6757 { 6780 - unsigned int order = folio_order(folio); 6758 + unsigned int order = compound_order(page); 6781 6759 6782 - if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) { 6783 - dump_page(&folio->page, "Not a kmalloc allocation"); 6760 + if (WARN_ON_ONCE(!PageLargeKmalloc(page))) { 6761 + dump_page(page, "Not a kmalloc allocation"); 6784 6762 return; 6785 6763 } 6786 6764 ··· 6791 6769 kasan_kfree_large(object); 6792 6770 kmsan_kfree_large(object); 6793 6771 6794 - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, 6772 + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, 6795 6773 -(PAGE_SIZE << order)); 6796 - __folio_clear_large_kmalloc(folio); 6797 - free_frozen_pages(&folio->page, order); 6774 + __ClearPageLargeKmalloc(page); 6775 + free_frozen_pages(page, order); 6798 6776 } 6799 6777 6800 6778 /* ··· 6804 6782 void kvfree_rcu_cb(struct rcu_head *head) 6805 6783 { 6806 6784 void *obj = head; 6807 - struct folio *folio; 6785 + struct page *page; 6808 6786 struct slab 
*slab; 6809 6787 struct kmem_cache *s; 6810 6788 void *slab_addr; ··· 6815 6793 return; 6816 6794 } 6817 6795 6818 - folio = virt_to_folio(obj); 6819 - if (!folio_test_slab(folio)) { 6796 + page = virt_to_page(obj); 6797 + slab = page_slab(page); 6798 + if (!slab) { 6820 6799 /* 6821 6800 * rcu_head offset can be only less than page size so no need to 6822 - * consider folio order 6801 + * consider allocation order 6823 6802 */ 6824 6803 obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); 6825 - free_large_kmalloc(folio, obj); 6804 + free_large_kmalloc(page, obj); 6826 6805 return; 6827 6806 } 6828 6807 6829 - slab = folio_slab(folio); 6830 6808 s = slab->slab_cache; 6831 - slab_addr = folio_address(folio); 6809 + slab_addr = slab_address(slab); 6832 6810 6833 6811 if (is_kfence_address(obj)) { 6834 6812 obj = kfence_object_start(obj); ··· 6850 6828 */ 6851 6829 void kfree(const void *object) 6852 6830 { 6853 - struct folio *folio; 6831 + struct page *page; 6854 6832 struct slab *slab; 6855 6833 struct kmem_cache *s; 6856 6834 void *x = (void *)object; ··· 6860 6838 if (unlikely(ZERO_OR_NULL_PTR(object))) 6861 6839 return; 6862 6840 6863 - folio = virt_to_folio(object); 6864 - if (unlikely(!folio_test_slab(folio))) { 6865 - free_large_kmalloc(folio, (void *)object); 6841 + page = virt_to_page(object); 6842 + slab = page_slab(page); 6843 + if (!slab) { 6844 + free_large_kmalloc(page, (void *)object); 6866 6845 return; 6867 6846 } 6868 6847 6869 - slab = folio_slab(folio); 6870 6848 s = slab->slab_cache; 6871 6849 slab_free(s, slab, x, _RET_IP_); 6872 6850 } ··· 6883 6861 */ 6884 6862 void kfree_nolock(const void *object) 6885 6863 { 6886 - struct folio *folio; 6887 6864 struct slab *slab; 6888 6865 struct kmem_cache *s; 6889 6866 void *x = (void *)object; ··· 6890 6869 if (unlikely(ZERO_OR_NULL_PTR(object))) 6891 6870 return; 6892 6871 6893 - folio = virt_to_folio(object); 6894 - if (unlikely(!folio_test_slab(folio))) { 6872 + slab = virt_to_slab(object); 6873 + if 
(unlikely(!slab)) { 6895 6874 WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); 6896 6875 return; 6897 6876 } 6898 6877 6899 - slab = folio_slab(folio); 6900 6878 s = slab->slab_cache; 6901 6879 6902 6880 memcg_slab_free_hook(s, slab, &x, 1); ··· 6927 6907 * since kasan quarantine takes locks and not supported from NMI. 6928 6908 */ 6929 6909 kasan_slab_free(s, x, false, false, /* skip quarantine */true); 6930 - #ifndef CONFIG_SLUB_TINY 6931 6910 do_slab_free(s, slab, x, x, 0, _RET_IP_); 6932 - #else 6933 - defer_free(s, x); 6934 - #endif 6935 6911 } 6936 6912 EXPORT_SYMBOL_GPL(kfree_nolock); 6937 6913 ··· 6959 6943 if (is_kfence_address(p)) { 6960 6944 ks = orig_size = kfence_ksize(p); 6961 6945 } else { 6962 - struct folio *folio; 6946 + struct page *page = virt_to_page(p); 6947 + struct slab *slab = page_slab(page); 6963 6948 6964 - folio = virt_to_folio(p); 6965 - if (unlikely(!folio_test_slab(folio))) { 6949 + if (!slab) { 6966 6950 /* Big kmalloc object */ 6967 - WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE); 6968 - WARN_ON(p != folio_address(folio)); 6969 - ks = folio_size(folio); 6951 + ks = page_size(page); 6952 + WARN_ON(ks <= KMALLOC_MAX_CACHE_SIZE); 6953 + WARN_ON(p != page_address(page)); 6970 6954 } else { 6971 - s = folio_slab(folio)->slab_cache; 6955 + s = slab->slab_cache; 6972 6956 orig_size = get_orig_size(s, (void *)p); 6973 6957 ks = s->object_size; 6974 6958 } ··· 7272 7256 { 7273 7257 int lookahead = 3; 7274 7258 void *object; 7275 - struct folio *folio; 7259 + struct page *page; 7260 + struct slab *slab; 7276 7261 size_t same; 7277 7262 7278 7263 object = p[--size]; 7279 - folio = virt_to_folio(object); 7264 + page = virt_to_page(object); 7265 + slab = page_slab(page); 7280 7266 if (!s) { 7281 7267 /* Handle kalloc'ed objects */ 7282 - if (unlikely(!folio_test_slab(folio))) { 7283 - free_large_kmalloc(folio, object); 7268 + if (!slab) { 7269 + free_large_kmalloc(page, object); 7284 7270 df->slab = NULL; 7285 7271 
return size; 7286 7272 } 7287 7273 /* Derive kmem_cache from object */ 7288 - df->slab = folio_slab(folio); 7289 - df->s = df->slab->slab_cache; 7274 + df->slab = slab; 7275 + df->s = slab->slab_cache; 7290 7276 } else { 7291 - df->slab = folio_slab(folio); 7277 + df->slab = slab; 7292 7278 df->s = cache_from_obj(s, object); /* Support for memcg */ 7293 7279 } 7294 7280 ··· 7379 7361 } 7380 7362 EXPORT_SYMBOL(kmem_cache_free_bulk); 7381 7363 7382 - #ifndef CONFIG_SLUB_TINY 7383 7364 static inline 7384 7365 int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 7385 7366 void **p) ··· 7396 7379 local_lock_irqsave(&s->cpu_slab->lock, irqflags); 7397 7380 7398 7381 for (i = 0; i < size; i++) { 7399 - void *object = kfence_alloc(s, s->object_size, flags); 7382 + void *object = c->freelist; 7400 7383 7401 - if (unlikely(object)) { 7402 - p[i] = object; 7403 - continue; 7404 - } 7405 - 7406 - object = c->freelist; 7407 7384 if (unlikely(!object)) { 7408 7385 /* 7409 7386 * We may have removed an object from c->freelist using ··· 7443 7432 return 0; 7444 7433 7445 7434 } 7446 - #else /* CONFIG_SLUB_TINY */ 7447 - static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 7448 - size_t size, void **p) 7449 - { 7450 - int i; 7451 - 7452 - for (i = 0; i < size; i++) { 7453 - void *object = kfence_alloc(s, s->object_size, flags); 7454 - 7455 - if (unlikely(object)) { 7456 - p[i] = object; 7457 - continue; 7458 - } 7459 - 7460 - p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE, 7461 - _RET_IP_, s->object_size); 7462 - if (unlikely(!p[i])) 7463 - goto error; 7464 - 7465 - maybe_wipe_obj_freeptr(s, p[i]); 7466 - } 7467 - 7468 - return i; 7469 - 7470 - error: 7471 - __kmem_cache_free_bulk(s, i, p); 7472 - return 0; 7473 - } 7474 - #endif /* CONFIG_SLUB_TINY */ 7475 7435 7476 7436 /* Note that interrupts must be enabled when calling this function. 
*/ 7477 7437 int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, 7478 7438 void **p) 7479 7439 { 7480 7440 unsigned int i = 0; 7441 + void *kfence_obj; 7481 7442 7482 7443 if (!size) 7483 7444 return 0; ··· 7457 7474 s = slab_pre_alloc_hook(s, flags); 7458 7475 if (unlikely(!s)) 7459 7476 return 0; 7477 + 7478 + /* 7479 + * to make things simpler, only assume at most once kfence allocated 7480 + * object per bulk allocation and choose its index randomly 7481 + */ 7482 + kfence_obj = kfence_alloc(s, s->object_size, flags); 7483 + 7484 + if (unlikely(kfence_obj)) { 7485 + if (unlikely(size == 1)) { 7486 + p[0] = kfence_obj; 7487 + goto out; 7488 + } 7489 + size--; 7490 + } 7460 7491 7461 7492 if (s->cpu_sheaves) 7462 7493 i = alloc_from_pcs_bulk(s, size, p); ··· 7483 7486 if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { 7484 7487 if (i > 0) 7485 7488 __kmem_cache_free_bulk(s, i, p); 7489 + if (kfence_obj) 7490 + __kfence_free(kfence_obj); 7486 7491 return 0; 7487 7492 } 7488 7493 } 7489 7494 7495 + if (unlikely(kfence_obj)) { 7496 + int idx = get_random_u32_below(size + 1); 7497 + 7498 + if (idx != size) 7499 + p[size] = p[idx]; 7500 + p[idx] = kfence_obj; 7501 + 7502 + size++; 7503 + } 7504 + 7505 + out: 7490 7506 /* 7491 7507 * memcg and kmem_cache debug support and memory initialization. 7492 7508 * Done outside of the IRQ disabled fastpath loop. 
··· 7661 7651 barn_init(barn); 7662 7652 } 7663 7653 7664 - #ifndef CONFIG_SLUB_TINY 7665 7654 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 7666 7655 { 7667 7656 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < ··· 7681 7672 7682 7673 return 1; 7683 7674 } 7684 - #else 7685 - static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 7686 - { 7687 - return 1; 7688 - } 7689 - #endif /* CONFIG_SLUB_TINY */ 7690 7675 7691 7676 static int init_percpu_sheaves(struct kmem_cache *s) 7692 7677 { ··· 7770 7767 cache_random_seq_destroy(s); 7771 7768 if (s->cpu_sheaves) 7772 7769 pcs_destroy(s); 7773 - #ifndef CONFIG_SLUB_TINY 7774 7770 #ifdef CONFIG_PREEMPT_RT 7775 7771 if (s->cpu_slab) 7776 7772 lockdep_unregister_key(&s->lock_key); 7777 7773 #endif 7778 7774 free_percpu(s->cpu_slab); 7779 - #endif 7780 7775 free_kmem_cache_nodes(s); 7781 7776 } 7782 7777 ··· 8140 8139 * Kmalloc subsystem 8141 8140 *******************************************************************/ 8142 8141 8143 - static int __init setup_slub_min_order(char *str) 8142 + static int __init setup_slub_min_order(const char *str, const struct kernel_param *kp) 8144 8143 { 8145 - get_option(&str, (int *)&slub_min_order); 8144 + int ret; 8145 + 8146 + ret = kstrtouint(str, 0, &slub_min_order); 8147 + if (ret) 8148 + return ret; 8146 8149 8147 8150 if (slub_min_order > slub_max_order) 8148 8151 slub_max_order = slub_min_order; 8149 8152 8150 - return 1; 8153 + return 0; 8151 8154 } 8152 8155 8153 - __setup("slab_min_order=", setup_slub_min_order); 8154 - __setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0); 8156 + static const struct kernel_param_ops param_ops_slab_min_order __initconst = { 8157 + .set = setup_slub_min_order, 8158 + }; 8159 + __core_param_cb(slab_min_order, &param_ops_slab_min_order, &slub_min_order, 0); 8160 + __core_param_cb(slub_min_order, &param_ops_slab_min_order, &slub_min_order, 0); 8155 8161 8156 - 8157 - static int __init setup_slub_max_order(char *str) 
8162 + static int __init setup_slub_max_order(const char *str, const struct kernel_param *kp) 8158 8163 { 8159 - get_option(&str, (int *)&slub_max_order); 8164 + int ret; 8165 + 8166 + ret = kstrtouint(str, 0, &slub_max_order); 8167 + if (ret) 8168 + return ret; 8169 + 8160 8170 slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER); 8161 8171 8162 8172 if (slub_min_order > slub_max_order) 8163 8173 slub_min_order = slub_max_order; 8164 8174 8165 - return 1; 8175 + return 0; 8166 8176 } 8167 8177 8168 - __setup("slab_max_order=", setup_slub_max_order); 8169 - __setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0); 8178 + static const struct kernel_param_ops param_ops_slab_max_order __initconst = { 8179 + .set = setup_slub_max_order, 8180 + }; 8181 + __core_param_cb(slab_max_order, &param_ops_slab_max_order, &slub_max_order, 0); 8182 + __core_param_cb(slub_max_order, &param_ops_slab_max_order, &slub_max_order, 0); 8170 8183 8171 - static int __init setup_slub_min_objects(char *str) 8172 - { 8173 - get_option(&str, (int *)&slub_min_objects); 8174 - 8175 - return 1; 8176 - } 8177 - 8178 - __setup("slab_min_objects=", setup_slub_min_objects); 8179 - __setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0); 8184 + core_param(slab_min_objects, slub_min_objects, uint, 0); 8185 + core_param(slub_min_objects, slub_min_objects, uint, 0); 8180 8186 8181 8187 #ifdef CONFIG_NUMA 8182 - static int __init setup_slab_strict_numa(char *str) 8188 + static int __init setup_slab_strict_numa(const char *str, const struct kernel_param *kp) 8183 8189 { 8184 8190 if (nr_node_ids > 1) { 8185 8191 static_branch_enable(&strict_numa); ··· 8195 8187 pr_warn("slab_strict_numa parameter set on non NUMA system.\n"); 8196 8188 } 8197 8189 8198 - return 1; 8190 + return 0; 8199 8191 } 8200 8192 8201 - __setup("slab_strict_numa", setup_slab_strict_numa); 8193 + static const struct kernel_param_ops param_ops_slab_strict_numa __initconst = { 8194 
+ .flags = KERNEL_PARAM_OPS_FL_NOARG, 8195 + .set = setup_slab_strict_numa, 8196 + }; 8197 + __core_param_cb(slab_strict_numa, &param_ops_slab_strict_numa, NULL, 0); 8202 8198 #endif 8203 8199 8204 8200 ··· 8528 8516 8529 8517 void __init kmem_cache_init_late(void) 8530 8518 { 8531 - #ifndef CONFIG_SLUB_TINY 8532 8519 flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); 8533 8520 WARN_ON(!flushwq); 8534 - #endif 8535 8521 } 8536 8522 8537 8523 struct kmem_cache *
+16 -8
mm/usercopy.c
··· 164 164 { 165 165 unsigned long addr = (unsigned long)ptr; 166 166 unsigned long offset; 167 - struct folio *folio; 167 + struct page *page; 168 + struct slab *slab; 168 169 169 170 if (is_kmap_addr(ptr)) { 170 171 offset = offset_in_page(ptr); ··· 190 189 if (!virt_addr_valid(ptr)) 191 190 return; 192 191 193 - folio = virt_to_folio(ptr); 194 - 195 - if (folio_test_slab(folio)) { 192 + page = virt_to_page(ptr); 193 + slab = page_slab(page); 194 + if (slab) { 196 195 /* Check slab allocator for flags and size. */ 197 - __check_heap_object(ptr, n, folio_slab(folio), to_user); 198 - } else if (folio_test_large(folio)) { 199 - offset = ptr - folio_address(folio); 200 - if (n > folio_size(folio) - offset) 196 + __check_heap_object(ptr, n, slab, to_user); 197 + } else if (PageCompound(page)) { 198 + page = compound_head(page); 199 + offset = ptr - page_address(page); 200 + if (n > page_size(page) - offset) 201 201 usercopy_abort("page alloc", NULL, to_user, offset, n); 202 202 } 203 + 204 + /* 205 + * We cannot check non-compound pages. They might be part of 206 + * a large allocation, in which case crossing a page boundary 207 + * is fine. 208 + */ 203 209 } 204 210 205 211 DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,