Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'slab-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab

Pull slab updates from Vlastimil Babka:

- mempool_alloc_bulk() support for upcoming users in the block layer
that need to allocate multiple objects at once with the mempool's
guaranteed progress semantics, which is not achievable by allocating
single objects in a loop. Along with refactoring and
various improvements (Christoph Hellwig)

- Preparations for the upcoming separation of struct slab from struct
page, mostly by removing the struct folio layer, as the purpose of
struct folio has shifted since it became used in slab code (Matthew
Wilcox)

- Modernisation of slab's boot param API usage, which removes some
unexpected parsing corner cases (Petr Tesarik)

- Refactoring of freelist_aba_t (now struct freelist_counters) and
associated functions for double cmpxchg, enabled by -fms-extensions
(Vlastimil Babka)

- Cleanups and improvements related to the sheaves caching layer, which
were part of the full conversion to sheaves that is planned for the
next release (Vlastimil Babka)

* tag 'slab-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: (42 commits)
slab: Remove unnecessary call to compound_head() in alloc_from_pcs()
mempool: clarify behavior of mempool_alloc_preallocated()
mempool: drop the file name in the top of file comment
mempool: de-typedef
mempool: remove mempool_{init,create}_kvmalloc_pool
mempool: legitimize the io_schedule_timeout in mempool_alloc_from_pool
mempool: add mempool_{alloc,free}_bulk
mempool: factor out a mempool_alloc_from_pool helper
slab: Remove references to folios from virt_to_slab()
kasan: Remove references to folio in __kasan_mempool_poison_object()
memcg: Convert mem_cgroup_from_obj_folio() to mem_cgroup_from_obj_slab()
mempool: factor out a mempool_adjust_gfp helper
mempool: add error injection support
mempool: improve kerneldoc comments
mm: improve kerneldoc comments for __alloc_pages_bulk
fault-inject: make enum fault_flags available unconditionally
usercopy: Remove folio references from check_heap_object()
slab: Remove folio references from kfree_nolock()
slab: Remove folio references from kfree_rcu_sheaf()
slab: Remove folio references from build_detached_freelist()
...

+767 -688
+4 -4
include/linux/fault-inject.h
··· 8 8 struct dentry; 9 9 struct kmem_cache; 10 10 11 + enum fault_flags { 12 + FAULT_NOWARN = 1 << 0, 13 + }; 14 + 11 15 #ifdef CONFIG_FAULT_INJECTION 12 16 13 17 #include <linux/atomic.h> ··· 38 34 unsigned long count; 39 35 struct ratelimit_state ratelimit_state; 40 36 struct dentry *dname; 41 - }; 42 - 43 - enum fault_flags { 44 - FAULT_NOWARN = 1 << 0, 45 37 }; 46 38 47 39 #define FAULT_ATTR_INITIALIZER { \
-6
include/linux/gfp_types.h
··· 55 55 #ifdef CONFIG_LOCKDEP 56 56 ___GFP_NOLOCKDEP_BIT, 57 57 #endif 58 - #ifdef CONFIG_SLAB_OBJ_EXT 59 58 ___GFP_NO_OBJ_EXT_BIT, 60 - #endif 61 59 ___GFP_LAST_BIT 62 60 }; 63 61 ··· 96 98 #else 97 99 #define ___GFP_NOLOCKDEP 0 98 100 #endif 99 - #ifdef CONFIG_SLAB_OBJ_EXT 100 101 #define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT) 101 - #else 102 - #define ___GFP_NO_OBJ_EXT 0 103 - #endif 104 102 105 103 /* 106 104 * Physical address zone modifiers (see linux/mmzone.h - low four bits)
+25 -33
include/linux/mempool.h
··· 27 27 wait_queue_head_t wait; 28 28 } mempool_t; 29 29 30 - static inline bool mempool_initialized(mempool_t *pool) 30 + static inline bool mempool_initialized(struct mempool *pool) 31 31 { 32 32 return pool->elements != NULL; 33 33 } 34 34 35 - static inline bool mempool_is_saturated(mempool_t *pool) 35 + static inline bool mempool_is_saturated(struct mempool *pool) 36 36 { 37 37 return READ_ONCE(pool->curr_nr) >= pool->min_nr; 38 38 } 39 39 40 - void mempool_exit(mempool_t *pool); 41 - int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, 42 - mempool_free_t *free_fn, void *pool_data, 43 - gfp_t gfp_mask, int node_id); 44 - 45 - int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, 46 - mempool_free_t *free_fn, void *pool_data); 40 + void mempool_exit(struct mempool *pool); 41 + int mempool_init_node(struct mempool *pool, int min_nr, 42 + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, 43 + void *pool_data, gfp_t gfp_mask, int node_id); 44 + int mempool_init_noprof(struct mempool *pool, int min_nr, 45 + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, 46 + void *pool_data); 47 47 #define mempool_init(...) \ 48 48 alloc_hooks(mempool_init_noprof(__VA_ARGS__)) 49 49 50 - extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 51 - mempool_free_t *free_fn, void *pool_data); 52 - 53 - extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, 54 - mempool_free_t *free_fn, void *pool_data, 55 - gfp_t gfp_mask, int nid); 50 + struct mempool *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 51 + mempool_free_t *free_fn, void *pool_data); 52 + struct mempool *mempool_create_node_noprof(int min_nr, 53 + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, 54 + void *pool_data, gfp_t gfp_mask, int nid); 56 55 #define mempool_create_node(...) 
\ 57 56 alloc_hooks(mempool_create_node_noprof(__VA_ARGS__)) 58 57 ··· 59 60 mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data, \ 60 61 GFP_KERNEL, NUMA_NO_NODE) 61 62 62 - extern int mempool_resize(mempool_t *pool, int new_min_nr); 63 - extern void mempool_destroy(mempool_t *pool); 63 + int mempool_resize(struct mempool *pool, int new_min_nr); 64 + void mempool_destroy(struct mempool *pool); 64 65 65 - extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc; 66 + void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) __malloc; 66 67 #define mempool_alloc(...) \ 67 68 alloc_hooks(mempool_alloc_noprof(__VA_ARGS__)) 69 + int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem, 70 + unsigned int count, unsigned int allocated); 71 + #define mempool_alloc_bulk(...) \ 72 + alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__)) 68 73 69 - extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc; 70 - extern void mempool_free(void *element, mempool_t *pool); 74 + void *mempool_alloc_preallocated(struct mempool *pool) __malloc; 75 + void mempool_free(void *element, struct mempool *pool); 76 + unsigned int mempool_free_bulk(struct mempool *pool, void **elem, 77 + unsigned int count); 71 78 72 79 /* 73 80 * A mempool_alloc_t and mempool_free_t that get the memory from ··· 101 96 #define mempool_create_kmalloc_pool(_min_nr, _size) \ 102 97 mempool_create((_min_nr), mempool_kmalloc, mempool_kfree, \ 103 98 (void *)(unsigned long)(_size)) 104 - 105 - void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data); 106 - void mempool_kvfree(void *element, void *pool_data); 107 - 108 - static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size) 109 - { 110 - return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); 111 - } 112 - 113 - static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size) 114 - { 115 - return mempool_create(min_nr, mempool_kvmalloc, 
mempool_kvfree, (void *) size); 116 - } 117 99 118 100 /* 119 101 * A mempool_alloc_t and mempool_free_t for a simple page allocator that
+2 -14
include/linux/page-flags.h
··· 1048 1048 */ 1049 1049 PAGE_TYPE_OPS(Guard, guard, guard) 1050 1050 1051 - FOLIO_TYPE_OPS(slab, slab) 1052 - 1053 - /** 1054 - * PageSlab - Determine if the page belongs to the slab allocator 1055 - * @page: The page to test. 1056 - * 1057 - * Context: Any context. 1058 - * Return: True for slab pages, false for any other kind of page. 1059 - */ 1060 - static inline bool PageSlab(const struct page *page) 1061 - { 1062 - return folio_test_slab(page_folio(page)); 1063 - } 1051 + PAGE_TYPE_OPS(Slab, slab, slab) 1064 1052 1065 1053 #ifdef CONFIG_HUGETLB_PAGE 1066 1054 FOLIO_TYPE_OPS(hugetlb, hugetlb) ··· 1064 1076 * Serialized with zone lock. 1065 1077 */ 1066 1078 PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) 1067 - FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc) 1079 + PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) 1068 1080 1069 1081 /** 1070 1082 * PageHuge - Determine if the page belongs to hugetlbfs
+4 -8
mm/kasan/common.c
··· 520 520 521 521 bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) 522 522 { 523 - struct folio *folio = virt_to_folio(ptr); 523 + struct page *page = virt_to_page(ptr); 524 524 struct slab *slab; 525 525 526 - /* 527 - * This function can be called for large kmalloc allocation that get 528 - * their memory from page_alloc. Thus, the folio might not be a slab. 529 - */ 530 - if (unlikely(!folio_test_slab(folio))) { 526 + if (unlikely(PageLargeKmalloc(page))) { 531 527 if (check_page_allocation(ptr, ip)) 532 528 return false; 533 - kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false); 529 + kasan_poison(ptr, page_size(page), KASAN_PAGE_FREE, false); 534 530 return true; 535 531 } 536 532 537 533 if (is_kfence_address(ptr)) 538 534 return true; 539 535 540 - slab = folio_slab(folio); 536 + slab = page_slab(page); 541 537 542 538 if (check_slab_allocation(slab->slab_cache, ptr, ip)) 543 539 return false;
+8 -6
mm/kfence/core.c
··· 612 612 * enters __slab_free() slow-path. 613 613 */ 614 614 for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { 615 - struct slab *slab; 615 + struct page *page; 616 616 617 617 if (!i || (i % 2)) 618 618 continue; 619 619 620 - slab = page_slab(pfn_to_page(start_pfn + i)); 621 - __folio_set_slab(slab_folio(slab)); 620 + page = pfn_to_page(start_pfn + i); 621 + __SetPageSlab(page); 622 622 #ifdef CONFIG_MEMCG 623 + struct slab *slab = page_slab(page); 623 624 slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | 624 625 MEMCG_DATA_OBJEXTS; 625 626 #endif ··· 666 665 667 666 reset_slab: 668 667 for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { 669 - struct slab *slab; 668 + struct page *page; 670 669 671 670 if (!i || (i % 2)) 672 671 continue; 673 672 674 - slab = page_slab(pfn_to_page(start_pfn + i)); 673 + page = pfn_to_page(start_pfn + i); 675 674 #ifdef CONFIG_MEMCG 675 + struct slab *slab = page_slab(page); 676 676 slab->obj_exts = 0; 677 677 #endif 678 - __folio_clear_slab(slab_folio(slab)); 678 + __ClearPageSlab(page); 679 679 } 680 680 681 681 return addr;
+16 -24
mm/memcontrol.c
··· 2557 2557 } 2558 2558 2559 2559 static __always_inline 2560 - struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) 2560 + struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p) 2561 2561 { 2562 2562 /* 2563 2563 * Slab objects are accounted individually, not per-page. 2564 2564 * Memcg membership data for each individual object is saved in 2565 2565 * slab->obj_exts. 2566 2566 */ 2567 - if (folio_test_slab(folio)) { 2568 - struct slabobj_ext *obj_exts; 2569 - struct slab *slab; 2570 - unsigned int off; 2567 + struct slabobj_ext *obj_exts; 2568 + unsigned int off; 2571 2569 2572 - slab = folio_slab(folio); 2573 - obj_exts = slab_obj_exts(slab); 2574 - if (!obj_exts) 2575 - return NULL; 2576 - 2577 - off = obj_to_index(slab->slab_cache, slab, p); 2578 - if (obj_exts[off].objcg) 2579 - return obj_cgroup_memcg(obj_exts[off].objcg); 2580 - 2570 + obj_exts = slab_obj_exts(slab); 2571 + if (!obj_exts) 2581 2572 return NULL; 2582 - } 2583 2573 2584 - /* 2585 - * folio_memcg_check() is used here, because in theory we can encounter 2586 - * a folio where the slab flag has been cleared already, but 2587 - * slab->obj_exts has not been freed yet 2588 - * folio_memcg_check() will guarantee that a proper memory 2589 - * cgroup pointer or NULL will be returned. 
2590 - */ 2591 - return folio_memcg_check(folio); 2574 + off = obj_to_index(slab->slab_cache, slab, p); 2575 + if (obj_exts[off].objcg) 2576 + return obj_cgroup_memcg(obj_exts[off].objcg); 2577 + 2578 + return NULL; 2592 2579 } 2593 2580 2594 2581 /* ··· 2589 2602 */ 2590 2603 struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) 2591 2604 { 2605 + struct slab *slab; 2606 + 2592 2607 if (mem_cgroup_disabled()) 2593 2608 return NULL; 2594 2609 2595 - return mem_cgroup_from_obj_folio(virt_to_folio(p), p); 2610 + slab = virt_to_slab(p); 2611 + if (slab) 2612 + return mem_cgroup_from_obj_slab(slab, p); 2613 + return folio_memcg_check(virt_to_folio(p)); 2596 2614 } 2597 2615 2598 2616 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
+265 -162
mm/mempool.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 /* 3 - * linux/mm/mempool.c 4 - * 5 3 * memory buffer pool support. Such pools are mostly used 6 4 * for guaranteed, deadlock-free memory allocations during 7 5 * extreme VM load. ··· 7 9 * started by Ingo Molnar, Copyright (C) 2001 8 10 * debugging by David Rientjes, Copyright (C) 2015 9 11 */ 10 - 12 + #include <linux/fault-inject.h> 11 13 #include <linux/mm.h> 12 14 #include <linux/slab.h> 13 15 #include <linux/highmem.h> ··· 18 20 #include <linux/writeback.h> 19 21 #include "slab.h" 20 22 23 + static DECLARE_FAULT_ATTR(fail_mempool_alloc); 24 + static DECLARE_FAULT_ATTR(fail_mempool_alloc_bulk); 25 + 26 + static int __init mempool_faul_inject_init(void) 27 + { 28 + int error; 29 + 30 + error = PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc", 31 + NULL, &fail_mempool_alloc)); 32 + if (error) 33 + return error; 34 + 35 + /* booting will fail on error return here, don't bother to cleanup */ 36 + return PTR_ERR_OR_ZERO( 37 + fault_create_debugfs_attr("fail_mempool_alloc_bulk", NULL, 38 + &fail_mempool_alloc_bulk)); 39 + } 40 + late_initcall(mempool_faul_inject_init); 41 + 21 42 #ifdef CONFIG_SLUB_DEBUG_ON 22 - static void poison_error(mempool_t *pool, void *element, size_t size, 43 + static void poison_error(struct mempool *pool, void *element, size_t size, 23 44 size_t byte) 24 45 { 25 46 const int nr = pool->curr_nr; ··· 55 38 dump_stack(); 56 39 } 57 40 58 - static void __check_element(mempool_t *pool, void *element, size_t size) 41 + static void __check_element(struct mempool *pool, void *element, size_t size) 59 42 { 60 43 u8 *obj = element; 61 44 size_t i; ··· 71 54 memset(obj, POISON_INUSE, size); 72 55 } 73 56 74 - static void check_element(mempool_t *pool, void *element) 57 + static void check_element(struct mempool *pool, void *element) 75 58 { 76 59 /* Skip checking: KASAN might save its metadata in the element. 
*/ 77 60 if (kasan_enabled()) ··· 110 93 obj[size - 1] = POISON_END; 111 94 } 112 95 113 - static void poison_element(mempool_t *pool, void *element) 96 + static void poison_element(struct mempool *pool, void *element) 114 97 { 115 98 /* Skip poisoning: KASAN might save its metadata in the element. */ 116 99 if (kasan_enabled()) ··· 141 124 } 142 125 } 143 126 #else /* CONFIG_SLUB_DEBUG_ON */ 144 - static inline void check_element(mempool_t *pool, void *element) 127 + static inline void check_element(struct mempool *pool, void *element) 145 128 { 146 129 } 147 - static inline void poison_element(mempool_t *pool, void *element) 130 + static inline void poison_element(struct mempool *pool, void *element) 148 131 { 149 132 } 150 133 #endif /* CONFIG_SLUB_DEBUG_ON */ 151 134 152 - static __always_inline bool kasan_poison_element(mempool_t *pool, void *element) 135 + static __always_inline bool kasan_poison_element(struct mempool *pool, 136 + void *element) 153 137 { 154 138 if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) 155 139 return kasan_mempool_poison_object(element); ··· 160 142 return true; 161 143 } 162 144 163 - static void kasan_unpoison_element(mempool_t *pool, void *element) 145 + static void kasan_unpoison_element(struct mempool *pool, void *element) 164 146 { 165 147 if (pool->alloc == mempool_kmalloc) 166 148 kasan_mempool_unpoison_object(element, (size_t)pool->pool_data); ··· 172 154 (unsigned long)pool->pool_data); 173 155 } 174 156 175 - static __always_inline void add_element(mempool_t *pool, void *element) 157 + static __always_inline void add_element(struct mempool *pool, void *element) 176 158 { 177 159 BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr); 178 160 poison_element(pool, element); ··· 180 162 pool->elements[pool->curr_nr++] = element; 181 163 } 182 164 183 - static void *remove_element(mempool_t *pool) 165 + static void *remove_element(struct mempool *pool) 184 166 { 185 167 void *element = 
pool->elements[--pool->curr_nr]; 186 168 ··· 201 183 * May be called on a zeroed but uninitialized mempool (i.e. allocated with 202 184 * kzalloc()). 203 185 */ 204 - void mempool_exit(mempool_t *pool) 186 + void mempool_exit(struct mempool *pool) 205 187 { 206 188 while (pool->curr_nr) { 207 189 void *element = remove_element(pool); ··· 220 202 * Free all reserved elements in @pool and @pool itself. This function 221 203 * only sleeps if the free_fn() function sleeps. 222 204 */ 223 - void mempool_destroy(mempool_t *pool) 205 + void mempool_destroy(struct mempool *pool) 224 206 { 225 207 if (unlikely(!pool)) 226 208 return; ··· 230 212 } 231 213 EXPORT_SYMBOL(mempool_destroy); 232 214 233 - int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, 234 - mempool_free_t *free_fn, void *pool_data, 235 - gfp_t gfp_mask, int node_id) 215 + int mempool_init_node(struct mempool *pool, int min_nr, 216 + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, 217 + void *pool_data, gfp_t gfp_mask, int node_id) 236 218 { 237 219 spin_lock_init(&pool->lock); 238 220 pool->min_nr = min_nr; ··· 282 264 * 283 265 * Return: %0 on success, negative error code otherwise. 284 266 */ 285 - int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, 286 - mempool_free_t *free_fn, void *pool_data) 267 + int mempool_init_noprof(struct mempool *pool, int min_nr, 268 + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, 269 + void *pool_data) 287 270 { 288 271 return mempool_init_node(pool, min_nr, alloc_fn, free_fn, 289 272 pool_data, GFP_KERNEL, NUMA_NO_NODE); ··· 310 291 * 311 292 * Return: pointer to the created memory pool object or %NULL on error. 
312 293 */ 313 - mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, 314 - mempool_free_t *free_fn, void *pool_data, 315 - gfp_t gfp_mask, int node_id) 294 + struct mempool *mempool_create_node_noprof(int min_nr, 295 + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, 296 + void *pool_data, gfp_t gfp_mask, int node_id) 316 297 { 317 - mempool_t *pool; 298 + struct mempool *pool; 318 299 319 300 pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); 320 301 if (!pool) ··· 348 329 * 349 330 * Return: %0 on success, negative error code otherwise. 350 331 */ 351 - int mempool_resize(mempool_t *pool, int new_min_nr) 332 + int mempool_resize(struct mempool *pool, int new_min_nr) 352 333 { 353 334 void *element; 354 335 void **new_elements; ··· 410 391 } 411 392 EXPORT_SYMBOL(mempool_resize); 412 393 413 - /** 414 - * mempool_alloc - allocate an element from a specific memory pool 415 - * @pool: pointer to the memory pool which was allocated via 416 - * mempool_create(). 417 - * @gfp_mask: the usual allocation bitmask. 418 - * 419 - * this function only sleeps if the alloc_fn() function sleeps or 420 - * returns NULL. Note that due to preallocation, this function 421 - * *never* fails when called from process contexts. (it might 422 - * fail if called from an IRQ context.) 423 - * Note: using __GFP_ZERO is not supported. 424 - * 425 - * Return: pointer to the allocated element or %NULL on error. 
426 - */ 427 - void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) 394 + static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems, 395 + unsigned int count, unsigned int allocated, 396 + gfp_t gfp_mask) 428 397 { 429 - void *element; 430 398 unsigned long flags; 431 - wait_queue_entry_t wait; 432 - gfp_t gfp_temp; 399 + unsigned int i; 400 + 401 + spin_lock_irqsave(&pool->lock, flags); 402 + if (unlikely(pool->curr_nr < count - allocated)) 403 + goto fail; 404 + for (i = 0; i < count; i++) { 405 + if (!elems[i]) { 406 + elems[i] = remove_element(pool); 407 + allocated++; 408 + } 409 + } 410 + spin_unlock_irqrestore(&pool->lock, flags); 411 + 412 + /* Paired with rmb in mempool_free(), read comment there. */ 413 + smp_wmb(); 414 + 415 + /* 416 + * Update the allocation stack trace as this is more useful for 417 + * debugging. 418 + */ 419 + for (i = 0; i < count; i++) 420 + kmemleak_update_trace(elems[i]); 421 + return allocated; 422 + 423 + fail: 424 + if (gfp_mask & __GFP_DIRECT_RECLAIM) { 425 + DEFINE_WAIT(wait); 426 + 427 + prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); 428 + spin_unlock_irqrestore(&pool->lock, flags); 429 + 430 + /* 431 + * Wait for someone else to return an element to @pool, but wake 432 + * up occasionally as memory pressure might have reduced even 433 + * and the normal allocation in alloc_fn could succeed even if 434 + * no element was returned. 435 + */ 436 + io_schedule_timeout(5 * HZ); 437 + finish_wait(&pool->wait, &wait); 438 + } else { 439 + /* We must not sleep if __GFP_DIRECT_RECLAIM is not set. */ 440 + spin_unlock_irqrestore(&pool->lock, flags); 441 + } 442 + 443 + return allocated; 444 + } 445 + 446 + /* 447 + * Adjust the gfp flags for mempool allocations, as we never want to dip into 448 + * the global emergency reserves or retry in the page allocator. 
449 + * 450 + * The first pass also doesn't want to go reclaim, but the next passes do, so 451 + * return a separate subset for that first iteration. 452 + */ 453 + static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask) 454 + { 455 + *gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; 456 + return *gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); 457 + } 458 + 459 + /** 460 + * mempool_alloc_bulk - allocate multiple elements from a memory pool 461 + * @pool: pointer to the memory pool 462 + * @elems: partially or fully populated elements array 463 + * @count: number of entries in @elem that need to be allocated 464 + * @allocated: number of entries in @elem already allocated 465 + * 466 + * Allocate elements for each slot in @elem that is non-%NULL. This is done by 467 + * first calling into the alloc_fn supplied at pool initialization time, and 468 + * dipping into the reserved pool when alloc_fn fails to allocate an element. 469 + * 470 + * On return all @count elements in @elems will be populated. 471 + * 472 + * Return: Always 0. If it wasn't for %$#^$ alloc tags, it would return void. 473 + */ 474 + int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems, 475 + unsigned int count, unsigned int allocated) 476 + { 477 + gfp_t gfp_mask = GFP_KERNEL; 478 + gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); 479 + unsigned int i = 0; 480 + 481 + VM_WARN_ON_ONCE(count > pool->min_nr); 482 + might_alloc(gfp_mask); 483 + 484 + /* 485 + * If an error is injected, fail all elements in a bulk allocation so 486 + * that we stress the multiple elements missing path. 487 + */ 488 + if (should_fail_ex(&fail_mempool_alloc_bulk, 1, FAULT_NOWARN)) { 489 + pr_info("forcing mempool usage for %pS\n", 490 + (void *)_RET_IP_); 491 + goto use_pool; 492 + } 493 + 494 + repeat_alloc: 495 + /* 496 + * Try to allocate the elements using the allocation callback first as 497 + * that might succeed even when the caller's bulk allocation did not. 
498 + */ 499 + for (i = 0; i < count; i++) { 500 + if (elems[i]) 501 + continue; 502 + elems[i] = pool->alloc(gfp_temp, pool->pool_data); 503 + if (unlikely(!elems[i])) 504 + goto use_pool; 505 + allocated++; 506 + } 507 + 508 + return 0; 509 + 510 + use_pool: 511 + allocated = mempool_alloc_from_pool(pool, elems, count, allocated, 512 + gfp_temp); 513 + gfp_temp = gfp_mask; 514 + goto repeat_alloc; 515 + } 516 + EXPORT_SYMBOL_GPL(mempool_alloc_bulk_noprof); 517 + 518 + /** 519 + * mempool_alloc - allocate an element from a memory pool 520 + * @pool: pointer to the memory pool 521 + * @gfp_mask: GFP_* flags. %__GFP_ZERO is not supported. 522 + * 523 + * Allocate an element from @pool. This is done by first calling into the 524 + * alloc_fn supplied at pool initialization time, and dipping into the reserved 525 + * pool when alloc_fn fails to allocate an element. 526 + * 527 + * This function only sleeps if the alloc_fn callback sleeps, or when waiting 528 + * for elements to become available in the pool. 529 + * 530 + * Return: pointer to the allocated element or %NULL when failing to allocate 531 + * an element. Allocation failure can only happen when @gfp_mask does not 532 + * include %__GFP_DIRECT_RECLAIM. 
533 + */ 534 + void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) 535 + { 536 + gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); 537 + void *element; 433 538 434 539 VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); 435 540 might_alloc(gfp_mask); 436 541 437 - gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ 438 - gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ 439 - gfp_mask |= __GFP_NOWARN; /* failures are OK */ 440 - 441 - gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); 442 - 443 542 repeat_alloc: 543 + if (should_fail_ex(&fail_mempool_alloc, 1, FAULT_NOWARN)) { 544 + pr_info("forcing mempool usage for %pS\n", 545 + (void *)_RET_IP_); 546 + element = NULL; 547 + } else { 548 + element = pool->alloc(gfp_temp, pool->pool_data); 549 + } 444 550 445 - element = pool->alloc(gfp_temp, pool->pool_data); 446 - if (likely(element != NULL)) 447 - return element; 448 - 449 - spin_lock_irqsave(&pool->lock, flags); 450 - if (likely(pool->curr_nr)) { 451 - element = remove_element(pool); 452 - spin_unlock_irqrestore(&pool->lock, flags); 453 - /* paired with rmb in mempool_free(), read comment there */ 454 - smp_wmb(); 551 + if (unlikely(!element)) { 455 552 /* 456 - * Update the allocation stack trace as this is more useful 457 - * for debugging. 553 + * Try to allocate an element from the pool. 554 + * 555 + * The first pass won't have __GFP_DIRECT_RECLAIM and won't 556 + * sleep in mempool_alloc_from_pool. Retry the allocation 557 + * with all flags set in that case. 458 558 */ 459 - kmemleak_update_trace(element); 460 - return element; 559 + if (!mempool_alloc_from_pool(pool, &element, 1, 0, gfp_temp)) { 560 + if (gfp_temp != gfp_mask) { 561 + gfp_temp = gfp_mask; 562 + goto repeat_alloc; 563 + } 564 + if (gfp_mask & __GFP_DIRECT_RECLAIM) { 565 + goto repeat_alloc; 566 + } 567 + } 461 568 } 462 569 463 - /* 464 - * We use gfp mask w/o direct reclaim or IO for the first round. 
If 465 - * alloc failed with that and @pool was empty, retry immediately. 466 - */ 467 - if (gfp_temp != gfp_mask) { 468 - spin_unlock_irqrestore(&pool->lock, flags); 469 - gfp_temp = gfp_mask; 470 - goto repeat_alloc; 471 - } 472 - 473 - /* We must not sleep if !__GFP_DIRECT_RECLAIM */ 474 - if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { 475 - spin_unlock_irqrestore(&pool->lock, flags); 476 - return NULL; 477 - } 478 - 479 - /* Let's wait for someone else to return an element to @pool */ 480 - init_wait(&wait); 481 - prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); 482 - 483 - spin_unlock_irqrestore(&pool->lock, flags); 484 - 485 - /* 486 - * FIXME: this should be io_schedule(). The timeout is there as a 487 - * workaround for some DM problems in 2.6.18. 488 - */ 489 - io_schedule_timeout(5*HZ); 490 - 491 - finish_wait(&pool->wait, &wait); 492 - goto repeat_alloc; 570 + return element; 493 571 } 494 572 EXPORT_SYMBOL(mempool_alloc_noprof); 495 573 496 574 /** 497 575 * mempool_alloc_preallocated - allocate an element from preallocated elements 498 - * belonging to a specific memory pool 499 - * @pool: pointer to the memory pool which was allocated via 500 - * mempool_create(). 576 + * belonging to a memory pool 577 + * @pool: pointer to the memory pool 501 578 * 502 - * This function is similar to mempool_alloc, but it only attempts allocating 503 - * an element from the preallocated elements. It does not sleep and immediately 504 - * returns if no preallocated elements are available. 579 + * This function is similar to mempool_alloc(), but it only attempts allocating 580 + * an element from the preallocated elements. It only takes a single spinlock_t 581 + * and immediately returns if no preallocated elements are available. 505 582 * 506 583 * Return: pointer to the allocated element or %NULL if no elements are 507 584 * available. 
508 585 */ 509 - void *mempool_alloc_preallocated(mempool_t *pool) 586 + void *mempool_alloc_preallocated(struct mempool *pool) 510 587 { 511 - void *element; 512 - unsigned long flags; 588 + void *element = NULL; 513 589 514 - spin_lock_irqsave(&pool->lock, flags); 515 - if (likely(pool->curr_nr)) { 516 - element = remove_element(pool); 517 - spin_unlock_irqrestore(&pool->lock, flags); 518 - /* paired with rmb in mempool_free(), read comment there */ 519 - smp_wmb(); 520 - /* 521 - * Update the allocation stack trace as this is more useful 522 - * for debugging. 523 - */ 524 - kmemleak_update_trace(element); 525 - return element; 526 - } 527 - spin_unlock_irqrestore(&pool->lock, flags); 528 - 529 - return NULL; 590 + mempool_alloc_from_pool(pool, &element, 1, 0, GFP_NOWAIT); 591 + return element; 530 592 } 531 593 EXPORT_SYMBOL(mempool_alloc_preallocated); 532 594 533 595 /** 534 - * mempool_free - return an element to the pool. 535 - * @element: pool element pointer. 536 - * @pool: pointer to the memory pool which was allocated via 537 - * mempool_create(). 596 + * mempool_free_bulk - return elements to a mempool 597 + * @pool: pointer to the memory pool 598 + * @elems: elements to return 599 + * @count: number of elements to return 538 600 * 539 - * this function only sleeps if the free_fn() function sleeps. 601 + * Returns a number of elements from the start of @elem to @pool if @pool needs 602 + * replenishing and sets their slots in @elem to NULL. Other elements are left 603 + * in @elem. 604 + * 605 + * Return: number of elements transferred to @pool. Elements are always 606 + * transferred from the beginning of @elem, so the return value can be used as 607 + * an offset into @elem for the freeing the remaining elements in the caller. 
540 608 */ 541 - void mempool_free(void *element, mempool_t *pool) 609 + unsigned int mempool_free_bulk(struct mempool *pool, void **elems, 610 + unsigned int count) 542 611 { 543 612 unsigned long flags; 544 - 545 - if (unlikely(element == NULL)) 546 - return; 613 + unsigned int freed = 0; 614 + bool added = false; 547 615 548 616 /* 549 617 * Paired with the wmb in mempool_alloc(). The preceding read is ··· 664 558 * Waiters happen iff curr_nr is 0 and the above guarantee also 665 559 * ensures that there will be frees which return elements to the 666 560 * pool waking up the waiters. 667 - */ 668 - if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { 669 - spin_lock_irqsave(&pool->lock, flags); 670 - if (likely(pool->curr_nr < pool->min_nr)) { 671 - add_element(pool, element); 672 - spin_unlock_irqrestore(&pool->lock, flags); 673 - if (wq_has_sleeper(&pool->wait)) 674 - wake_up(&pool->wait); 675 - return; 676 - } 677 - spin_unlock_irqrestore(&pool->lock, flags); 678 - } 679 - 680 - /* 681 - * Handle the min_nr = 0 edge case: 682 561 * 683 562 * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds, 684 563 * so waiters sleeping on pool->wait would never be woken by the ··· 671 580 * allocation of element when both min_nr and curr_nr are 0, and 672 581 * any active waiters are properly awakened. 
673 582 */ 674 - if (unlikely(pool->min_nr == 0 && 583 + if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { 584 + spin_lock_irqsave(&pool->lock, flags); 585 + while (pool->curr_nr < pool->min_nr && freed < count) { 586 + add_element(pool, elems[freed++]); 587 + added = true; 588 + } 589 + spin_unlock_irqrestore(&pool->lock, flags); 590 + } else if (unlikely(pool->min_nr == 0 && 675 591 READ_ONCE(pool->curr_nr) == 0)) { 592 + /* Handle the min_nr = 0 edge case: */ 676 593 spin_lock_irqsave(&pool->lock, flags); 677 594 if (likely(pool->curr_nr == 0)) { 678 - add_element(pool, element); 679 - spin_unlock_irqrestore(&pool->lock, flags); 680 - if (wq_has_sleeper(&pool->wait)) 681 - wake_up(&pool->wait); 682 - return; 595 + add_element(pool, elems[freed++]); 596 + added = true; 683 597 } 684 598 spin_unlock_irqrestore(&pool->lock, flags); 685 599 } 686 600 687 - pool->free(element, pool->pool_data); 601 + if (unlikely(added) && wq_has_sleeper(&pool->wait)) 602 + wake_up(&pool->wait); 603 + 604 + return freed; 605 + } 606 + EXPORT_SYMBOL_GPL(mempool_free_bulk); 607 + 608 + /** 609 + * mempool_free - return an element to the pool. 610 + * @element: element to return 611 + * @pool: pointer to the memory pool 612 + * 613 + * Returns @element to @pool if it needs replenishing, else frees it using 614 + * the free_fn callback in @pool. 615 + * 616 + * This function only sleeps if the free_fn callback sleeps. 
617 + */ 618 + void mempool_free(void *element, struct mempool *pool) 619 + { 620 + if (likely(element) && !mempool_free_bulk(pool, &element, 1)) 621 + pool->free(element, pool->pool_data); 688 622 } 689 623 EXPORT_SYMBOL(mempool_free); 690 624 ··· 747 631 kfree(element); 748 632 } 749 633 EXPORT_SYMBOL(mempool_kfree); 750 - 751 - void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) 752 - { 753 - size_t size = (size_t)pool_data; 754 - return kvmalloc(size, gfp_mask); 755 - } 756 - EXPORT_SYMBOL(mempool_kvmalloc); 757 - 758 - void mempool_kvfree(void *element, void *pool_data) 759 - { 760 - kvfree(element); 761 - } 762 - EXPORT_SYMBOL(mempool_kvfree); 763 634 764 635 /* 765 636 * A simple mempool-backed page allocator that allocates pages
+10 -5
mm/page_alloc.c
··· 4977 4977 * @nr_pages: The number of pages desired in the array 4978 4978 * @page_array: Array to store the pages 4979 4979 * 4980 - * This is a batched version of the page allocator that attempts to 4981 - * allocate nr_pages quickly. Pages are added to the page_array. 4980 + * This is a batched version of the page allocator that attempts to allocate 4981 + * @nr_pages quickly. Pages are added to @page_array. 4982 4982 * 4983 - * Note that only NULL elements are populated with pages and nr_pages 4984 - * is the maximum number of pages that will be stored in the array. 4983 + * Note that only the elements in @page_array that were cleared to %NULL on 4984 + * entry are populated with newly allocated pages. @nr_pages is the maximum 4985 + * number of pages that will be stored in the array. 4985 4986 * 4986 - * Returns the number of pages in the array. 4987 + * Returns the number of pages in @page_array, including ones already 4988 + * allocated on entry. This can be less than the number requested in @nr_pages, 4989 + * but all empty slots are filled from the beginning. I.e., if all slots in 4990 + * @page_array were set to %NULL on entry, the slots from 0 to the return value 4991 + * - 1 will be filled. 4987 4992 */ 4988 4993 unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, 4989 4994 nodemask_t *nodemask, int nr_pages,
+51 -61
mm/slab.h
··· 40 40 * Freelist pointer and counter to cmpxchg together, avoids the typical ABA 41 41 * problems with cmpxchg of just a pointer. 42 42 */ 43 - typedef union { 44 - struct { 45 - void *freelist; 46 - unsigned long counter; 43 + struct freelist_counters { 44 + union { 45 + struct { 46 + void *freelist; 47 + union { 48 + unsigned long counters; 49 + struct { 50 + unsigned inuse:16; 51 + unsigned objects:15; 52 + /* 53 + * If slab debugging is enabled then the 54 + * frozen bit can be reused to indicate 55 + * that the slab was corrupted 56 + */ 57 + unsigned frozen:1; 58 + }; 59 + }; 60 + }; 61 + #ifdef system_has_freelist_aba 62 + freelist_full_t freelist_counters; 63 + #endif 47 64 }; 48 - freelist_full_t full; 49 - } freelist_aba_t; 65 + }; 50 66 51 67 /* Reuses the bits in struct page */ 52 68 struct slab { ··· 85 69 #endif 86 70 }; 87 71 /* Double-word boundary */ 88 - union { 89 - struct { 90 - void *freelist; /* first free object */ 91 - union { 92 - unsigned long counters; 93 - struct { 94 - unsigned inuse:16; 95 - unsigned objects:15; 96 - /* 97 - * If slab debugging is enabled then the 98 - * frozen bit can be reused to indicate 99 - * that the slab was corrupted 100 - */ 101 - unsigned frozen:1; 102 - }; 103 - }; 104 - }; 105 - #ifdef system_has_freelist_aba 106 - freelist_aba_t freelist_counter; 107 - #endif 108 - }; 72 + struct freelist_counters; 109 73 }; 110 74 struct rcu_head rcu_head; 111 75 }; ··· 110 114 #undef SLAB_MATCH 111 115 static_assert(sizeof(struct slab) <= sizeof(struct page)); 112 116 #if defined(system_has_freelist_aba) 113 - static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t))); 117 + static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(struct freelist_counters))); 114 118 #endif 115 - 116 - /** 117 - * folio_slab - Converts from folio to slab. 118 - * @folio: The folio. 119 - * 120 - * Currently struct slab is a different representation of a folio where 121 - * folio_test_slab() is true. 
122 - * 123 - * Return: The slab which contains this folio. 124 - */ 125 - #define folio_slab(folio) (_Generic((folio), \ 126 - const struct folio *: (const struct slab *)(folio), \ 127 - struct folio *: (struct slab *)(folio))) 128 119 129 120 /** 130 121 * slab_folio - The folio allocated for a slab ··· 129 146 struct slab *: (struct folio *)s)) 130 147 131 148 /** 132 - * page_slab - Converts from first struct page to slab. 133 - * @p: The first (either head of compound or single) page of slab. 149 + * page_slab - Converts from struct page to its slab. 150 + * @page: A page which may or may not belong to a slab. 134 151 * 135 - * A temporary wrapper to convert struct page to struct slab in situations where 136 - * we know the page is the compound head, or single order-0 page. 137 - * 138 - * Long-term ideally everything would work with struct slab directly or go 139 - * through folio to struct slab. 140 - * 141 - * Return: The slab which contains this page 152 + * Return: The slab which contains this page or NULL if the page does 153 + * not belong to a slab. This includes pages returned from large kmalloc. 
142 154 */ 143 - #define page_slab(p) (_Generic((p), \ 144 - const struct page *: (const struct slab *)(p), \ 145 - struct page *: (struct slab *)(p))) 155 + static inline struct slab *page_slab(const struct page *page) 156 + { 157 + unsigned long head; 158 + 159 + head = READ_ONCE(page->compound_head); 160 + if (head & 1) 161 + page = (struct page *)(head - 1); 162 + if (data_race(page->page_type >> 24) != PGTY_slab) 163 + page = NULL; 164 + 165 + return (struct slab *)page; 166 + } 146 167 147 168 /** 148 169 * slab_page - The first struct page allocated for a slab ··· 175 188 176 189 static inline struct slab *virt_to_slab(const void *addr) 177 190 { 178 - struct folio *folio = virt_to_folio(addr); 179 - 180 - if (!folio_test_slab(folio)) 181 - return NULL; 182 - 183 - return folio_slab(folio); 191 + return page_slab(virt_to_page(addr)); 184 192 } 185 193 186 194 static inline int slab_order(const struct slab *slab) ··· 218 236 * Slab cache management. 219 237 */ 220 238 struct kmem_cache { 221 - #ifndef CONFIG_SLUB_TINY 222 239 struct kmem_cache_cpu __percpu *cpu_slab; 223 240 struct lock_class_key lock_key; 224 - #endif 225 241 struct slub_percpu_sheaves __percpu *cpu_sheaves; 226 242 /* Used for retrieving partial slabs, etc. */ 227 243 slab_flags_t flags; ··· 579 599 * Else we can use all the padding etc for the allocation 580 600 */ 581 601 return s->size; 602 + } 603 + 604 + static inline unsigned int large_kmalloc_order(const struct page *page) 605 + { 606 + return page[1].flags.f & 0xff; 607 + } 608 + 609 + static inline size_t large_kmalloc_size(const struct page *page) 610 + { 611 + return PAGE_SIZE << large_kmalloc_order(page); 582 612 } 583 613 584 614 #ifdef CONFIG_SLUB_DEBUG
+14 -15
mm/slab_common.c
··· 997 997 */ 998 998 size_t __ksize(const void *object) 999 999 { 1000 - struct folio *folio; 1000 + const struct page *page; 1001 + const struct slab *slab; 1001 1002 1002 1003 if (unlikely(object == ZERO_SIZE_PTR)) 1003 1004 return 0; 1004 1005 1005 - folio = virt_to_folio(object); 1006 + page = virt_to_page(object); 1006 1007 1007 - if (unlikely(!folio_test_slab(folio))) { 1008 - if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE)) 1009 - return 0; 1010 - if (WARN_ON(object != folio_address(folio))) 1011 - return 0; 1012 - return folio_size(folio); 1013 - } 1008 + if (unlikely(PageLargeKmalloc(page))) 1009 + return large_kmalloc_size(page); 1010 + 1011 + slab = page_slab(page); 1012 + /* Delete this after we're sure there are no users */ 1013 + if (WARN_ON(!slab)) 1014 + return page_size(page); 1014 1015 1015 1016 #ifdef CONFIG_SLUB_DEBUG 1016 - skip_orig_size_check(folio_slab(folio)->slab_cache, object); 1017 + skip_orig_size_check(slab->slab_cache, object); 1017 1018 #endif 1018 1019 1019 - return slab_ksize(folio_slab(folio)->slab_cache); 1020 + return slab_ksize(slab->slab_cache); 1020 1021 } 1021 1022 1022 1023 gfp_t kmalloc_fix_flags(gfp_t flags) ··· 1615 1614 static bool kfree_rcu_sheaf(void *obj) 1616 1615 { 1617 1616 struct kmem_cache *s; 1618 - struct folio *folio; 1619 1617 struct slab *slab; 1620 1618 1621 1619 if (is_vmalloc_addr(obj)) 1622 1620 return false; 1623 1621 1624 - folio = virt_to_folio(obj); 1625 - if (unlikely(!folio_test_slab(folio))) 1622 + slab = virt_to_slab(obj); 1623 + if (unlikely(!slab)) 1626 1624 return false; 1627 1625 1628 - slab = folio_slab(folio); 1629 1626 s = slab->slab_cache; 1630 1627 if (s->cpu_sheaves) { 1631 1628 if (likely(!IS_ENABLED(CONFIG_NUMA) ||
+352 -342
mm/slub.c
··· 410 410 NR_SLUB_STAT_ITEMS 411 411 }; 412 412 413 - #ifndef CONFIG_SLUB_TINY 413 + struct freelist_tid { 414 + union { 415 + struct { 416 + void *freelist; /* Pointer to next available object */ 417 + unsigned long tid; /* Globally unique transaction id */ 418 + }; 419 + freelist_full_t freelist_tid; 420 + }; 421 + }; 422 + 414 423 /* 415 424 * When changing the layout, make sure freelist and tid are still compatible 416 425 * with this_cpu_cmpxchg_double() alignment requirements. 417 426 */ 418 427 struct kmem_cache_cpu { 419 - union { 420 - struct { 421 - void **freelist; /* Pointer to next available object */ 422 - unsigned long tid; /* Globally unique transaction id */ 423 - }; 424 - freelist_aba_t freelist_tid; 425 - }; 428 + struct freelist_tid; 426 429 struct slab *slab; /* The slab from which we are allocating */ 427 430 #ifdef CONFIG_SLUB_CPU_PARTIAL 428 431 struct slab *partial; /* Partially allocated slabs */ ··· 435 432 unsigned int stat[NR_SLUB_STAT_ITEMS]; 436 433 #endif 437 434 }; 438 - #endif /* CONFIG_SLUB_TINY */ 439 435 440 436 static inline void stat(const struct kmem_cache *s, enum stat_item si) 441 437 { ··· 471 469 struct rcu_head rcu_head; 472 470 struct list_head barn_list; 473 471 /* only used for prefilled sheafs */ 474 - unsigned int capacity; 472 + struct { 473 + unsigned int capacity; 474 + bool pfmemalloc; 475 + }; 475 476 }; 476 477 struct kmem_cache *cache; 477 478 unsigned int size; ··· 599 594 return freelist_ptr_decode(s, p, ptr_addr); 600 595 } 601 596 602 - #ifndef CONFIG_SLUB_TINY 603 597 static void prefetch_freepointer(const struct kmem_cache *s, void *object) 604 598 { 605 599 prefetchw(object + s->offset); 606 600 } 607 - #endif 608 601 609 602 /* 610 603 * When running under KMSAN, get_freepointer_safe() may return an uninitialized ··· 714 711 return s->cpu_partial_slabs; 715 712 } 716 713 #else 714 + #ifdef SLAB_SUPPORTS_SYSFS 717 715 static inline void 718 716 slub_set_cpu_partial(struct kmem_cache *s, unsigned int 
nr_objects) 719 717 { 720 718 } 719 + #endif 721 720 722 721 static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) 723 722 { ··· 760 755 } 761 756 762 757 static inline bool 763 - __update_freelist_fast(struct slab *slab, 764 - void *freelist_old, unsigned long counters_old, 765 - void *freelist_new, unsigned long counters_new) 758 + __update_freelist_fast(struct slab *slab, struct freelist_counters *old, 759 + struct freelist_counters *new) 766 760 { 767 761 #ifdef system_has_freelist_aba 768 - freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old }; 769 - freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new }; 770 - 771 - return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full); 762 + return try_cmpxchg_freelist(&slab->freelist_counters, 763 + &old->freelist_counters, 764 + new->freelist_counters); 772 765 #else 773 766 return false; 774 767 #endif 775 768 } 776 769 777 770 static inline bool 778 - __update_freelist_slow(struct slab *slab, 779 - void *freelist_old, unsigned long counters_old, 780 - void *freelist_new, unsigned long counters_new) 771 + __update_freelist_slow(struct slab *slab, struct freelist_counters *old, 772 + struct freelist_counters *new) 781 773 { 782 774 bool ret = false; 783 775 784 776 slab_lock(slab); 785 - if (slab->freelist == freelist_old && 786 - slab->counters == counters_old) { 787 - slab->freelist = freelist_new; 788 - slab->counters = counters_new; 777 + if (slab->freelist == old->freelist && 778 + slab->counters == old->counters) { 779 + slab->freelist = new->freelist; 780 + slab->counters = new->counters; 789 781 ret = true; 790 782 } 791 783 slab_unlock(slab); ··· 798 796 * interrupt the operation. 
799 797 */ 800 798 static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab, 801 - void *freelist_old, unsigned long counters_old, 802 - void *freelist_new, unsigned long counters_new, 803 - const char *n) 799 + struct freelist_counters *old, struct freelist_counters *new, const char *n) 804 800 { 805 801 bool ret; 806 802 807 803 if (USE_LOCKLESS_FAST_PATH()) 808 804 lockdep_assert_irqs_disabled(); 809 805 810 - if (s->flags & __CMPXCHG_DOUBLE) { 811 - ret = __update_freelist_fast(slab, freelist_old, counters_old, 812 - freelist_new, counters_new); 813 - } else { 814 - ret = __update_freelist_slow(slab, freelist_old, counters_old, 815 - freelist_new, counters_new); 816 - } 806 + if (s->flags & __CMPXCHG_DOUBLE) 807 + ret = __update_freelist_fast(slab, old, new); 808 + else 809 + ret = __update_freelist_slow(slab, old, new); 810 + 817 811 if (likely(ret)) 818 812 return true; 819 813 ··· 824 826 } 825 827 826 828 static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, 827 - void *freelist_old, unsigned long counters_old, 828 - void *freelist_new, unsigned long counters_new, 829 - const char *n) 829 + struct freelist_counters *old, struct freelist_counters *new, const char *n) 830 830 { 831 831 bool ret; 832 832 833 833 if (s->flags & __CMPXCHG_DOUBLE) { 834 - ret = __update_freelist_fast(slab, freelist_old, counters_old, 835 - freelist_new, counters_new); 834 + ret = __update_freelist_fast(slab, old, new); 836 835 } else { 837 836 unsigned long flags; 838 837 839 838 local_irq_save(flags); 840 - ret = __update_freelist_slow(slab, freelist_old, counters_old, 841 - freelist_new, counters_new); 839 + ret = __update_freelist_slow(slab, old, new); 842 840 local_irq_restore(flags); 843 841 } 844 842 if (likely(ret)) ··· 972 978 static slab_flags_t slub_debug; 973 979 #endif 974 980 975 - static char *slub_debug_string; 981 + static const char *slub_debug_string __ro_after_init; 976 982 static int 
disable_higher_order_debug; 977 983 978 984 /* ··· 1779 1785 * 1780 1786 * returns the start of next block if there's any, or NULL 1781 1787 */ 1782 - static char * 1783 - parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) 1788 + static const char * 1789 + parse_slub_debug_flags(const char *str, slab_flags_t *flags, const char **slabs, bool init) 1784 1790 { 1785 1791 bool higher_order_disable = false; 1786 1792 ··· 1857 1863 return NULL; 1858 1864 } 1859 1865 1860 - static int __init setup_slub_debug(char *str) 1866 + static int __init setup_slub_debug(const char *str, const struct kernel_param *kp) 1861 1867 { 1862 1868 slab_flags_t flags; 1863 1869 slab_flags_t global_flags; 1864 - char *saved_str; 1865 - char *slab_list; 1870 + const char *saved_str; 1871 + const char *slab_list; 1866 1872 bool global_slub_debug_changed = false; 1867 1873 bool slab_list_specified = false; 1868 1874 1869 1875 global_flags = DEBUG_DEFAULT_FLAGS; 1870 - if (*str++ != '=' || !*str) 1876 + if (!str || !*str) 1871 1877 /* 1872 1878 * No options specified. Switch on full debugging. 
1873 1879 */ ··· 1911 1917 static_branch_unlikely(&init_on_free)) && 1912 1918 (slub_debug & SLAB_POISON)) 1913 1919 pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n"); 1914 - return 1; 1920 + return 0; 1915 1921 } 1916 1922 1917 - __setup("slab_debug", setup_slub_debug); 1918 - __setup_param("slub_debug", slub_debug, setup_slub_debug, 0); 1923 + static const struct kernel_param_ops param_ops_slab_debug __initconst = { 1924 + .flags = KERNEL_PARAM_OPS_FL_NOARG, 1925 + .set = setup_slub_debug, 1926 + }; 1927 + __core_param_cb(slab_debug, &param_ops_slab_debug, NULL, 0); 1928 + __core_param_cb(slub_debug, &param_ops_slab_debug, NULL, 0); 1919 1929 1920 1930 /* 1921 1931 * kmem_cache_flags - apply debugging options to the cache ··· 1933 1935 */ 1934 1936 slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) 1935 1937 { 1936 - char *iter; 1938 + const char *iter; 1937 1939 size_t len; 1938 - char *next_block; 1940 + const char *next_block; 1939 1941 slab_flags_t block_flags; 1940 1942 slab_flags_t slub_debug_local = slub_debug; 1941 1943 ··· 1959 1961 continue; 1960 1962 /* Found a block that has a slab list, search it */ 1961 1963 while (*iter) { 1962 - char *end, *glob; 1964 + const char *end, *glob; 1963 1965 size_t cmplen; 1964 1966 1965 1967 end = strchrnul(iter, ','); ··· 2021 2023 int objects) {} 2022 2024 static inline void dec_slabs_node(struct kmem_cache *s, int node, 2023 2025 int objects) {} 2024 - #ifndef CONFIG_SLUB_TINY 2025 2026 static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, 2026 2027 void **freelist, void *nextfree) 2027 2028 { 2028 2029 return false; 2029 2030 } 2030 - #endif 2031 2031 #endif /* CONFIG_SLUB_DEBUG */ 2032 + 2033 + /* 2034 + * The allocated objcg pointers array is not accounted directly. 2035 + * Moreover, it should not come from DMA buffer and is not readily 2036 + * reclaimable. So those GFP bits should be masked off. 
2037 + */ 2038 + #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ 2039 + __GFP_ACCOUNT | __GFP_NOFAIL) 2032 2040 2033 2041 #ifdef CONFIG_SLAB_OBJ_EXT 2034 2042 ··· 2089 2085 struct slabobj_ext *vec, unsigned int objects) {} 2090 2086 2091 2087 #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ 2092 - 2093 - /* 2094 - * The allocated objcg pointers array is not accounted directly. 2095 - * Moreover, it should not come from DMA buffer and is not readily 2096 - * reclaimable. So those GFP bits should be masked off. 2097 - */ 2098 - #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ 2099 - __GFP_ACCOUNT | __GFP_NOFAIL) 2100 2088 2101 2089 static inline void init_slab_obj_exts(struct slab *slab) 2102 2090 { ··· 2369 2373 { 2370 2374 struct slabobj_ext *slab_exts; 2371 2375 struct kmem_cache *s; 2372 - struct folio *folio; 2376 + struct page *page; 2373 2377 struct slab *slab; 2374 2378 unsigned long off; 2375 2379 2376 - folio = virt_to_folio(p); 2377 - if (!folio_test_slab(folio)) { 2380 + page = virt_to_page(p); 2381 + if (PageLargeKmalloc(page)) { 2382 + unsigned int order; 2378 2383 int size; 2379 2384 2380 - if (folio_memcg_kmem(folio)) 2385 + if (PageMemcgKmem(page)) 2381 2386 return true; 2382 2387 2383 - if (__memcg_kmem_charge_page(folio_page(folio, 0), flags, 2384 - folio_order(folio))) 2388 + order = large_kmalloc_order(page); 2389 + if (__memcg_kmem_charge_page(page, flags, order)) 2385 2390 return false; 2386 2391 2387 2392 /* 2388 - * This folio has already been accounted in the global stats but 2393 + * This page has already been accounted in the global stats but 2389 2394 * not in the memcg stats. So, subtract from the global and use 2390 2395 * the interface which adds to both global and memcg stats. 
2391 2396 */ 2392 - size = folio_size(folio); 2393 - node_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -size); 2394 - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, size); 2397 + size = PAGE_SIZE << order; 2398 + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, -size); 2399 + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, size); 2395 2400 return true; 2396 2401 } 2397 2402 2398 - slab = folio_slab(folio); 2403 + slab = page_slab(page); 2399 2404 s = slab->slab_cache; 2400 2405 2401 2406 /* ··· 2598 2601 2599 2602 static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) 2600 2603 { 2601 - struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects, 2602 - s->sheaf_capacity), gfp); 2604 + struct slab_sheaf *sheaf; 2605 + size_t sheaf_size; 2606 + 2607 + if (gfp & __GFP_NO_OBJ_EXT) 2608 + return NULL; 2609 + 2610 + gfp &= ~OBJCGS_CLEAR_MASK; 2611 + 2612 + /* 2613 + * Prevent recursion to the same cache, or a deep stack of kmallocs of 2614 + * varying sizes (sheaf capacity might differ for each kmalloc size 2615 + * bucket) 2616 + */ 2617 + if (s->flags & SLAB_KMALLOC) 2618 + gfp |= __GFP_NO_OBJ_EXT; 2619 + 2620 + sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); 2621 + sheaf = kzalloc(sheaf_size, gfp); 2603 2622 2604 2623 if (unlikely(!sheaf)) 2605 2624 return NULL; ··· 2668 2655 if (!sheaf) 2669 2656 return NULL; 2670 2657 2671 - if (refill_sheaf(s, sheaf, gfp)) { 2658 + if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC)) { 2672 2659 free_empty_sheaf(s, sheaf); 2673 2660 return NULL; 2674 2661 } ··· 2746 2733 sheaf->size = 0; 2747 2734 } 2748 2735 2749 - static void __rcu_free_sheaf_prepare(struct kmem_cache *s, 2736 + static bool __rcu_free_sheaf_prepare(struct kmem_cache *s, 2750 2737 struct slab_sheaf *sheaf) 2751 2738 { 2752 2739 bool init = slab_want_init_on_free(s); 2753 2740 void **p = &sheaf->objects[0]; 2754 2741 unsigned int i = 0; 2742 + bool pfmemalloc = false; 2755 2743 2756 2744 while (i < 
sheaf->size) { 2757 2745 struct slab *slab = virt_to_slab(p[i]); ··· 2765 2751 continue; 2766 2752 } 2767 2753 2754 + if (slab_test_pfmemalloc(slab)) 2755 + pfmemalloc = true; 2756 + 2768 2757 i++; 2769 2758 } 2759 + 2760 + return pfmemalloc; 2770 2761 } 2771 2762 2772 2763 static void rcu_free_sheaf_nobarn(struct rcu_head *head) ··· 3034 3015 3035 3016 static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) 3036 3017 { 3037 - struct list_head empty_list; 3038 - struct list_head full_list; 3018 + LIST_HEAD(empty_list); 3019 + LIST_HEAD(full_list); 3039 3020 struct slab_sheaf *sheaf, *sheaf2; 3040 3021 unsigned long flags; 3041 - 3042 - INIT_LIST_HEAD(&empty_list); 3043 - INIT_LIST_HEAD(&full_list); 3044 3022 3045 3023 spin_lock_irqsave(&barn->lock, flags); 3046 3024 ··· 3064 3048 struct kmem_cache_order_objects oo, 3065 3049 bool allow_spin) 3066 3050 { 3067 - struct folio *folio; 3051 + struct page *page; 3068 3052 struct slab *slab; 3069 3053 unsigned int order = oo_order(oo); 3070 3054 3071 3055 if (unlikely(!allow_spin)) 3072 - folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, 3056 + page = alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, 3073 3057 node, order); 3074 3058 else if (node == NUMA_NO_NODE) 3075 - folio = (struct folio *)alloc_frozen_pages(flags, order); 3059 + page = alloc_frozen_pages(flags, order); 3076 3060 else 3077 - folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); 3061 + page = __alloc_frozen_pages(flags, order, node, NULL); 3078 3062 3079 - if (!folio) 3063 + if (!page) 3080 3064 return NULL; 3081 3065 3082 - slab = folio_slab(folio); 3083 - __folio_set_slab(folio); 3084 - if (folio_is_pfmemalloc(folio)) 3066 + __SetPageSlab(page); 3067 + slab = page_slab(page); 3068 + if (page_is_pfmemalloc(page)) 3085 3069 slab_set_pfmemalloc(slab); 3086 3070 3087 3071 return slab; ··· 3305 3289 3306 3290 static void __free_slab(struct kmem_cache *s, struct slab *slab) 3307 3291 { 
3308 - struct folio *folio = slab_folio(slab); 3309 - int order = folio_order(folio); 3292 + struct page *page = slab_page(slab); 3293 + int order = compound_order(page); 3310 3294 int pages = 1 << order; 3311 3295 3312 3296 __slab_clear_pfmemalloc(slab); 3313 - folio->mapping = NULL; 3314 - __folio_clear_slab(folio); 3297 + page->mapping = NULL; 3298 + __ClearPageSlab(page); 3315 3299 mm_account_reclaimed_pages(pages); 3316 3300 unaccount_slab(slab, order, s); 3317 - free_frozen_pages(&folio->page, order); 3301 + free_frozen_pages(page, order); 3318 3302 } 3319 3303 3320 3304 static void rcu_free_slab(struct rcu_head *h) ··· 3634 3618 return get_any_partial(s, pc); 3635 3619 } 3636 3620 3637 - #ifndef CONFIG_SLUB_TINY 3638 - 3639 3621 #ifdef CONFIG_PREEMPTION 3640 3622 /* 3641 3623 * Calculate the next globally unique transaction for disambiguation ··· 3737 3723 void *nextfree, *freelist_iter, *freelist_tail; 3738 3724 int tail = DEACTIVATE_TO_HEAD; 3739 3725 unsigned long flags = 0; 3740 - struct slab new; 3741 - struct slab old; 3726 + struct freelist_counters old, new; 3742 3727 3743 3728 if (READ_ONCE(slab->freelist)) { 3744 3729 stat(s, DEACTIVATE_REMOTE_FREES); ··· 3786 3773 } else { 3787 3774 new.freelist = old.freelist; 3788 3775 } 3789 - } while (!slab_update_freelist(s, slab, 3790 - old.freelist, old.counters, 3791 - new.freelist, new.counters, 3792 - "unfreezing slab")); 3776 + } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); 3793 3777 3794 3778 /* 3795 3779 * Stage three: Manipulate the slab list based on the updated state. 
··· 4028 4018 4029 4019 return c->slab || slub_percpu_partial(c); 4030 4020 } 4031 - 4032 - #else /* CONFIG_SLUB_TINY */ 4033 - static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } 4034 - static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; } 4035 - static inline void flush_this_cpu_slab(struct kmem_cache *s) { } 4036 - #endif /* CONFIG_SLUB_TINY */ 4037 4021 4038 4022 static bool has_pcs_used(int cpu, struct kmem_cache *s) 4039 4023 { ··· 4369 4365 return true; 4370 4366 } 4371 4367 4372 - #ifndef CONFIG_SLUB_TINY 4373 4368 static inline bool 4374 4369 __update_cpu_freelist_fast(struct kmem_cache *s, 4375 4370 void *freelist_old, void *freelist_new, 4376 4371 unsigned long tid) 4377 4372 { 4378 - freelist_aba_t old = { .freelist = freelist_old, .counter = tid }; 4379 - freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) }; 4373 + struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; 4374 + struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; 4380 4375 4381 - return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid.full, 4382 - &old.full, new.full); 4376 + return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, 4377 + &old.freelist_tid, new.freelist_tid); 4383 4378 } 4384 4379 4385 4380 /* ··· 4391 4388 */ 4392 4389 static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) 4393 4390 { 4394 - struct slab new; 4395 - unsigned long counters; 4396 - void *freelist; 4391 + struct freelist_counters old, new; 4397 4392 4398 4393 lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); 4399 4394 4400 4395 do { 4401 - freelist = slab->freelist; 4402 - counters = slab->counters; 4396 + old.freelist = slab->freelist; 4397 + old.counters = slab->counters; 4403 4398 4404 - new.counters = counters; 4399 + new.freelist = NULL; 4400 + new.counters = old.counters; 4405 4401 4406 - new.inuse = slab->objects; 4407 - new.frozen = freelist != NULL; 4402 + 
new.inuse = old.objects; 4403 + new.frozen = old.freelist != NULL; 4408 4404 4409 - } while (!__slab_update_freelist(s, slab, 4410 - freelist, counters, 4411 - NULL, new.counters, 4412 - "get_freelist")); 4413 4405 4414 - return freelist; 4406 + } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); 4407 + 4408 + return old.freelist; 4415 4409 } 4416 4410 4417 4411 /* ··· 4416 4416 */ 4417 4417 static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) 4418 4418 { 4419 - struct slab new; 4420 - unsigned long counters; 4421 - void *freelist; 4419 + struct freelist_counters old, new; 4422 4420 4423 4421 do { 4424 - freelist = slab->freelist; 4425 - counters = slab->counters; 4422 + old.freelist = slab->freelist; 4423 + old.counters = slab->counters; 4426 4424 4427 - new.counters = counters; 4425 + new.freelist = NULL; 4426 + new.counters = old.counters; 4428 4427 VM_BUG_ON(new.frozen); 4429 4428 4430 - new.inuse = slab->objects; 4429 + new.inuse = old.objects; 4431 4430 new.frozen = 1; 4432 4431 4433 - } while (!slab_update_freelist(s, slab, 4434 - freelist, counters, 4435 - NULL, new.counters, 4436 - "freeze_slab")); 4432 + } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); 4437 4433 4438 - return freelist; 4434 + return old.freelist; 4439 4435 } 4440 4436 4441 4437 /* ··· 4625 4629 pc.orig_size = orig_size; 4626 4630 slab = get_partial(s, node, &pc); 4627 4631 if (slab) { 4628 - if (kmem_cache_debug(s)) { 4632 + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4629 4633 freelist = pc.object; 4630 4634 /* 4631 4635 * For debug caches here we had to go through ··· 4663 4667 4664 4668 stat(s, ALLOC_SLAB); 4665 4669 4666 - if (kmem_cache_debug(s)) { 4670 + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4667 4671 freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 4668 4672 4669 4673 if (unlikely(!freelist)) { ··· 4875 4879 4876 4880 return object; 4877 4881 } 4878 - #else /* 
CONFIG_SLUB_TINY */ 4879 - static void *__slab_alloc_node(struct kmem_cache *s, 4880 - gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) 4881 - { 4882 - struct partial_context pc; 4883 - struct slab *slab; 4884 - void *object; 4885 - 4886 - pc.flags = gfpflags; 4887 - pc.orig_size = orig_size; 4888 - slab = get_partial(s, node, &pc); 4889 - 4890 - if (slab) 4891 - return pc.object; 4892 - 4893 - slab = new_slab(s, gfpflags, node); 4894 - if (unlikely(!slab)) { 4895 - slab_out_of_memory(s, gfpflags, node); 4896 - return NULL; 4897 - } 4898 - 4899 - object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 4900 - 4901 - return object; 4902 - } 4903 - #endif /* CONFIG_SLUB_TINY */ 4904 4882 4905 4883 /* 4906 4884 * If the object has been wiped upon free, make sure it's fully initialized by ··· 5015 5045 return NULL; 5016 5046 5017 5047 if (empty) { 5018 - if (!refill_sheaf(s, empty, gfp)) { 5048 + if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC)) { 5019 5049 full = empty; 5020 5050 } else { 5021 5051 /* ··· 5126 5156 * be false because of cpu migration during an unlocked part of 5127 5157 * the current allocation or previous freeing process. 
5128 5158 */ 5129 - if (folio_nid(virt_to_folio(object)) != node) { 5159 + if (page_to_nid(virt_to_page(object)) != node) { 5130 5160 local_unlock(&s->cpu_sheaves->lock); 5131 5161 return NULL; 5132 5162 } ··· 5315 5345 } 5316 5346 EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); 5317 5347 5348 + static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, 5349 + struct slab_sheaf *sheaf, gfp_t gfp) 5350 + { 5351 + int ret = 0; 5352 + 5353 + ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC); 5354 + 5355 + if (likely(!ret || !gfp_pfmemalloc_allowed(gfp))) 5356 + return ret; 5357 + 5358 + /* 5359 + * if we are allowed to, refill sheaf with pfmemalloc but then remember 5360 + * it for when it's returned 5361 + */ 5362 + ret = refill_sheaf(s, sheaf, gfp); 5363 + sheaf->pfmemalloc = true; 5364 + 5365 + return ret; 5366 + } 5367 + 5318 5368 /* 5319 5369 * returns a sheaf that has at least the requested size 5320 5370 * when prefilling is needed, do so with given gfp flags ··· 5369 5379 sheaf->cache = s; 5370 5380 sheaf->capacity = size; 5371 5381 5382 + /* 5383 + * we do not need to care about pfmemalloc here because oversize 5384 + * sheaves area always flushed and freed when returned 5385 + */ 5372 5386 if (!__kmem_cache_alloc_bulk(s, gfp, size, 5373 5387 &sheaf->objects[0])) { 5374 5388 kfree(sheaf); ··· 5409 5415 if (!sheaf) 5410 5416 sheaf = alloc_empty_sheaf(s, gfp); 5411 5417 5412 - if (sheaf && sheaf->size < size) { 5413 - if (refill_sheaf(s, sheaf, gfp)) { 5418 + if (sheaf) { 5419 + sheaf->capacity = s->sheaf_capacity; 5420 + sheaf->pfmemalloc = false; 5421 + 5422 + if (sheaf->size < size && 5423 + __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) { 5414 5424 sheaf_flush_unused(s, sheaf); 5415 5425 free_empty_sheaf(s, sheaf); 5416 5426 sheaf = NULL; 5417 5427 } 5418 5428 } 5419 - 5420 - if (sheaf) 5421 - sheaf->capacity = s->sheaf_capacity; 5422 5429 5423 5430 return sheaf; 5424 5431 } ··· 5440 5445 struct slub_percpu_sheaves *pcs; 5441 5446 struct node_barn *barn; 5442 
5447 5443 - if (unlikely(sheaf->capacity != s->sheaf_capacity)) { 5448 + if (unlikely((sheaf->capacity != s->sheaf_capacity) 5449 + || sheaf->pfmemalloc)) { 5444 5450 sheaf_flush_unused(s, sheaf); 5445 5451 kfree(sheaf); 5446 5452 return; ··· 5507 5511 5508 5512 if (likely(sheaf->capacity >= size)) { 5509 5513 if (likely(sheaf->capacity == s->sheaf_capacity)) 5510 - return refill_sheaf(s, sheaf, gfp); 5514 + return __prefill_sheaf_pfmemalloc(s, sheaf, gfp); 5511 5515 5512 5516 if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, 5513 5517 &sheaf->objects[sheaf->size])) { ··· 5540 5544 * 5541 5545 * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT 5542 5546 * memcg charging is forced over limit if necessary, to avoid failure. 5547 + * 5548 + * It is possible that the allocation comes from kfence and then the sheaf 5549 + * size is not decreased. 5543 5550 */ 5544 5551 void * 5545 5552 kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, ··· 5554 5555 if (sheaf->size == 0) 5555 5556 goto out; 5556 5557 5557 - ret = sheaf->objects[--sheaf->size]; 5558 + ret = kfence_alloc(s, s->object_size, gfp); 5559 + 5560 + if (likely(!ret)) 5561 + ret = sheaf->objects[--sheaf->size]; 5558 5562 5559 5563 init = slab_want_init_on_alloc(gfp, s); 5560 5564 ··· 5580 5578 */ 5581 5579 static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) 5582 5580 { 5583 - struct folio *folio; 5581 + struct page *page; 5584 5582 void *ptr = NULL; 5585 5583 unsigned int order = get_order(size); 5586 5584 ··· 5590 5588 flags |= __GFP_COMP; 5591 5589 5592 5590 if (node == NUMA_NO_NODE) 5593 - folio = (struct folio *)alloc_frozen_pages_noprof(flags, order); 5591 + page = alloc_frozen_pages_noprof(flags, order); 5594 5592 else 5595 - folio = (struct folio *)__alloc_frozen_pages_noprof(flags, order, node, NULL); 5593 + page = __alloc_frozen_pages_noprof(flags, order, node, NULL); 5596 5594 5597 - if (folio) { 5598 - ptr = 
folio_address(folio); 5599 - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, 5595 + if (page) { 5596 + ptr = page_address(page); 5597 + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, 5600 5598 PAGE_SIZE << order); 5601 - __folio_set_large_kmalloc(folio); 5599 + __SetPageLargeKmalloc(page); 5602 5600 } 5603 5601 5604 5602 ptr = kasan_kmalloc_large(ptr, size, flags); ··· 5725 5723 * it did local_lock_irqsave(&s->cpu_slab->lock, flags). 5726 5724 * In this case fast path with __update_cpu_freelist_fast() is not safe. 5727 5725 */ 5728 - #ifndef CONFIG_SLUB_TINY 5729 5726 if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) 5730 - #endif 5731 5727 ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); 5732 5728 5733 5729 if (PTR_ERR(ret) == -EBUSY) { ··· 5863 5863 unsigned long addr) 5864 5864 5865 5865 { 5866 - void *prior; 5867 - int was_frozen; 5868 - struct slab new; 5869 - unsigned long counters; 5866 + bool was_frozen, was_full; 5867 + struct freelist_counters old, new; 5870 5868 struct kmem_cache_node *n = NULL; 5871 5869 unsigned long flags; 5872 5870 bool on_node_partial; ··· 5876 5878 return; 5877 5879 } 5878 5880 5881 + /* 5882 + * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below 5883 + * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) 5884 + * is the only other reason it can be false, and it is already handled 5885 + * above. 
5886 + */ 5887 + 5879 5888 do { 5880 5889 if (unlikely(n)) { 5881 5890 spin_unlock_irqrestore(&n->list_lock, flags); 5882 5891 n = NULL; 5883 5892 } 5884 - prior = slab->freelist; 5885 - counters = slab->counters; 5886 - set_freepointer(s, tail, prior); 5887 - new.counters = counters; 5888 - was_frozen = new.frozen; 5893 + 5894 + old.freelist = slab->freelist; 5895 + old.counters = slab->counters; 5896 + 5897 + was_full = (old.freelist == NULL); 5898 + was_frozen = old.frozen; 5899 + 5900 + set_freepointer(s, tail, old.freelist); 5901 + 5902 + new.freelist = head; 5903 + new.counters = old.counters; 5889 5904 new.inuse -= cnt; 5890 - if ((!new.inuse || !prior) && !was_frozen) { 5891 - /* Needs to be taken off a list */ 5892 - if (!kmem_cache_has_cpu_partial(s) || prior) { 5905 + 5906 + /* 5907 + * Might need to be taken off (due to becoming empty) or added 5908 + * to (due to not being full anymore) the partial list. 5909 + * Unless it's frozen. 5910 + */ 5911 + if ((!new.inuse || was_full) && !was_frozen) { 5912 + /* 5913 + * If slab becomes non-full and we have cpu partial 5914 + * lists, we put it there unconditionally to avoid 5915 + * taking the list_lock. Otherwise we need it. 5916 + */ 5917 + if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { 5893 5918 5894 5919 n = get_node(s, slab_nid(slab)); 5895 5920 /* ··· 5929 5908 } 5930 5909 } 5931 5910 5932 - } while (!slab_update_freelist(s, slab, 5933 - prior, counters, 5934 - head, new.counters, 5935 - "__slab_free")); 5911 + } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); 5936 5912 5937 5913 if (likely(!n)) { 5938 5914 ··· 5939 5921 * activity can be necessary. 5940 5922 */ 5941 5923 stat(s, FREE_FROZEN); 5942 - } else if (kmem_cache_has_cpu_partial(s) && !prior) { 5924 + } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { 5943 5925 /* 5944 5926 * If we started with a full slab then put it onto the 5945 5927 * per cpu partial list. 
··· 5948 5930 stat(s, CPU_PARTIAL_FREE); 5949 5931 } 5950 5932 5933 + /* 5934 + * In other cases we didn't take the list_lock because the slab 5935 + * was already on the partial list and will remain there. 5936 + */ 5937 + 5951 5938 return; 5952 5939 } 5953 5940 ··· 5960 5937 * This slab was partially empty but not on the per-node partial list, 5961 5938 * in which case we shouldn't manipulate its list, just return. 5962 5939 */ 5963 - if (prior && !on_node_partial) { 5940 + if (!was_full && !on_node_partial) { 5964 5941 spin_unlock_irqrestore(&n->list_lock, flags); 5965 5942 return; 5966 5943 } 5967 5944 5945 + /* 5946 + * If slab became empty, should we add/keep it on the partial list or we 5947 + * have enough? 5948 + */ 5968 5949 if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) 5969 5950 goto slab_empty; 5970 5951 5971 5952 /* 5972 5953 * Objects left in the slab. If it was not on the partial list before 5973 - * then add it. 5954 + * then add it. This can only happen when cache has no per cpu partial 5955 + * list otherwise we would have put it there. 5974 5956 */ 5975 - if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { 5957 + if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { 5976 5958 add_partial(n, slab, DEACTIVATE_TO_TAIL); 5977 5959 stat(s, FREE_ADD_PARTIAL); 5978 5960 } ··· 5985 5957 return; 5986 5958 5987 5959 slab_empty: 5988 - if (prior) { 5989 - /* 5990 - * Slab on the partial list. 5991 - */ 5960 + /* 5961 + * The slab could have a single object and thus go from full to empty in 5962 + * a single free, but more likely it was on the partial list. Remove it. 5963 + */ 5964 + if (likely(!was_full)) { 5992 5965 remove_partial(n, slab); 5993 5966 stat(s, FREE_REMOVE_PARTIAL); 5994 5967 } ··· 6214 6185 * handles it fine. The only downside is that sheaf will serve fewer 6215 6186 * allocations when reused. It only happens due to debugging, which is a 6216 6187 * performance hit anyway. 
6188 + * 6189 + * If it returns true, there was at least one object from pfmemalloc 6190 + * slab so simply flush everything. 6217 6191 */ 6218 - __rcu_free_sheaf_prepare(s, sheaf); 6192 + if (__rcu_free_sheaf_prepare(s, sheaf)) 6193 + goto flush; 6219 6194 6220 6195 n = get_node(s, sheaf->node); 6221 6196 if (!n) ··· 6372 6339 continue; 6373 6340 } 6374 6341 6375 - if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) { 6342 + if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node) 6343 + || slab_test_pfmemalloc(slab))) { 6376 6344 remote_objects[remote_nr] = p[i]; 6377 6345 p[i] = p[--size]; 6378 6346 if (++remote_nr >= PCS_BATCH_MAX) ··· 6521 6487 llist_for_each_safe(pos, t, llnode) { 6522 6488 struct slab *slab = container_of(pos, struct slab, llnode); 6523 6489 6524 - #ifdef CONFIG_SLUB_TINY 6525 - free_slab(slab->slab_cache, slab); 6526 - #else 6527 6490 if (slab->frozen) 6528 6491 deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); 6529 6492 else 6530 6493 free_slab(slab->slab_cache, slab); 6531 - #endif 6532 6494 } 6533 6495 } 6534 6496 ··· 6560 6530 irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); 6561 6531 } 6562 6532 6563 - #ifndef CONFIG_SLUB_TINY 6564 6533 /* 6565 6534 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that 6566 6535 * can perform fastpath freeing without additional function calls. 
··· 6652 6623 } 6653 6624 stat_add(s, FREE_FASTPATH, cnt); 6654 6625 } 6655 - #else /* CONFIG_SLUB_TINY */ 6656 - static void do_slab_free(struct kmem_cache *s, 6657 - struct slab *slab, void *head, void *tail, 6658 - int cnt, unsigned long addr) 6659 - { 6660 - __slab_free(s, slab, head, tail, cnt, addr); 6661 - } 6662 - #endif /* CONFIG_SLUB_TINY */ 6663 6626 6664 6627 static __fastpath_inline 6665 6628 void slab_free(struct kmem_cache *s, struct slab *slab, void *object, ··· 6664 6643 return; 6665 6644 6666 6645 if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || 6667 - slab_nid(slab) == numa_mem_id())) { 6646 + slab_nid(slab) == numa_mem_id()) 6647 + && likely(!slab_test_pfmemalloc(slab))) { 6668 6648 if (likely(free_to_pcs(s, object))) 6669 6649 return; 6670 6650 } ··· 6775 6753 } 6776 6754 EXPORT_SYMBOL(kmem_cache_free); 6777 6755 6778 - static void free_large_kmalloc(struct folio *folio, void *object) 6756 + static void free_large_kmalloc(struct page *page, void *object) 6779 6757 { 6780 - unsigned int order = folio_order(folio); 6758 + unsigned int order = compound_order(page); 6781 6759 6782 - if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) { 6783 - dump_page(&folio->page, "Not a kmalloc allocation"); 6760 + if (WARN_ON_ONCE(!PageLargeKmalloc(page))) { 6761 + dump_page(page, "Not a kmalloc allocation"); 6784 6762 return; 6785 6763 } 6786 6764 ··· 6791 6769 kasan_kfree_large(object); 6792 6770 kmsan_kfree_large(object); 6793 6771 6794 - lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, 6772 + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, 6795 6773 -(PAGE_SIZE << order)); 6796 - __folio_clear_large_kmalloc(folio); 6797 - free_frozen_pages(&folio->page, order); 6774 + __ClearPageLargeKmalloc(page); 6775 + free_frozen_pages(page, order); 6798 6776 } 6799 6777 6800 6778 /* ··· 6804 6782 void kvfree_rcu_cb(struct rcu_head *head) 6805 6783 { 6806 6784 void *obj = head; 6807 - struct folio *folio; 6785 + struct page *page; 6808 6786 struct slab 
*slab; 6809 6787 struct kmem_cache *s; 6810 6788 void *slab_addr; ··· 6815 6793 return; 6816 6794 } 6817 6795 6818 - folio = virt_to_folio(obj); 6819 - if (!folio_test_slab(folio)) { 6796 + page = virt_to_page(obj); 6797 + slab = page_slab(page); 6798 + if (!slab) { 6820 6799 /* 6821 6800 * rcu_head offset can be only less than page size so no need to 6822 - * consider folio order 6801 + * consider allocation order 6823 6802 */ 6824 6803 obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); 6825 - free_large_kmalloc(folio, obj); 6804 + free_large_kmalloc(page, obj); 6826 6805 return; 6827 6806 } 6828 6807 6829 - slab = folio_slab(folio); 6830 6808 s = slab->slab_cache; 6831 - slab_addr = folio_address(folio); 6809 + slab_addr = slab_address(slab); 6832 6810 6833 6811 if (is_kfence_address(obj)) { 6834 6812 obj = kfence_object_start(obj); ··· 6850 6828 */ 6851 6829 void kfree(const void *object) 6852 6830 { 6853 - struct folio *folio; 6831 + struct page *page; 6854 6832 struct slab *slab; 6855 6833 struct kmem_cache *s; 6856 6834 void *x = (void *)object; ··· 6860 6838 if (unlikely(ZERO_OR_NULL_PTR(object))) 6861 6839 return; 6862 6840 6863 - folio = virt_to_folio(object); 6864 - if (unlikely(!folio_test_slab(folio))) { 6865 - free_large_kmalloc(folio, (void *)object); 6841 + page = virt_to_page(object); 6842 + slab = page_slab(page); 6843 + if (!slab) { 6844 + free_large_kmalloc(page, (void *)object); 6866 6845 return; 6867 6846 } 6868 6847 6869 - slab = folio_slab(folio); 6870 6848 s = slab->slab_cache; 6871 6849 slab_free(s, slab, x, _RET_IP_); 6872 6850 } ··· 6883 6861 */ 6884 6862 void kfree_nolock(const void *object) 6885 6863 { 6886 - struct folio *folio; 6887 6864 struct slab *slab; 6888 6865 struct kmem_cache *s; 6889 6866 void *x = (void *)object; ··· 6890 6869 if (unlikely(ZERO_OR_NULL_PTR(object))) 6891 6870 return; 6892 6871 6893 - folio = virt_to_folio(object); 6894 - if (unlikely(!folio_test_slab(folio))) { 6872 + slab = virt_to_slab(object); 6873 + if 
(unlikely(!slab)) { 6895 6874 WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); 6896 6875 return; 6897 6876 } 6898 6877 6899 - slab = folio_slab(folio); 6900 6878 s = slab->slab_cache; 6901 6879 6902 6880 memcg_slab_free_hook(s, slab, &x, 1); ··· 6927 6907 * since kasan quarantine takes locks and not supported from NMI. 6928 6908 */ 6929 6909 kasan_slab_free(s, x, false, false, /* skip quarantine */true); 6930 - #ifndef CONFIG_SLUB_TINY 6931 6910 do_slab_free(s, slab, x, x, 0, _RET_IP_); 6932 - #else 6933 - defer_free(s, x); 6934 - #endif 6935 6911 } 6936 6912 EXPORT_SYMBOL_GPL(kfree_nolock); 6937 6913 ··· 6959 6943 if (is_kfence_address(p)) { 6960 6944 ks = orig_size = kfence_ksize(p); 6961 6945 } else { 6962 - struct folio *folio; 6946 + struct page *page = virt_to_page(p); 6947 + struct slab *slab = page_slab(page); 6963 6948 6964 - folio = virt_to_folio(p); 6965 - if (unlikely(!folio_test_slab(folio))) { 6949 + if (!slab) { 6966 6950 /* Big kmalloc object */ 6967 - WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE); 6968 - WARN_ON(p != folio_address(folio)); 6969 - ks = folio_size(folio); 6951 + ks = page_size(page); 6952 + WARN_ON(ks <= KMALLOC_MAX_CACHE_SIZE); 6953 + WARN_ON(p != page_address(page)); 6970 6954 } else { 6971 - s = folio_slab(folio)->slab_cache; 6955 + s = slab->slab_cache; 6972 6956 orig_size = get_orig_size(s, (void *)p); 6973 6957 ks = s->object_size; 6974 6958 } ··· 7272 7256 { 7273 7257 int lookahead = 3; 7274 7258 void *object; 7275 - struct folio *folio; 7259 + struct page *page; 7260 + struct slab *slab; 7276 7261 size_t same; 7277 7262 7278 7263 object = p[--size]; 7279 - folio = virt_to_folio(object); 7264 + page = virt_to_page(object); 7265 + slab = page_slab(page); 7280 7266 if (!s) { 7281 7267 /* Handle kalloc'ed objects */ 7282 - if (unlikely(!folio_test_slab(folio))) { 7283 - free_large_kmalloc(folio, object); 7268 + if (!slab) { 7269 + free_large_kmalloc(page, object); 7284 7270 df->slab = NULL; 7285 7271 
return size; 7286 7272 } 7287 7273 /* Derive kmem_cache from object */ 7288 - df->slab = folio_slab(folio); 7289 - df->s = df->slab->slab_cache; 7274 + df->slab = slab; 7275 + df->s = slab->slab_cache; 7290 7276 } else { 7291 - df->slab = folio_slab(folio); 7277 + df->slab = slab; 7292 7278 df->s = cache_from_obj(s, object); /* Support for memcg */ 7293 7279 } 7294 7280 ··· 7379 7361 } 7380 7362 EXPORT_SYMBOL(kmem_cache_free_bulk); 7381 7363 7382 - #ifndef CONFIG_SLUB_TINY 7383 7364 static inline 7384 7365 int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 7385 7366 void **p) ··· 7396 7379 local_lock_irqsave(&s->cpu_slab->lock, irqflags); 7397 7380 7398 7381 for (i = 0; i < size; i++) { 7399 - void *object = kfence_alloc(s, s->object_size, flags); 7382 + void *object = c->freelist; 7400 7383 7401 - if (unlikely(object)) { 7402 - p[i] = object; 7403 - continue; 7404 - } 7405 - 7406 - object = c->freelist; 7407 7384 if (unlikely(!object)) { 7408 7385 /* 7409 7386 * We may have removed an object from c->freelist using ··· 7443 7432 return 0; 7444 7433 7445 7434 } 7446 - #else /* CONFIG_SLUB_TINY */ 7447 - static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 7448 - size_t size, void **p) 7449 - { 7450 - int i; 7451 - 7452 - for (i = 0; i < size; i++) { 7453 - void *object = kfence_alloc(s, s->object_size, flags); 7454 - 7455 - if (unlikely(object)) { 7456 - p[i] = object; 7457 - continue; 7458 - } 7459 - 7460 - p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE, 7461 - _RET_IP_, s->object_size); 7462 - if (unlikely(!p[i])) 7463 - goto error; 7464 - 7465 - maybe_wipe_obj_freeptr(s, p[i]); 7466 - } 7467 - 7468 - return i; 7469 - 7470 - error: 7471 - __kmem_cache_free_bulk(s, i, p); 7472 - return 0; 7473 - } 7474 - #endif /* CONFIG_SLUB_TINY */ 7475 7435 7476 7436 /* Note that interrupts must be enabled when calling this function. 
*/ 7477 7437 int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, 7478 7438 void **p) 7479 7439 { 7480 7440 unsigned int i = 0; 7441 + void *kfence_obj; 7481 7442 7482 7443 if (!size) 7483 7444 return 0; ··· 7457 7474 s = slab_pre_alloc_hook(s, flags); 7458 7475 if (unlikely(!s)) 7459 7476 return 0; 7477 + 7478 + /* 7479 + * to make things simpler, only assume at most once kfence allocated 7480 + * object per bulk allocation and choose its index randomly 7481 + */ 7482 + kfence_obj = kfence_alloc(s, s->object_size, flags); 7483 + 7484 + if (unlikely(kfence_obj)) { 7485 + if (unlikely(size == 1)) { 7486 + p[0] = kfence_obj; 7487 + goto out; 7488 + } 7489 + size--; 7490 + } 7460 7491 7461 7492 if (s->cpu_sheaves) 7462 7493 i = alloc_from_pcs_bulk(s, size, p); ··· 7483 7486 if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { 7484 7487 if (i > 0) 7485 7488 __kmem_cache_free_bulk(s, i, p); 7489 + if (kfence_obj) 7490 + __kfence_free(kfence_obj); 7486 7491 return 0; 7487 7492 } 7488 7493 } 7489 7494 7495 + if (unlikely(kfence_obj)) { 7496 + int idx = get_random_u32_below(size + 1); 7497 + 7498 + if (idx != size) 7499 + p[size] = p[idx]; 7500 + p[idx] = kfence_obj; 7501 + 7502 + size++; 7503 + } 7504 + 7505 + out: 7490 7506 /* 7491 7507 * memcg and kmem_cache debug support and memory initialization. 7492 7508 * Done outside of the IRQ disabled fastpath loop. 
··· 7661 7651 barn_init(barn); 7662 7652 } 7663 7653 7664 - #ifndef CONFIG_SLUB_TINY 7665 7654 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 7666 7655 { 7667 7656 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < ··· 7681 7672 7682 7673 return 1; 7683 7674 } 7684 - #else 7685 - static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 7686 - { 7687 - return 1; 7688 - } 7689 - #endif /* CONFIG_SLUB_TINY */ 7690 7675 7691 7676 static int init_percpu_sheaves(struct kmem_cache *s) 7692 7677 { ··· 7770 7767 cache_random_seq_destroy(s); 7771 7768 if (s->cpu_sheaves) 7772 7769 pcs_destroy(s); 7773 - #ifndef CONFIG_SLUB_TINY 7774 7770 #ifdef CONFIG_PREEMPT_RT 7775 7771 if (s->cpu_slab) 7776 7772 lockdep_unregister_key(&s->lock_key); 7777 7773 #endif 7778 7774 free_percpu(s->cpu_slab); 7779 - #endif 7780 7775 free_kmem_cache_nodes(s); 7781 7776 } 7782 7777 ··· 8140 8139 * Kmalloc subsystem 8141 8140 *******************************************************************/ 8142 8141 8143 - static int __init setup_slub_min_order(char *str) 8142 + static int __init setup_slub_min_order(const char *str, const struct kernel_param *kp) 8144 8143 { 8145 - get_option(&str, (int *)&slub_min_order); 8144 + int ret; 8145 + 8146 + ret = kstrtouint(str, 0, &slub_min_order); 8147 + if (ret) 8148 + return ret; 8146 8149 8147 8150 if (slub_min_order > slub_max_order) 8148 8151 slub_max_order = slub_min_order; 8149 8152 8150 - return 1; 8153 + return 0; 8151 8154 } 8152 8155 8153 - __setup("slab_min_order=", setup_slub_min_order); 8154 - __setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0); 8156 + static const struct kernel_param_ops param_ops_slab_min_order __initconst = { 8157 + .set = setup_slub_min_order, 8158 + }; 8159 + __core_param_cb(slab_min_order, &param_ops_slab_min_order, &slub_min_order, 0); 8160 + __core_param_cb(slub_min_order, &param_ops_slab_min_order, &slub_min_order, 0); 8155 8161 8156 - 8157 - static int __init setup_slub_max_order(char *str) 
8162 + static int __init setup_slub_max_order(const char *str, const struct kernel_param *kp) 8158 8163 { 8159 - get_option(&str, (int *)&slub_max_order); 8164 + int ret; 8165 + 8166 + ret = kstrtouint(str, 0, &slub_max_order); 8167 + if (ret) 8168 + return ret; 8169 + 8160 8170 slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER); 8161 8171 8162 8172 if (slub_min_order > slub_max_order) 8163 8173 slub_min_order = slub_max_order; 8164 8174 8165 - return 1; 8175 + return 0; 8166 8176 } 8167 8177 8168 - __setup("slab_max_order=", setup_slub_max_order); 8169 - __setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0); 8178 + static const struct kernel_param_ops param_ops_slab_max_order __initconst = { 8179 + .set = setup_slub_max_order, 8180 + }; 8181 + __core_param_cb(slab_max_order, &param_ops_slab_max_order, &slub_max_order, 0); 8182 + __core_param_cb(slub_max_order, &param_ops_slab_max_order, &slub_max_order, 0); 8170 8183 8171 - static int __init setup_slub_min_objects(char *str) 8172 - { 8173 - get_option(&str, (int *)&slub_min_objects); 8174 - 8175 - return 1; 8176 - } 8177 - 8178 - __setup("slab_min_objects=", setup_slub_min_objects); 8179 - __setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0); 8184 + core_param(slab_min_objects, slub_min_objects, uint, 0); 8185 + core_param(slub_min_objects, slub_min_objects, uint, 0); 8180 8186 8181 8187 #ifdef CONFIG_NUMA 8182 - static int __init setup_slab_strict_numa(char *str) 8188 + static int __init setup_slab_strict_numa(const char *str, const struct kernel_param *kp) 8183 8189 { 8184 8190 if (nr_node_ids > 1) { 8185 8191 static_branch_enable(&strict_numa); ··· 8195 8187 pr_warn("slab_strict_numa parameter set on non NUMA system.\n"); 8196 8188 } 8197 8189 8198 - return 1; 8190 + return 0; 8199 8191 } 8200 8192 8201 - __setup("slab_strict_numa", setup_slab_strict_numa); 8193 + static const struct kernel_param_ops param_ops_slab_strict_numa __initconst = { 8194 
+ .flags = KERNEL_PARAM_OPS_FL_NOARG, 8195 + .set = setup_slab_strict_numa, 8196 + }; 8197 + __core_param_cb(slab_strict_numa, &param_ops_slab_strict_numa, NULL, 0); 8202 8198 #endif 8203 8199 8204 8200 ··· 8528 8516 8529 8517 void __init kmem_cache_init_late(void) 8530 8518 { 8531 - #ifndef CONFIG_SLUB_TINY 8532 8519 flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); 8533 8520 WARN_ON(!flushwq); 8534 - #endif 8535 8521 } 8536 8522 8537 8523 struct kmem_cache *
+16 -8
mm/usercopy.c
··· 164 164 { 165 165 unsigned long addr = (unsigned long)ptr; 166 166 unsigned long offset; 167 - struct folio *folio; 167 + struct page *page; 168 + struct slab *slab; 168 169 169 170 if (is_kmap_addr(ptr)) { 170 171 offset = offset_in_page(ptr); ··· 190 189 if (!virt_addr_valid(ptr)) 191 190 return; 192 191 193 - folio = virt_to_folio(ptr); 194 - 195 - if (folio_test_slab(folio)) { 192 + page = virt_to_page(ptr); 193 + slab = page_slab(page); 194 + if (slab) { 196 195 /* Check slab allocator for flags and size. */ 197 - __check_heap_object(ptr, n, folio_slab(folio), to_user); 198 - } else if (folio_test_large(folio)) { 199 - offset = ptr - folio_address(folio); 200 - if (n > folio_size(folio) - offset) 196 + __check_heap_object(ptr, n, slab, to_user); 197 + } else if (PageCompound(page)) { 198 + page = compound_head(page); 199 + offset = ptr - page_address(page); 200 + if (n > page_size(page) - offset) 201 201 usercopy_abort("page alloc", NULL, to_user, offset, n); 202 202 } 203 + 204 + /* 205 + * We cannot check non-compound pages. They might be part of 206 + * a large allocation, in which case crossing a page boundary 207 + * is fine. 208 + */ 203 209 } 204 210 205 211 DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,