Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'slab-for-6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab

Pull slab updates from Vlastimil Babka:

- SLUB: delayed freezing of CPU partial slabs (Chengming Zhou)

Freezing is an operation involving cmpxchg_double() that makes a slab
exclusive for a particular CPU. Chengming noticed that we use it also
in situations where we are not yet installing the slab as the CPU
slab, because freezing also indicates that the slab is not on the
shared list. This results in redundant freeze/unfreeze operation and
can be avoided by marking separately the shared list presence by
reusing the PG_workingset flag.

This approach neatly avoids the issues described in 9b1ea29bc0d7
("Revert "mm, slub: consider rest of partial list if acquire_slab()
fails"") as we can now grab a slab from the shared list in a quick
and guaranteed way without the cmpxchg_double() operation that
amplifies the lock contention and can fail.

As a result, lkp has reported 34.2% improvement of
stress-ng.rawudp.ops_per_sec

- SLAB removal and SLUB cleanups (Vlastimil Babka)

The SLAB allocator has been deprecated since 6.5 and nobody has
objected so far. We agreed at LSF/MM to wait until the next LTS,
which is 6.6, so we should be good to go now.

This doesn't yet erase all traces of SLAB outside of mm/ so some dead
code, comments or documentation remain, and will be cleaned up
gradually (some series are already in the works).

Removing the choice of allocators has already allowed us to simplify
and optimize the code wiring up the kmalloc APIs to the SLUB
implementation.

* tag 'slab-for-6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: (34 commits)
mm/slub: free KFENCE objects in slab_free_hook()
mm/slub: handle bulk and single object freeing separately
mm/slub: introduce __kmem_cache_free_bulk() without free hooks
mm/slub: fix bulk alloc and free stats
mm/slub: optimize free fast path code layout
mm/slub: optimize alloc fastpath code layout
mm/slub: remove slab_alloc() and __kmem_cache_alloc_lru() wrappers
mm/slab: move kmalloc() functions from slab_common.c to slub.c
mm/slab: move kmalloc_slab() to mm/slab.h
mm/slab: move kfree() from slab_common.c to slub.c
mm/slab: move struct kmem_cache_node from slab.h to slub.c
mm/slab: move memcg related functions from slab.h to slub.c
mm/slab: move pre/post-alloc hooks from slab.h to slub.c
mm/slab: consolidate includes in the internal mm/slab.h
mm/slab: move the rest of slub_def.h to mm/slab.h
mm/slab: move struct kmem_cache_cpu declaration to slub.c
mm/slab: remove mm/slab.c and slab_def.h
mm/mempool/dmapool: remove CONFIG_DEBUG_SLAB ifdefs
mm/slab: remove CONFIG_SLAB code from slab common code
cpu/hotplug: remove CPUHP_SLAB_PREPARE hooks
...

+1096 -5377
+8 -4
CREDITS
··· 9 9 Linus 10 10 ---------- 11 11 12 - N: Matt Mackal 13 - E: mpm@selenic.com 14 - D: SLOB slab allocator 15 - 16 12 N: Matti Aarnio 17 13 E: mea@nic.funet.fi 18 14 D: Alpha systems hacking, IPv6 and other network related stuff ··· 1568 1572 S: 6020 Innsbruck 1569 1573 S: Austria 1570 1574 1575 + N: Mark Hemment 1576 + E: markhe@nextd.demon.co.uk 1577 + D: SLAB allocator implementation 1578 + 1571 1579 N: Richard Henderson 1572 1580 E: rth@twiddle.net 1573 1581 E: rth@cygnus.com ··· 2444 2444 D: work on suspend-to-ram/disk, killing duplicates from ioctl32, 2445 2445 D: Altera SoCFPGA and Nokia N900 support. 2446 2446 S: Czech Republic 2447 + 2448 + N: Olivia Mackall 2449 + E: olivia@selenic.com 2450 + D: SLOB slab allocator 2447 2451 2448 2452 N: Paul Mackerras 2449 2453 E: paulus@samba.org
+1 -1
Documentation/core-api/mm-api.rst
··· 37 37 .. kernel-doc:: include/linux/slab.h 38 38 :internal: 39 39 40 - .. kernel-doc:: mm/slab.c 40 + .. kernel-doc:: mm/slub.c 41 41 :export: 42 42 43 43 .. kernel-doc:: mm/slab_common.c
+1 -1
arch/arm64/Kconfig
··· 154 154 select HAVE_MOVE_PUD 155 155 select HAVE_PCI 156 156 select HAVE_ACPI_APEI if (ACPI && EFI) 157 - select HAVE_ALIGNED_STRUCT_PAGE if SLUB 157 + select HAVE_ALIGNED_STRUCT_PAGE 158 158 select HAVE_ARCH_AUDITSYSCALL 159 159 select HAVE_ARCH_BITREVERSE 160 160 select HAVE_ARCH_COMPILER_H
+1 -1
arch/s390/Kconfig
··· 146 146 select GENERIC_TIME_VSYSCALL 147 147 select GENERIC_VDSO_TIME_NS 148 148 select GENERIC_IOREMAP if PCI 149 - select HAVE_ALIGNED_STRUCT_PAGE if SLUB 149 + select HAVE_ALIGNED_STRUCT_PAGE 150 150 select HAVE_ARCH_AUDITSYSCALL 151 151 select HAVE_ARCH_JUMP_LABEL 152 152 select HAVE_ARCH_JUMP_LABEL_RELATIVE
+1 -1
arch/x86/Kconfig
··· 169 169 select HAS_IOPORT 170 170 select HAVE_ACPI_APEI if ACPI 171 171 select HAVE_ACPI_APEI_NMI if ACPI 172 - select HAVE_ALIGNED_STRUCT_PAGE if SLUB 172 + select HAVE_ALIGNED_STRUCT_PAGE 173 173 select HAVE_ARCH_AUDITSYSCALL 174 174 select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE 175 175 select HAVE_ARCH_HUGE_VMALLOC if X86_64
-1
include/linux/cpuhotplug.h
··· 104 104 CPUHP_X2APIC_PREPARE, 105 105 CPUHP_SMPCFD_PREPARE, 106 106 CPUHP_RELAY_PREPARE, 107 - CPUHP_SLAB_PREPARE, 108 107 CPUHP_MD_RAID5_PREPARE, 109 108 CPUHP_RCUTREE_PREP, 110 109 CPUHP_CPUIDLE_COUPLED_PREPARE,
+2 -20
include/linux/slab.h
··· 24 24 25 25 /* 26 26 * Flags to pass to kmem_cache_create(). 27 - * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. 27 + * The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise are no-op 28 28 */ 29 29 /* DEBUG: Perform (expensive) checks on alloc/free */ 30 30 #define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U) ··· 302 302 * Kmalloc array related definitions 303 303 */ 304 304 305 - #ifdef CONFIG_SLAB 306 305 /* 307 - * SLAB and SLUB directly allocates requests fitting in to an order-1 page 306 + * SLUB directly allocates requests fitting in to an order-1 page 308 307 * (PAGE_SIZE*2). Larger requests are passed to the page allocator. 309 308 */ 310 309 #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) 311 310 #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) 312 311 #ifndef KMALLOC_SHIFT_LOW 313 - #define KMALLOC_SHIFT_LOW 5 314 - #endif 315 - #endif 316 - 317 - #ifdef CONFIG_SLUB 318 - #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) 319 - #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) 320 - #ifndef KMALLOC_SHIFT_LOW 321 312 #define KMALLOC_SHIFT_LOW 3 322 - #endif 323 313 #endif 324 314 325 315 /* Maximum allocatable size */ ··· 777 787 size_t kmalloc_size_roundup(size_t size); 778 788 779 789 void __init kmem_cache_init_late(void); 780 - 781 - #if defined(CONFIG_SMP) && defined(CONFIG_SLAB) 782 - int slab_prepare_cpu(unsigned int cpu); 783 - int slab_dead_cpu(unsigned int cpu); 784 - #else 785 - #define slab_prepare_cpu NULL 786 - #define slab_dead_cpu NULL 787 - #endif 788 790 789 791 #endif /* _LINUX_SLAB_H */
-124
include/linux/slab_def.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0 */ 2 - #ifndef _LINUX_SLAB_DEF_H 3 - #define _LINUX_SLAB_DEF_H 4 - 5 - #include <linux/kfence.h> 6 - #include <linux/reciprocal_div.h> 7 - 8 - /* 9 - * Definitions unique to the original Linux SLAB allocator. 10 - */ 11 - 12 - struct kmem_cache { 13 - struct array_cache __percpu *cpu_cache; 14 - 15 - /* 1) Cache tunables. Protected by slab_mutex */ 16 - unsigned int batchcount; 17 - unsigned int limit; 18 - unsigned int shared; 19 - 20 - unsigned int size; 21 - struct reciprocal_value reciprocal_buffer_size; 22 - /* 2) touched by every alloc & free from the backend */ 23 - 24 - slab_flags_t flags; /* constant flags */ 25 - unsigned int num; /* # of objs per slab */ 26 - 27 - /* 3) cache_grow/shrink */ 28 - /* order of pgs per slab (2^n) */ 29 - unsigned int gfporder; 30 - 31 - /* force GFP flags, e.g. GFP_DMA */ 32 - gfp_t allocflags; 33 - 34 - size_t colour; /* cache colouring range */ 35 - unsigned int colour_off; /* colour offset */ 36 - unsigned int freelist_size; 37 - 38 - /* constructor func */ 39 - void (*ctor)(void *obj); 40 - 41 - /* 4) cache creation/removal */ 42 - const char *name; 43 - struct list_head list; 44 - int refcount; 45 - int object_size; 46 - int align; 47 - 48 - /* 5) statistics */ 49 - #ifdef CONFIG_DEBUG_SLAB 50 - unsigned long num_active; 51 - unsigned long num_allocations; 52 - unsigned long high_mark; 53 - unsigned long grown; 54 - unsigned long reaped; 55 - unsigned long errors; 56 - unsigned long max_freeable; 57 - unsigned long node_allocs; 58 - unsigned long node_frees; 59 - unsigned long node_overflow; 60 - atomic_t allochit; 61 - atomic_t allocmiss; 62 - atomic_t freehit; 63 - atomic_t freemiss; 64 - 65 - /* 66 - * If debugging is enabled, then the allocator can add additional 67 - * fields and/or padding to every object. 
'size' contains the total 68 - * object size including these internal fields, while 'obj_offset' 69 - * and 'object_size' contain the offset to the user object and its 70 - * size. 71 - */ 72 - int obj_offset; 73 - #endif /* CONFIG_DEBUG_SLAB */ 74 - 75 - #ifdef CONFIG_KASAN_GENERIC 76 - struct kasan_cache kasan_info; 77 - #endif 78 - 79 - #ifdef CONFIG_SLAB_FREELIST_RANDOM 80 - unsigned int *random_seq; 81 - #endif 82 - 83 - #ifdef CONFIG_HARDENED_USERCOPY 84 - unsigned int useroffset; /* Usercopy region offset */ 85 - unsigned int usersize; /* Usercopy region size */ 86 - #endif 87 - 88 - struct kmem_cache_node *node[MAX_NUMNODES]; 89 - }; 90 - 91 - static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, 92 - void *x) 93 - { 94 - void *object = x - (x - slab->s_mem) % cache->size; 95 - void *last_object = slab->s_mem + (cache->num - 1) * cache->size; 96 - 97 - if (unlikely(object > last_object)) 98 - return last_object; 99 - else 100 - return object; 101 - } 102 - 103 - /* 104 - * We want to avoid an expensive divide : (offset / cache->size) 105 - * Using the fact that size is a constant for a particular cache, 106 - * we can replace (offset / cache->size) by 107 - * reciprocal_divide(offset, cache->reciprocal_buffer_size) 108 - */ 109 - static inline unsigned int obj_to_index(const struct kmem_cache *cache, 110 - const struct slab *slab, void *obj) 111 - { 112 - u32 offset = (obj - slab->s_mem); 113 - return reciprocal_divide(offset, cache->reciprocal_buffer_size); 114 - } 115 - 116 - static inline int objs_per_slab(const struct kmem_cache *cache, 117 - const struct slab *slab) 118 - { 119 - if (is_kfence_address(slab_address(slab))) 120 - return 1; 121 - return cache->num; 122 - } 123 - 124 - #endif /* _LINUX_SLAB_DEF_H */
-204
include/linux/slub_def.h
··· 1 - /* SPDX-License-Identifier: GPL-2.0 */ 2 - #ifndef _LINUX_SLUB_DEF_H 3 - #define _LINUX_SLUB_DEF_H 4 - 5 - /* 6 - * SLUB : A Slab allocator without object queues. 7 - * 8 - * (C) 2007 SGI, Christoph Lameter 9 - */ 10 - #include <linux/kfence.h> 11 - #include <linux/kobject.h> 12 - #include <linux/reciprocal_div.h> 13 - #include <linux/local_lock.h> 14 - 15 - enum stat_item { 16 - ALLOC_FASTPATH, /* Allocation from cpu slab */ 17 - ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ 18 - FREE_FASTPATH, /* Free to cpu slab */ 19 - FREE_SLOWPATH, /* Freeing not to cpu slab */ 20 - FREE_FROZEN, /* Freeing to frozen slab */ 21 - FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ 22 - FREE_REMOVE_PARTIAL, /* Freeing removes last object */ 23 - ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ 24 - ALLOC_SLAB, /* Cpu slab acquired from page allocator */ 25 - ALLOC_REFILL, /* Refill cpu slab from slab freelist */ 26 - ALLOC_NODE_MISMATCH, /* Switching cpu slab */ 27 - FREE_SLAB, /* Slab freed to the page allocator */ 28 - CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ 29 - DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ 30 - DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ 31 - DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ 32 - DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ 33 - DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ 34 - DEACTIVATE_BYPASS, /* Implicit deactivation */ 35 - ORDER_FALLBACK, /* Number of times fallback was necessary */ 36 - CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ 37 - CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */ 38 - CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ 39 - CPU_PARTIAL_FREE, /* Refill cpu partial on free */ 40 - CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ 41 - CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ 42 - 
NR_SLUB_STAT_ITEMS 43 - }; 44 - 45 - #ifndef CONFIG_SLUB_TINY 46 - /* 47 - * When changing the layout, make sure freelist and tid are still compatible 48 - * with this_cpu_cmpxchg_double() alignment requirements. 49 - */ 50 - struct kmem_cache_cpu { 51 - union { 52 - struct { 53 - void **freelist; /* Pointer to next available object */ 54 - unsigned long tid; /* Globally unique transaction id */ 55 - }; 56 - freelist_aba_t freelist_tid; 57 - }; 58 - struct slab *slab; /* The slab from which we are allocating */ 59 - #ifdef CONFIG_SLUB_CPU_PARTIAL 60 - struct slab *partial; /* Partially allocated frozen slabs */ 61 - #endif 62 - local_lock_t lock; /* Protects the fields above */ 63 - #ifdef CONFIG_SLUB_STATS 64 - unsigned stat[NR_SLUB_STAT_ITEMS]; 65 - #endif 66 - }; 67 - #endif /* CONFIG_SLUB_TINY */ 68 - 69 - #ifdef CONFIG_SLUB_CPU_PARTIAL 70 - #define slub_percpu_partial(c) ((c)->partial) 71 - 72 - #define slub_set_percpu_partial(c, p) \ 73 - ({ \ 74 - slub_percpu_partial(c) = (p)->next; \ 75 - }) 76 - 77 - #define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) 78 - #else 79 - #define slub_percpu_partial(c) NULL 80 - 81 - #define slub_set_percpu_partial(c, p) 82 - 83 - #define slub_percpu_partial_read_once(c) NULL 84 - #endif // CONFIG_SLUB_CPU_PARTIAL 85 - 86 - /* 87 - * Word size structure that can be atomically updated or read and that 88 - * contains both the order and the number of objects that a slab of the 89 - * given order would contain. 90 - */ 91 - struct kmem_cache_order_objects { 92 - unsigned int x; 93 - }; 94 - 95 - /* 96 - * Slab cache management. 97 - */ 98 - struct kmem_cache { 99 - #ifndef CONFIG_SLUB_TINY 100 - struct kmem_cache_cpu __percpu *cpu_slab; 101 - #endif 102 - /* Used for retrieving partial slabs, etc. 
*/ 103 - slab_flags_t flags; 104 - unsigned long min_partial; 105 - unsigned int size; /* The size of an object including metadata */ 106 - unsigned int object_size;/* The size of an object without metadata */ 107 - struct reciprocal_value reciprocal_size; 108 - unsigned int offset; /* Free pointer offset */ 109 - #ifdef CONFIG_SLUB_CPU_PARTIAL 110 - /* Number of per cpu partial objects to keep around */ 111 - unsigned int cpu_partial; 112 - /* Number of per cpu partial slabs to keep around */ 113 - unsigned int cpu_partial_slabs; 114 - #endif 115 - struct kmem_cache_order_objects oo; 116 - 117 - /* Allocation and freeing of slabs */ 118 - struct kmem_cache_order_objects min; 119 - gfp_t allocflags; /* gfp flags to use on each alloc */ 120 - int refcount; /* Refcount for slab cache destroy */ 121 - void (*ctor)(void *); 122 - unsigned int inuse; /* Offset to metadata */ 123 - unsigned int align; /* Alignment */ 124 - unsigned int red_left_pad; /* Left redzone padding size */ 125 - const char *name; /* Name (only for display!) */ 126 - struct list_head list; /* List of slab caches */ 127 - #ifdef CONFIG_SYSFS 128 - struct kobject kobj; /* For sysfs */ 129 - #endif 130 - #ifdef CONFIG_SLAB_FREELIST_HARDENED 131 - unsigned long random; 132 - #endif 133 - 134 - #ifdef CONFIG_NUMA 135 - /* 136 - * Defragmentation by allocating from a remote node. 
137 - */ 138 - unsigned int remote_node_defrag_ratio; 139 - #endif 140 - 141 - #ifdef CONFIG_SLAB_FREELIST_RANDOM 142 - unsigned int *random_seq; 143 - #endif 144 - 145 - #ifdef CONFIG_KASAN_GENERIC 146 - struct kasan_cache kasan_info; 147 - #endif 148 - 149 - #ifdef CONFIG_HARDENED_USERCOPY 150 - unsigned int useroffset; /* Usercopy region offset */ 151 - unsigned int usersize; /* Usercopy region size */ 152 - #endif 153 - 154 - struct kmem_cache_node *node[MAX_NUMNODES]; 155 - }; 156 - 157 - #if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) 158 - #define SLAB_SUPPORTS_SYSFS 159 - void sysfs_slab_unlink(struct kmem_cache *); 160 - void sysfs_slab_release(struct kmem_cache *); 161 - #else 162 - static inline void sysfs_slab_unlink(struct kmem_cache *s) 163 - { 164 - } 165 - static inline void sysfs_slab_release(struct kmem_cache *s) 166 - { 167 - } 168 - #endif 169 - 170 - void *fixup_red_left(struct kmem_cache *s, void *p); 171 - 172 - static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, 173 - void *x) { 174 - void *object = x - (x - slab_address(slab)) % cache->size; 175 - void *last_object = slab_address(slab) + 176 - (slab->objects - 1) * cache->size; 177 - void *result = (unlikely(object > last_object)) ? 
last_object : object; 178 - 179 - result = fixup_red_left(cache, result); 180 - return result; 181 - } 182 - 183 - /* Determine object index from a given position */ 184 - static inline unsigned int __obj_to_index(const struct kmem_cache *cache, 185 - void *addr, void *obj) 186 - { 187 - return reciprocal_divide(kasan_reset_tag(obj) - addr, 188 - cache->reciprocal_size); 189 - } 190 - 191 - static inline unsigned int obj_to_index(const struct kmem_cache *cache, 192 - const struct slab *slab, void *obj) 193 - { 194 - if (is_kfence_address(obj)) 195 - return 0; 196 - return __obj_to_index(cache, slab_address(slab), obj); 197 - } 198 - 199 - static inline int objs_per_slab(const struct kmem_cache *cache, 200 - const struct slab *slab) 201 - { 202 - return slab->objects; 203 - } 204 - #endif /* _LINUX_SLUB_DEF_H */
-5
kernel/cpu.c
··· 2125 2125 .startup.single = relay_prepare_cpu, 2126 2126 .teardown.single = NULL, 2127 2127 }, 2128 - [CPUHP_SLAB_PREPARE] = { 2129 - .name = "slab:prepare", 2130 - .startup.single = slab_prepare_cpu, 2131 - .teardown.single = slab_dead_cpu, 2132 - }, 2133 2128 [CPUHP_RCUTREE_PREP] = { 2134 2129 .name = "RCU/tree:prepare", 2135 2130 .startup.single = rcutree_prepare_cpu,
-1
lib/Kconfig.debug
··· 1970 1970 config FAILSLAB 1971 1971 bool "Fault-injection capability for kmalloc" 1972 1972 depends on FAULT_INJECTION 1973 - depends on SLAB || SLUB 1974 1973 help 1975 1974 Provide fault-injection capability for kmalloc. 1976 1975
+3 -8
lib/Kconfig.kasan
··· 37 37 (HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS)) && \ 38 38 CC_HAS_WORKING_NOSANITIZE_ADDRESS) || \ 39 39 HAVE_ARCH_KASAN_HW_TAGS 40 - depends on (SLUB && SYSFS && !SLUB_TINY) || (SLAB && !DEBUG_SLAB) 40 + depends on SYSFS && !SLUB_TINY 41 41 select STACKDEPOT_ALWAYS_INIT 42 42 help 43 43 Enables KASAN (Kernel Address Sanitizer) - a dynamic memory safety ··· 78 78 bool "Generic KASAN" 79 79 depends on HAVE_ARCH_KASAN && CC_HAS_KASAN_GENERIC 80 80 depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS 81 - select SLUB_DEBUG if SLUB 81 + select SLUB_DEBUG 82 82 select CONSTRUCTORS 83 83 help 84 84 Enables Generic KASAN. ··· 89 89 overhead of ~50% for dynamic allocations. 90 90 The performance slowdown is ~x3. 91 91 92 - (Incompatible with CONFIG_DEBUG_SLAB: the kernel does not boot.) 93 - 94 92 config KASAN_SW_TAGS 95 93 bool "Software Tag-Based KASAN" 96 94 depends on HAVE_ARCH_KASAN_SW_TAGS && CC_HAS_KASAN_SW_TAGS 97 95 depends on CC_HAS_WORKING_NOSANITIZE_ADDRESS 98 - select SLUB_DEBUG if SLUB 96 + select SLUB_DEBUG 99 97 select CONSTRUCTORS 100 98 help 101 99 Enables Software Tag-Based KASAN. ··· 108 110 May potentially introduce problems related to pointer casting and 109 111 comparison, as it embeds a tag into the top byte of each pointer. 110 112 111 - (Incompatible with CONFIG_DEBUG_SLAB: the kernel does not boot.) 112 - 113 113 config KASAN_HW_TAGS 114 114 bool "Hardware Tag-Based KASAN" 115 115 depends on HAVE_ARCH_KASAN_HW_TAGS 116 - depends on SLUB 117 116 help 118 117 Enables Hardware Tag-Based KASAN. 119 118
+1 -1
lib/Kconfig.kfence
··· 5 5 6 6 menuconfig KFENCE 7 7 bool "KFENCE: low-overhead sampling-based memory safety error detector" 8 - depends on HAVE_ARCH_KFENCE && (SLAB || SLUB) 8 + depends on HAVE_ARCH_KFENCE 9 9 select STACKTRACE 10 10 select IRQ_WORK 11 11 help
+1 -1
lib/Kconfig.kmsan
··· 11 11 config KMSAN 12 12 bool "KMSAN: detector of uninitialized values use" 13 13 depends on HAVE_ARCH_KMSAN && HAVE_KMSAN_COMPILER 14 - depends on SLUB && DEBUG_KERNEL && !KASAN && !KCSAN 14 + depends on DEBUG_KERNEL && !KASAN && !KCSAN 15 15 depends on !PREEMPT_RT 16 16 select STACKDEPOT 17 17 select STACKDEPOT_ALWAYS_INIT
+15 -53
mm/Kconfig
··· 226 226 227 227 For more information, see zsmalloc documentation. 228 228 229 - menu "SLAB allocator options" 230 - 231 - choice 232 - prompt "Choose SLAB allocator" 233 - default SLUB 234 - help 235 - This option allows to select a slab allocator. 236 - 237 - config SLAB_DEPRECATED 238 - bool "SLAB (DEPRECATED)" 239 - depends on !PREEMPT_RT 240 - help 241 - Deprecated and scheduled for removal in a few cycles. Replaced by 242 - SLUB. 243 - 244 - If you cannot migrate to SLUB, please contact linux-mm@kvack.org 245 - and the people listed in the SLAB ALLOCATOR section of MAINTAINERS 246 - file, explaining why. 247 - 248 - The regular slab allocator that is established and known to work 249 - well in all environments. It organizes cache hot objects in 250 - per cpu and per node queues. 229 + menu "Slab allocator options" 251 230 252 231 config SLUB 253 - bool "SLUB (Unqueued Allocator)" 254 - help 255 - SLUB is a slab allocator that minimizes cache line usage 256 - instead of managing queues of cached objects (SLAB approach). 257 - Per cpu caching is realized using slabs of objects instead 258 - of queues of objects. SLUB can use memory efficiently 259 - and has enhanced diagnostics. SLUB is the default choice for 260 - a slab allocator. 261 - 262 - endchoice 263 - 264 - config SLAB 265 - bool 266 - default y 267 - depends on SLAB_DEPRECATED 232 + def_bool y 268 233 269 234 config SLUB_TINY 270 - bool "Configure SLUB for minimal memory footprint" 271 - depends on SLUB && EXPERT 235 + bool "Configure for minimal memory footprint" 236 + depends on EXPERT 272 237 select SLAB_MERGE_DEFAULT 273 238 help 274 - Configures the SLUB allocator in a way to achieve minimal memory 239 + Configures the slab allocator in a way to achieve minimal memory 275 240 footprint, sacrificing scalability, debugging and other features. 
276 241 This is intended only for the smallest system that had used the 277 242 SLOB allocator and is not recommended for systems with more than ··· 247 282 config SLAB_MERGE_DEFAULT 248 283 bool "Allow slab caches to be merged" 249 284 default y 250 - depends on SLAB || SLUB 251 285 help 252 286 For reduced kernel memory fragmentation, slab caches can be 253 287 merged when they share the same size and other characteristics. ··· 260 296 261 297 config SLAB_FREELIST_RANDOM 262 298 bool "Randomize slab freelist" 263 - depends on SLAB || (SLUB && !SLUB_TINY) 299 + depends on !SLUB_TINY 264 300 help 265 301 Randomizes the freelist order used on creating new pages. This 266 302 security feature reduces the predictability of the kernel slab ··· 268 304 269 305 config SLAB_FREELIST_HARDENED 270 306 bool "Harden slab freelist metadata" 271 - depends on SLAB || (SLUB && !SLUB_TINY) 307 + depends on !SLUB_TINY 272 308 help 273 309 Many kernel heap attacks try to target slab cache metadata and 274 310 other infrastructure. This options makes minor performance 275 311 sacrifices to harden the kernel slab allocator against common 276 - freelist exploit methods. Some slab implementations have more 277 - sanity-checking than others. This option is most effective with 278 - CONFIG_SLUB. 312 + freelist exploit methods. 279 313 280 314 config SLUB_STATS 281 315 default n 282 - bool "Enable SLUB performance statistics" 283 - depends on SLUB && SYSFS && !SLUB_TINY 316 + bool "Enable performance statistics" 317 + depends on SYSFS && !SLUB_TINY 284 318 help 285 - SLUB statistics are useful to debug SLUBs allocation behavior in 319 + The statistics are useful to debug slab allocation behavior in 286 320 order find ways to optimize the allocator. This should never be 287 321 enabled for production use since keeping statistics slows down 288 322 the allocator by a few percentage points. 
The slabinfo command ··· 290 328 291 329 config SLUB_CPU_PARTIAL 292 330 default y 293 - depends on SLUB && SMP && !SLUB_TINY 294 - bool "SLUB per cpu partial cache" 331 + depends on SMP && !SLUB_TINY 332 + bool "Enable per cpu partial caches" 295 333 help 296 334 Per cpu partial caches accelerate objects allocation and freeing 297 335 that is local to a processor at the price of more indeterminism ··· 301 339 302 340 config RANDOM_KMALLOC_CACHES 303 341 default n 304 - depends on SLUB && !SLUB_TINY 342 + depends on !SLUB_TINY 305 343 bool "Randomize slab caches for normal kmalloc" 306 344 help 307 345 A hardening feature that creates multiple copies of slab caches for ··· 316 354 limited degree of memory and CPU overhead that relates to hardware and 317 355 system workload. 318 356 319 - endmenu # SLAB allocator options 357 + endmenu # Slab allocator options 320 358 321 359 config SHUFFLE_PAGE_ALLOCATOR 322 360 bool "Page allocator randomization"
+4 -12
mm/Kconfig.debug
··· 45 45 Enable debug page memory allocations by default? This value 46 46 can be overridden by debug_pagealloc=off|on. 47 47 48 - config DEBUG_SLAB 49 - bool "Debug slab memory allocations" 50 - depends on DEBUG_KERNEL && SLAB 51 - help 52 - Say Y here to have the kernel do limited verification on memory 53 - allocation as well as poisoning memory on free to catch use of freed 54 - memory. This can make kmalloc/kfree-intensive workloads much slower. 55 - 56 48 config SLUB_DEBUG 57 49 default y 58 50 bool "Enable SLUB debugging support" if EXPERT 59 - depends on SLUB && SYSFS && !SLUB_TINY 51 + depends on SYSFS && !SLUB_TINY 60 52 select STACKDEPOT if STACKTRACE_SUPPORT 61 53 help 62 54 SLUB has extensive debug support features. Disabling these can ··· 58 66 59 67 config SLUB_DEBUG_ON 60 68 bool "SLUB debugging on by default" 61 - depends on SLUB && SLUB_DEBUG 69 + depends on SLUB_DEBUG 62 70 select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT 63 71 default n 64 72 help ··· 223 231 allocations. See Documentation/dev-tools/kmemleak.rst for more 224 232 details. 225 233 226 - Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances 227 - of finding leaks due to the slab objects poisoning. 234 + Enabling SLUB_DEBUG may increase the chances of finding leaks 235 + due to the slab objects poisoning. 228 236 229 237 In order to access the kmemleak file, debugfs needs to be 230 238 mounted (usually at /sys/kernel/debug).
+1 -5
mm/Makefile
··· 4 4 # 5 5 6 6 KASAN_SANITIZE_slab_common.o := n 7 - KASAN_SANITIZE_slab.o := n 8 7 KASAN_SANITIZE_slub.o := n 9 8 KCSAN_SANITIZE_kmemleak.o := n 10 9 ··· 11 12 # the same word but accesses to different bits of that word. Re-enable KCSAN 12 13 # for these when we have more consensus on what to do about them. 13 14 KCSAN_SANITIZE_slab_common.o := n 14 - KCSAN_SANITIZE_slab.o := n 15 15 KCSAN_SANITIZE_slub.o := n 16 16 KCSAN_SANITIZE_page_alloc.o := n 17 17 # But enable explicit instrumentation for memory barriers. ··· 20 22 # flaky coverage that is not a function of syscall inputs. E.g. slab is out of 21 23 # free pages, or a task is migrated between nodes. 22 24 KCOV_INSTRUMENT_slab_common.o := n 23 - KCOV_INSTRUMENT_slab.o := n 24 25 KCOV_INSTRUMENT_slub.o := n 25 26 KCOV_INSTRUMENT_page_alloc.o := n 26 27 KCOV_INSTRUMENT_debug-pagealloc.o := n ··· 63 66 obj-y += init-mm.o 64 67 obj-y += memblock.o 65 68 obj-y += $(memory-hotplug-y) 69 + obj-y += slub.o 66 70 67 71 ifdef CONFIG_MMU 68 72 obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o ··· 80 82 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 81 83 obj-$(CONFIG_KSM) += ksm.o 82 84 obj-$(CONFIG_PAGE_POISONING) += page_poison.o 83 - obj-$(CONFIG_SLAB) += slab.o 84 - obj-$(CONFIG_SLUB) += slub.o 85 85 obj-$(CONFIG_KASAN) += kasan/ 86 86 obj-$(CONFIG_KFENCE) += kfence/ 87 87 obj-$(CONFIG_KMSAN) += kmsan/
+1 -1
mm/dmapool.c
··· 36 36 #include <linux/types.h> 37 37 #include <linux/wait.h> 38 38 39 - #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) 39 + #ifdef CONFIG_SLUB_DEBUG_ON 40 40 #define DMAPOOL_DEBUG 1 41 41 #endif 42 42
+2 -11
mm/kasan/common.c
··· 153 153 * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be 154 154 * accessed after being freed. We preassign tags for objects in these 155 155 * caches as well. 156 - * 3. For SLAB allocator we can't preassign tags randomly since the freelist 157 - * is stored as an array of indexes instead of a linked list. Assign tags 158 - * based on objects indexes, so that objects that are next to each other 159 - * get different tags. 160 156 */ 161 157 static inline u8 assign_tag(struct kmem_cache *cache, 162 158 const void *object, bool init) ··· 167 171 if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) 168 172 return init ? KASAN_TAG_KERNEL : kasan_random_tag(); 169 173 170 - /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ 171 - #ifdef CONFIG_SLAB 172 - /* For SLAB assign tags based on the object index in the freelist. */ 173 - return (u8)obj_to_index(cache, virt_to_slab(object), (void *)object); 174 - #else 175 174 /* 176 - * For SLUB assign a random tag during slab creation, otherwise reuse 175 + * For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU, 176 + * assign a random tag during slab creation, otherwise reuse 177 177 * the already assigned tag. 178 178 */ 179 179 return init ? kasan_random_tag() : get_tag(object); 180 - #endif 181 180 } 182 181 183 182 void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
+1 -2
mm/kasan/kasan.h
··· 373 373 void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); 374 374 void kasan_save_free_info(struct kmem_cache *cache, void *object); 375 375 376 - #if defined(CONFIG_KASAN_GENERIC) && \ 377 - (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) 376 + #ifdef CONFIG_KASAN_GENERIC 378 377 bool kasan_quarantine_put(struct kmem_cache *cache, void *object); 379 378 void kasan_quarantine_reduce(void); 380 379 void kasan_quarantine_remove_cache(struct kmem_cache *cache);
-7
mm/kasan/quarantine.c
··· 144 144 { 145 145 void *object = qlink_to_object(qlink, cache); 146 146 struct kasan_free_meta *meta = kasan_get_free_meta(cache, object); 147 - unsigned long flags; 148 - 149 - if (IS_ENABLED(CONFIG_SLAB)) 150 - local_irq_save(flags); 151 147 152 148 /* 153 149 * If init_on_free is enabled and KASAN's free metadata is stored in ··· 162 166 *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE; 163 167 164 168 ___cache_free(cache, object, _THIS_IP_); 165 - 166 - if (IS_ENABLED(CONFIG_SLAB)) 167 - local_irq_restore(flags); 168 169 } 169 170 170 171 static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache)
+1
mm/kasan/report.c
··· 23 23 #include <linux/stacktrace.h> 24 24 #include <linux/string.h> 25 25 #include <linux/types.h> 26 + #include <linux/vmalloc.h> 26 27 #include <linux/kasan.h> 27 28 #include <linux/module.h> 28 29 #include <linux/sched/task_stack.h>
-4
mm/kfence/core.c
··· 463 463 /* Set required slab fields. */ 464 464 slab = virt_to_slab((void *)meta->addr); 465 465 slab->slab_cache = cache; 466 - #if defined(CONFIG_SLUB) 467 466 slab->objects = 1; 468 - #elif defined(CONFIG_SLAB) 469 - slab->s_mem = addr; 470 - #endif 471 467 472 468 /* Memory initialization. */ 473 469 set_canary(meta);
+3 -3
mm/memcontrol.c
··· 64 64 #include <linux/psi.h> 65 65 #include <linux/seq_buf.h> 66 66 #include <linux/sched/isolation.h> 67 + #include <linux/kmemleak.h> 67 68 #include "internal.h" 68 69 #include <net/sock.h> 69 70 #include <net/ip.h> ··· 5151 5150 return ret; 5152 5151 } 5153 5152 5154 - #if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 5153 + #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) 5155 5154 static int mem_cgroup_slab_show(struct seq_file *m, void *p) 5156 5155 { 5157 5156 /* ··· 5260 5259 .write = mem_cgroup_reset, 5261 5260 .read_u64 = mem_cgroup_read_u64, 5262 5261 }, 5263 - #if defined(CONFIG_MEMCG_KMEM) && \ 5264 - (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) 5262 + #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) 5265 5263 { 5266 5264 .name = "kmem.slabinfo", 5267 5265 .seq_show = mem_cgroup_slab_show,
+3 -3
mm/mempool.c
··· 20 20 #include <linux/writeback.h> 21 21 #include "slab.h" 22 22 23 - #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) 23 + #ifdef CONFIG_SLUB_DEBUG_ON 24 24 static void poison_error(mempool_t *pool, void *element, size_t size, 25 25 size_t byte) 26 26 { ··· 95 95 kunmap_atomic(addr); 96 96 } 97 97 } 98 - #else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ 98 + #else /* CONFIG_SLUB_DEBUG_ON */ 99 99 static inline void check_element(mempool_t *pool, void *element) 100 100 { 101 101 } 102 102 static inline void poison_element(mempool_t *pool, void *element) 103 103 { 104 104 } 105 - #endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ 105 + #endif /* CONFIG_SLUB_DEBUG_ON */ 106 106 107 107 static __always_inline void kasan_poison_element(mempool_t *pool, void *element) 108 108 {
-4026
mm/slab.c
··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - /* 3 - * linux/mm/slab.c 4 - * Written by Mark Hemment, 1996/97. 5 - * (markhe@nextd.demon.co.uk) 6 - * 7 - * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli 8 - * 9 - * Major cleanup, different bufctl logic, per-cpu arrays 10 - * (c) 2000 Manfred Spraul 11 - * 12 - * Cleanup, make the head arrays unconditional, preparation for NUMA 13 - * (c) 2002 Manfred Spraul 14 - * 15 - * An implementation of the Slab Allocator as described in outline in; 16 - * UNIX Internals: The New Frontiers by Uresh Vahalia 17 - * Pub: Prentice Hall ISBN 0-13-101908-2 18 - * or with a little more detail in; 19 - * The Slab Allocator: An Object-Caching Kernel Memory Allocator 20 - * Jeff Bonwick (Sun Microsystems). 21 - * Presented at: USENIX Summer 1994 Technical Conference 22 - * 23 - * The memory is organized in caches, one cache for each object type. 24 - * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) 25 - * Each cache consists out of many slabs (they are small (usually one 26 - * page long) and always contiguous), and each slab contains multiple 27 - * initialized objects. 28 - * 29 - * This means, that your constructor is used only for newly allocated 30 - * slabs and you must pass objects with the same initializations to 31 - * kmem_cache_free. 32 - * 33 - * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, 34 - * normal). If you need a special memory type, then must create a new 35 - * cache for that memory type. 36 - * 37 - * In order to reduce fragmentation, the slabs are sorted in 3 groups: 38 - * full slabs with 0 free objects 39 - * partial slabs 40 - * empty slabs with no allocated objects 41 - * 42 - * If partial slabs exist, then new allocations come from these slabs, 43 - * otherwise from empty slabs or new slabs are allocated. 44 - * 45 - * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache 46 - * during kmem_cache_destroy(). 
The caller must prevent concurrent allocs. 47 - * 48 - * Each cache has a short per-cpu head array, most allocs 49 - * and frees go into that array, and if that array overflows, then 1/2 50 - * of the entries in the array are given back into the global cache. 51 - * The head array is strictly LIFO and should improve the cache hit rates. 52 - * On SMP, it additionally reduces the spinlock operations. 53 - * 54 - * The c_cpuarray may not be read with enabled local interrupts - 55 - * it's changed with a smp_call_function(). 56 - * 57 - * SMP synchronization: 58 - * constructors and destructors are called without any locking. 59 - * Several members in struct kmem_cache and struct slab never change, they 60 - * are accessed without any locking. 61 - * The per-cpu arrays are never accessed from the wrong cpu, no locking, 62 - * and local interrupts are disabled so slab code is preempt-safe. 63 - * The non-constant members are protected with a per-cache irq spinlock. 64 - * 65 - * Many thanks to Mark Hemment, who wrote another per-cpu slab patch 66 - * in 2000 - many ideas in the current implementation are derived from 67 - * his patch. 68 - * 69 - * Further notes from the original documentation: 70 - * 71 - * 11 April '97. Started multi-threading - markhe 72 - * The global cache-chain is protected by the mutex 'slab_mutex'. 73 - * The sem is only needed when accessing/extending the cache-chain, which 74 - * can never happen inside an interrupt (kmem_cache_create(), 75 - * kmem_cache_shrink() and kmem_cache_reap()). 76 - * 77 - * At present, each engine can be growing a cache. This should be blocked. 78 - * 79 - * 15 March 2005. NUMA slab allocator. 80 - * Shai Fultheim <shai@scalex86.org>. 81 - * Shobhit Dayal <shobhit@calsoftinc.com> 82 - * Alok N Kataria <alokk@calsoftinc.com> 83 - * Christoph Lameter <christoph@lameter.com> 84 - * 85 - * Modified the slab allocator to be node aware on NUMA systems. 86 - * Each node has its own list of partial, free and full slabs. 
87 - * All object allocations for a node occur from node specific slab lists. 88 - */ 89 - 90 - #include <linux/slab.h> 91 - #include <linux/mm.h> 92 - #include <linux/poison.h> 93 - #include <linux/swap.h> 94 - #include <linux/cache.h> 95 - #include <linux/interrupt.h> 96 - #include <linux/init.h> 97 - #include <linux/compiler.h> 98 - #include <linux/cpuset.h> 99 - #include <linux/proc_fs.h> 100 - #include <linux/seq_file.h> 101 - #include <linux/notifier.h> 102 - #include <linux/kallsyms.h> 103 - #include <linux/kfence.h> 104 - #include <linux/cpu.h> 105 - #include <linux/sysctl.h> 106 - #include <linux/module.h> 107 - #include <linux/rcupdate.h> 108 - #include <linux/string.h> 109 - #include <linux/uaccess.h> 110 - #include <linux/nodemask.h> 111 - #include <linux/kmemleak.h> 112 - #include <linux/mempolicy.h> 113 - #include <linux/mutex.h> 114 - #include <linux/fault-inject.h> 115 - #include <linux/rtmutex.h> 116 - #include <linux/reciprocal_div.h> 117 - #include <linux/debugobjects.h> 118 - #include <linux/memory.h> 119 - #include <linux/prefetch.h> 120 - #include <linux/sched/task_stack.h> 121 - 122 - #include <net/sock.h> 123 - 124 - #include <asm/cacheflush.h> 125 - #include <asm/tlbflush.h> 126 - #include <asm/page.h> 127 - 128 - #include <trace/events/kmem.h> 129 - 130 - #include "internal.h" 131 - 132 - #include "slab.h" 133 - 134 - /* 135 - * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 136 - * 0 for faster, smaller code (especially in the critical paths). 137 - * 138 - * STATS - 1 to collect stats for /proc/slabinfo. 139 - * 0 for faster, smaller code (especially in the critical paths). 
140 - * 141 - * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) 142 - */ 143 - 144 - #ifdef CONFIG_DEBUG_SLAB 145 - #define DEBUG 1 146 - #define STATS 1 147 - #define FORCED_DEBUG 1 148 - #else 149 - #define DEBUG 0 150 - #define STATS 0 151 - #define FORCED_DEBUG 0 152 - #endif 153 - 154 - /* Shouldn't this be in a header file somewhere? */ 155 - #define BYTES_PER_WORD sizeof(void *) 156 - #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) 157 - 158 - #ifndef ARCH_KMALLOC_FLAGS 159 - #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 160 - #endif 161 - 162 - #define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ 163 - <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) 164 - 165 - #if FREELIST_BYTE_INDEX 166 - typedef unsigned char freelist_idx_t; 167 - #else 168 - typedef unsigned short freelist_idx_t; 169 - #endif 170 - 171 - #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) 172 - 173 - /* 174 - * struct array_cache 175 - * 176 - * Purpose: 177 - * - LIFO ordering, to hand out cache-warm objects from _alloc 178 - * - reduce the number of linked list operations 179 - * - reduce spinlock operations 180 - * 181 - * The limit is stored in the per-cpu structure to reduce the data cache 182 - * footprint. 183 - * 184 - */ 185 - struct array_cache { 186 - unsigned int avail; 187 - unsigned int limit; 188 - unsigned int batchcount; 189 - unsigned int touched; 190 - void *entry[]; /* 191 - * Must have this definition in here for the proper 192 - * alignment of array_cache. Also simplifies accessing 193 - * the entries. 194 - */ 195 - }; 196 - 197 - struct alien_cache { 198 - spinlock_t lock; 199 - struct array_cache ac; 200 - }; 201 - 202 - /* 203 - * Need this for bootstrapping a per node allocator. 
204 - */ 205 - #define NUM_INIT_LISTS (2 * MAX_NUMNODES) 206 - static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; 207 - #define CACHE_CACHE 0 208 - #define SIZE_NODE (MAX_NUMNODES) 209 - 210 - static int drain_freelist(struct kmem_cache *cache, 211 - struct kmem_cache_node *n, int tofree); 212 - static void free_block(struct kmem_cache *cachep, void **objpp, int len, 213 - int node, struct list_head *list); 214 - static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); 215 - static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); 216 - static void cache_reap(struct work_struct *unused); 217 - 218 - static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, 219 - void **list); 220 - static inline void fixup_slab_list(struct kmem_cache *cachep, 221 - struct kmem_cache_node *n, struct slab *slab, 222 - void **list); 223 - 224 - #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) 225 - 226 - static void kmem_cache_node_init(struct kmem_cache_node *parent) 227 - { 228 - INIT_LIST_HEAD(&parent->slabs_full); 229 - INIT_LIST_HEAD(&parent->slabs_partial); 230 - INIT_LIST_HEAD(&parent->slabs_free); 231 - parent->total_slabs = 0; 232 - parent->free_slabs = 0; 233 - parent->shared = NULL; 234 - parent->alien = NULL; 235 - parent->colour_next = 0; 236 - raw_spin_lock_init(&parent->list_lock); 237 - parent->free_objects = 0; 238 - parent->free_touched = 0; 239 - } 240 - 241 - #define MAKE_LIST(cachep, listp, slab, nodeid) \ 242 - do { \ 243 - INIT_LIST_HEAD(listp); \ 244 - list_splice(&get_node(cachep, nodeid)->slab, listp); \ 245 - } while (0) 246 - 247 - #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 248 - do { \ 249 - MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ 250 - MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ 251 - MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 252 - } while (0) 253 - 254 - #define CFLGS_OBJFREELIST_SLAB 
((slab_flags_t __force)0x40000000U) 255 - #define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U) 256 - #define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) 257 - #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 258 - 259 - #define BATCHREFILL_LIMIT 16 260 - /* 261 - * Optimization question: fewer reaps means less probability for unnecessary 262 - * cpucache drain/refill cycles. 263 - * 264 - * OTOH the cpuarrays can contain lots of objects, 265 - * which could lock up otherwise freeable slabs. 266 - */ 267 - #define REAPTIMEOUT_AC (2*HZ) 268 - #define REAPTIMEOUT_NODE (4*HZ) 269 - 270 - #if STATS 271 - #define STATS_INC_ACTIVE(x) ((x)->num_active++) 272 - #define STATS_DEC_ACTIVE(x) ((x)->num_active--) 273 - #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 274 - #define STATS_INC_GROWN(x) ((x)->grown++) 275 - #define STATS_ADD_REAPED(x, y) ((x)->reaped += (y)) 276 - #define STATS_SET_HIGH(x) \ 277 - do { \ 278 - if ((x)->num_active > (x)->high_mark) \ 279 - (x)->high_mark = (x)->num_active; \ 280 - } while (0) 281 - #define STATS_INC_ERR(x) ((x)->errors++) 282 - #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 283 - #define STATS_INC_NODEFREES(x) ((x)->node_frees++) 284 - #define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) 285 - #define STATS_SET_FREEABLE(x, i) \ 286 - do { \ 287 - if ((x)->max_freeable < i) \ 288 - (x)->max_freeable = i; \ 289 - } while (0) 290 - #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 291 - #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 292 - #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 293 - #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) 294 - #else 295 - #define STATS_INC_ACTIVE(x) do { } while (0) 296 - #define STATS_DEC_ACTIVE(x) do { } while (0) 297 - #define STATS_INC_ALLOCED(x) do { } while (0) 298 - #define STATS_INC_GROWN(x) do { } while (0) 299 - #define STATS_ADD_REAPED(x, y) do { (void)(y); } while (0) 300 - #define STATS_SET_HIGH(x) do { } while (0) 301 - 
#define STATS_INC_ERR(x) do { } while (0) 302 - #define STATS_INC_NODEALLOCS(x) do { } while (0) 303 - #define STATS_INC_NODEFREES(x) do { } while (0) 304 - #define STATS_INC_ACOVERFLOW(x) do { } while (0) 305 - #define STATS_SET_FREEABLE(x, i) do { } while (0) 306 - #define STATS_INC_ALLOCHIT(x) do { } while (0) 307 - #define STATS_INC_ALLOCMISS(x) do { } while (0) 308 - #define STATS_INC_FREEHIT(x) do { } while (0) 309 - #define STATS_INC_FREEMISS(x) do { } while (0) 310 - #endif 311 - 312 - #if DEBUG 313 - 314 - /* 315 - * memory layout of objects: 316 - * 0 : objp 317 - * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that 318 - * the end of an object is aligned with the end of the real 319 - * allocation. Catches writes behind the end of the allocation. 320 - * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: 321 - * redzone word. 322 - * cachep->obj_offset: The real object. 323 - * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 324 - * cachep->size - 1* BYTES_PER_WORD: last caller address 325 - * [BYTES_PER_WORD long] 326 - */ 327 - static int obj_offset(struct kmem_cache *cachep) 328 - { 329 - return cachep->obj_offset; 330 - } 331 - 332 - static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 333 - { 334 - BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 335 - return (unsigned long long *) (objp + obj_offset(cachep) - 336 - sizeof(unsigned long long)); 337 - } 338 - 339 - static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) 340 - { 341 - BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 342 - if (cachep->flags & SLAB_STORE_USER) 343 - return (unsigned long long *)(objp + cachep->size - 344 - sizeof(unsigned long long) - 345 - REDZONE_ALIGN); 346 - return (unsigned long long *) (objp + cachep->size - 347 - sizeof(unsigned long long)); 348 - } 349 - 350 - static void **dbg_userword(struct kmem_cache *cachep, void *objp) 351 - { 352 - BUG_ON(!(cachep->flags & 
SLAB_STORE_USER)); 353 - return (void **)(objp + cachep->size - BYTES_PER_WORD); 354 - } 355 - 356 - #else 357 - 358 - #define obj_offset(x) 0 359 - #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 360 - #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 361 - #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 362 - 363 - #endif 364 - 365 - /* 366 - * Do not go above this order unless 0 objects fit into the slab or 367 - * overridden on the command line. 368 - */ 369 - #define SLAB_MAX_ORDER_HI 1 370 - #define SLAB_MAX_ORDER_LO 0 371 - static int slab_max_order = SLAB_MAX_ORDER_LO; 372 - static bool slab_max_order_set __initdata; 373 - 374 - static inline void *index_to_obj(struct kmem_cache *cache, 375 - const struct slab *slab, unsigned int idx) 376 - { 377 - return slab->s_mem + cache->size * idx; 378 - } 379 - 380 - #define BOOT_CPUCACHE_ENTRIES 1 381 - /* internal cache of cache description objs */ 382 - static struct kmem_cache kmem_cache_boot = { 383 - .batchcount = 1, 384 - .limit = BOOT_CPUCACHE_ENTRIES, 385 - .shared = 1, 386 - .size = sizeof(struct kmem_cache), 387 - .name = "kmem_cache", 388 - }; 389 - 390 - static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 391 - 392 - static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 393 - { 394 - return this_cpu_ptr(cachep->cpu_cache); 395 - } 396 - 397 - /* 398 - * Calculate the number of objects and left-over bytes for a given buffer size. 399 - */ 400 - static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, 401 - slab_flags_t flags, size_t *left_over) 402 - { 403 - unsigned int num; 404 - size_t slab_size = PAGE_SIZE << gfporder; 405 - 406 - /* 407 - * The slab management structure can be either off the slab or 408 - * on it. 
For the latter case, the memory allocated for a 409 - * slab is used for: 410 - * 411 - * - @buffer_size bytes for each object 412 - * - One freelist_idx_t for each object 413 - * 414 - * We don't need to consider alignment of freelist because 415 - * freelist will be at the end of slab page. The objects will be 416 - * at the correct alignment. 417 - * 418 - * If the slab management structure is off the slab, then the 419 - * alignment will already be calculated into the size. Because 420 - * the slabs are all pages aligned, the objects will be at the 421 - * correct alignment when allocated. 422 - */ 423 - if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { 424 - num = slab_size / buffer_size; 425 - *left_over = slab_size % buffer_size; 426 - } else { 427 - num = slab_size / (buffer_size + sizeof(freelist_idx_t)); 428 - *left_over = slab_size % 429 - (buffer_size + sizeof(freelist_idx_t)); 430 - } 431 - 432 - return num; 433 - } 434 - 435 - #if DEBUG 436 - #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) 437 - 438 - static void __slab_error(const char *function, struct kmem_cache *cachep, 439 - char *msg) 440 - { 441 - pr_err("slab error in %s(): cache `%s': %s\n", 442 - function, cachep->name, msg); 443 - dump_stack(); 444 - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 445 - } 446 - #endif 447 - 448 - /* 449 - * By default on NUMA we use alien caches to stage the freeing of 450 - * objects allocated from other nodes. 
This causes massive memory 451 - * inefficiencies when using fake NUMA setup to split memory into a 452 - * large number of small nodes, so it can be disabled on the command 453 - * line 454 - */ 455 - 456 - static int use_alien_caches __read_mostly = 1; 457 - static int __init noaliencache_setup(char *s) 458 - { 459 - use_alien_caches = 0; 460 - return 1; 461 - } 462 - __setup("noaliencache", noaliencache_setup); 463 - 464 - static int __init slab_max_order_setup(char *str) 465 - { 466 - get_option(&str, &slab_max_order); 467 - slab_max_order = slab_max_order < 0 ? 0 : 468 - min(slab_max_order, MAX_ORDER); 469 - slab_max_order_set = true; 470 - 471 - return 1; 472 - } 473 - __setup("slab_max_order=", slab_max_order_setup); 474 - 475 - #ifdef CONFIG_NUMA 476 - /* 477 - * Special reaping functions for NUMA systems called from cache_reap(). 478 - * These take care of doing round robin flushing of alien caches (containing 479 - * objects freed on different nodes from which they were allocated) and the 480 - * flushing of remote pcps by calling drain_node_pages. 481 - */ 482 - static DEFINE_PER_CPU(unsigned long, slab_reap_node); 483 - 484 - static void init_reap_node(int cpu) 485 - { 486 - per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu), 487 - node_online_map); 488 - } 489 - 490 - static void next_reap_node(void) 491 - { 492 - int node = __this_cpu_read(slab_reap_node); 493 - 494 - node = next_node_in(node, node_online_map); 495 - __this_cpu_write(slab_reap_node, node); 496 - } 497 - 498 - #else 499 - #define init_reap_node(cpu) do { } while (0) 500 - #define next_reap_node(void) do { } while (0) 501 - #endif 502 - 503 - /* 504 - * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz 505 - * via the workqueue/eventd. 506 - * Add the CPU number into the expiration time to minimize the possibility of 507 - * the CPUs getting into lockstep and contending for the global cache chain 508 - * lock. 
509 - */ 510 - static void start_cpu_timer(int cpu) 511 - { 512 - struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); 513 - 514 - if (reap_work->work.func == NULL) { 515 - init_reap_node(cpu); 516 - INIT_DEFERRABLE_WORK(reap_work, cache_reap); 517 - schedule_delayed_work_on(cpu, reap_work, 518 - __round_jiffies_relative(HZ, cpu)); 519 - } 520 - } 521 - 522 - static void init_arraycache(struct array_cache *ac, int limit, int batch) 523 - { 524 - if (ac) { 525 - ac->avail = 0; 526 - ac->limit = limit; 527 - ac->batchcount = batch; 528 - ac->touched = 0; 529 - } 530 - } 531 - 532 - static struct array_cache *alloc_arraycache(int node, int entries, 533 - int batchcount, gfp_t gfp) 534 - { 535 - size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); 536 - struct array_cache *ac = NULL; 537 - 538 - ac = kmalloc_node(memsize, gfp, node); 539 - /* 540 - * The array_cache structures contain pointers to free object. 541 - * However, when such objects are allocated or transferred to another 542 - * cache the pointers are not cleared and they could be counted as 543 - * valid references during a kmemleak scan. Therefore, kmemleak must 544 - * not scan such objects. 545 - */ 546 - kmemleak_no_scan(ac); 547 - init_arraycache(ac, entries, batchcount); 548 - return ac; 549 - } 550 - 551 - static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, 552 - struct slab *slab, void *objp) 553 - { 554 - struct kmem_cache_node *n; 555 - int slab_node; 556 - LIST_HEAD(list); 557 - 558 - slab_node = slab_nid(slab); 559 - n = get_node(cachep, slab_node); 560 - 561 - raw_spin_lock(&n->list_lock); 562 - free_block(cachep, &objp, 1, slab_node, &list); 563 - raw_spin_unlock(&n->list_lock); 564 - 565 - slabs_destroy(cachep, &list); 566 - } 567 - 568 - /* 569 - * Transfer objects in one arraycache to another. 570 - * Locking must be handled by the caller. 571 - * 572 - * Return the number of entries transferred. 
573 - */ 574 - static int transfer_objects(struct array_cache *to, 575 - struct array_cache *from, unsigned int max) 576 - { 577 - /* Figure out how many entries to transfer */ 578 - int nr = min3(from->avail, max, to->limit - to->avail); 579 - 580 - if (!nr) 581 - return 0; 582 - 583 - memcpy(to->entry + to->avail, from->entry + from->avail - nr, 584 - sizeof(void *) *nr); 585 - 586 - from->avail -= nr; 587 - to->avail += nr; 588 - return nr; 589 - } 590 - 591 - /* &alien->lock must be held by alien callers. */ 592 - static __always_inline void __free_one(struct array_cache *ac, void *objp) 593 - { 594 - /* Avoid trivial double-free. */ 595 - if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && 596 - WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp)) 597 - return; 598 - ac->entry[ac->avail++] = objp; 599 - } 600 - 601 - #ifndef CONFIG_NUMA 602 - 603 - #define drain_alien_cache(cachep, alien) do { } while (0) 604 - #define reap_alien(cachep, n) do { } while (0) 605 - 606 - static inline struct alien_cache **alloc_alien_cache(int node, 607 - int limit, gfp_t gfp) 608 - { 609 - return NULL; 610 - } 611 - 612 - static inline void free_alien_cache(struct alien_cache **ac_ptr) 613 - { 614 - } 615 - 616 - static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 617 - { 618 - return 0; 619 - } 620 - 621 - static inline gfp_t gfp_exact_node(gfp_t flags) 622 - { 623 - return flags & ~__GFP_NOFAIL; 624 - } 625 - 626 - #else /* CONFIG_NUMA */ 627 - 628 - static struct alien_cache *__alloc_alien_cache(int node, int entries, 629 - int batch, gfp_t gfp) 630 - { 631 - size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); 632 - struct alien_cache *alc = NULL; 633 - 634 - alc = kmalloc_node(memsize, gfp, node); 635 - if (alc) { 636 - kmemleak_no_scan(alc); 637 - init_arraycache(&alc->ac, entries, batch); 638 - spin_lock_init(&alc->lock); 639 - } 640 - return alc; 641 - } 642 - 643 - static struct alien_cache **alloc_alien_cache(int node, 
int limit, gfp_t gfp) 644 - { 645 - struct alien_cache **alc_ptr; 646 - int i; 647 - 648 - if (limit > 1) 649 - limit = 12; 650 - alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node); 651 - if (!alc_ptr) 652 - return NULL; 653 - 654 - for_each_node(i) { 655 - if (i == node || !node_online(i)) 656 - continue; 657 - alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); 658 - if (!alc_ptr[i]) { 659 - for (i--; i >= 0; i--) 660 - kfree(alc_ptr[i]); 661 - kfree(alc_ptr); 662 - return NULL; 663 - } 664 - } 665 - return alc_ptr; 666 - } 667 - 668 - static void free_alien_cache(struct alien_cache **alc_ptr) 669 - { 670 - int i; 671 - 672 - if (!alc_ptr) 673 - return; 674 - for_each_node(i) 675 - kfree(alc_ptr[i]); 676 - kfree(alc_ptr); 677 - } 678 - 679 - static void __drain_alien_cache(struct kmem_cache *cachep, 680 - struct array_cache *ac, int node, 681 - struct list_head *list) 682 - { 683 - struct kmem_cache_node *n = get_node(cachep, node); 684 - 685 - if (ac->avail) { 686 - raw_spin_lock(&n->list_lock); 687 - /* 688 - * Stuff objects into the remote nodes shared array first. 689 - * That way we could avoid the overhead of putting the objects 690 - * into the free lists and getting them back later. 691 - */ 692 - if (n->shared) 693 - transfer_objects(n->shared, ac, ac->limit); 694 - 695 - free_block(cachep, ac->entry, ac->avail, node, list); 696 - ac->avail = 0; 697 - raw_spin_unlock(&n->list_lock); 698 - } 699 - } 700 - 701 - /* 702 - * Called from cache_reap() to regularly drain alien caches round robin. 
703 - */ 704 - static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) 705 - { 706 - int node = __this_cpu_read(slab_reap_node); 707 - 708 - if (n->alien) { 709 - struct alien_cache *alc = n->alien[node]; 710 - struct array_cache *ac; 711 - 712 - if (alc) { 713 - ac = &alc->ac; 714 - if (ac->avail && spin_trylock_irq(&alc->lock)) { 715 - LIST_HEAD(list); 716 - 717 - __drain_alien_cache(cachep, ac, node, &list); 718 - spin_unlock_irq(&alc->lock); 719 - slabs_destroy(cachep, &list); 720 - } 721 - } 722 - } 723 - } 724 - 725 - static void drain_alien_cache(struct kmem_cache *cachep, 726 - struct alien_cache **alien) 727 - { 728 - int i = 0; 729 - struct alien_cache *alc; 730 - struct array_cache *ac; 731 - unsigned long flags; 732 - 733 - for_each_online_node(i) { 734 - alc = alien[i]; 735 - if (alc) { 736 - LIST_HEAD(list); 737 - 738 - ac = &alc->ac; 739 - spin_lock_irqsave(&alc->lock, flags); 740 - __drain_alien_cache(cachep, ac, i, &list); 741 - spin_unlock_irqrestore(&alc->lock, flags); 742 - slabs_destroy(cachep, &list); 743 - } 744 - } 745 - } 746 - 747 - static int __cache_free_alien(struct kmem_cache *cachep, void *objp, 748 - int node, int slab_node) 749 - { 750 - struct kmem_cache_node *n; 751 - struct alien_cache *alien = NULL; 752 - struct array_cache *ac; 753 - LIST_HEAD(list); 754 - 755 - n = get_node(cachep, node); 756 - STATS_INC_NODEFREES(cachep); 757 - if (n->alien && n->alien[slab_node]) { 758 - alien = n->alien[slab_node]; 759 - ac = &alien->ac; 760 - spin_lock(&alien->lock); 761 - if (unlikely(ac->avail == ac->limit)) { 762 - STATS_INC_ACOVERFLOW(cachep); 763 - __drain_alien_cache(cachep, ac, slab_node, &list); 764 - } 765 - __free_one(ac, objp); 766 - spin_unlock(&alien->lock); 767 - slabs_destroy(cachep, &list); 768 - } else { 769 - n = get_node(cachep, slab_node); 770 - raw_spin_lock(&n->list_lock); 771 - free_block(cachep, &objp, 1, slab_node, &list); 772 - raw_spin_unlock(&n->list_lock); 773 - slabs_destroy(cachep, 
&list); 774 - } 775 - return 1; 776 - } 777 - 778 - static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 779 - { 780 - int slab_node = slab_nid(virt_to_slab(objp)); 781 - int node = numa_mem_id(); 782 - /* 783 - * Make sure we are not freeing an object from another node to the array 784 - * cache on this cpu. 785 - */ 786 - if (likely(node == slab_node)) 787 - return 0; 788 - 789 - return __cache_free_alien(cachep, objp, node, slab_node); 790 - } 791 - 792 - /* 793 - * Construct gfp mask to allocate from a specific node but do not reclaim or 794 - * warn about failures. 795 - */ 796 - static inline gfp_t gfp_exact_node(gfp_t flags) 797 - { 798 - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL); 799 - } 800 - #endif 801 - 802 - static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) 803 - { 804 - struct kmem_cache_node *n; 805 - 806 - /* 807 - * Set up the kmem_cache_node for cpu before we can 808 - * begin anything. Make sure some other cpu on this 809 - * node has not already allocated this 810 - */ 811 - n = get_node(cachep, node); 812 - if (n) { 813 - raw_spin_lock_irq(&n->list_lock); 814 - n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + 815 - cachep->num; 816 - raw_spin_unlock_irq(&n->list_lock); 817 - 818 - return 0; 819 - } 820 - 821 - n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); 822 - if (!n) 823 - return -ENOMEM; 824 - 825 - kmem_cache_node_init(n); 826 - n->next_reap = jiffies + REAPTIMEOUT_NODE + 827 - ((unsigned long)cachep) % REAPTIMEOUT_NODE; 828 - 829 - n->free_limit = 830 - (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; 831 - 832 - /* 833 - * The kmem_cache_nodes don't come and go as CPUs 834 - * come and go. slab_mutex provides sufficient 835 - * protection here. 
836 - */ 837 - cachep->node[node] = n; 838 - 839 - return 0; 840 - } 841 - 842 - #if defined(CONFIG_NUMA) || defined(CONFIG_SMP) 843 - /* 844 - * Allocates and initializes node for a node on each slab cache, used for 845 - * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node 846 - * will be allocated off-node since memory is not yet online for the new node. 847 - * When hotplugging memory or a cpu, existing nodes are not replaced if 848 - * already in use. 849 - * 850 - * Must hold slab_mutex. 851 - */ 852 - static int init_cache_node_node(int node) 853 - { 854 - int ret; 855 - struct kmem_cache *cachep; 856 - 857 - list_for_each_entry(cachep, &slab_caches, list) { 858 - ret = init_cache_node(cachep, node, GFP_KERNEL); 859 - if (ret) 860 - return ret; 861 - } 862 - 863 - return 0; 864 - } 865 - #endif 866 - 867 - static int setup_kmem_cache_node(struct kmem_cache *cachep, 868 - int node, gfp_t gfp, bool force_change) 869 - { 870 - int ret = -ENOMEM; 871 - struct kmem_cache_node *n; 872 - struct array_cache *old_shared = NULL; 873 - struct array_cache *new_shared = NULL; 874 - struct alien_cache **new_alien = NULL; 875 - LIST_HEAD(list); 876 - 877 - if (use_alien_caches) { 878 - new_alien = alloc_alien_cache(node, cachep->limit, gfp); 879 - if (!new_alien) 880 - goto fail; 881 - } 882 - 883 - if (cachep->shared) { 884 - new_shared = alloc_arraycache(node, 885 - cachep->shared * cachep->batchcount, 0xbaadf00d, gfp); 886 - if (!new_shared) 887 - goto fail; 888 - } 889 - 890 - ret = init_cache_node(cachep, node, gfp); 891 - if (ret) 892 - goto fail; 893 - 894 - n = get_node(cachep, node); 895 - raw_spin_lock_irq(&n->list_lock); 896 - if (n->shared && force_change) { 897 - free_block(cachep, n->shared->entry, 898 - n->shared->avail, node, &list); 899 - n->shared->avail = 0; 900 - } 901 - 902 - if (!n->shared || force_change) { 903 - old_shared = n->shared; 904 - n->shared = new_shared; 905 - new_shared = NULL; 906 - } 907 - 908 - if 
(!n->alien) { 909 - n->alien = new_alien; 910 - new_alien = NULL; 911 - } 912 - 913 - raw_spin_unlock_irq(&n->list_lock); 914 - slabs_destroy(cachep, &list); 915 - 916 - /* 917 - * To protect lockless access to n->shared during irq disabled context. 918 - * If n->shared isn't NULL in irq disabled context, accessing to it is 919 - * guaranteed to be valid until irq is re-enabled, because it will be 920 - * freed after synchronize_rcu(). 921 - */ 922 - if (old_shared && force_change) 923 - synchronize_rcu(); 924 - 925 - fail: 926 - kfree(old_shared); 927 - kfree(new_shared); 928 - free_alien_cache(new_alien); 929 - 930 - return ret; 931 - } 932 - 933 - #ifdef CONFIG_SMP 934 - 935 - static void cpuup_canceled(long cpu) 936 - { 937 - struct kmem_cache *cachep; 938 - struct kmem_cache_node *n = NULL; 939 - int node = cpu_to_mem(cpu); 940 - const struct cpumask *mask = cpumask_of_node(node); 941 - 942 - list_for_each_entry(cachep, &slab_caches, list) { 943 - struct array_cache *nc; 944 - struct array_cache *shared; 945 - struct alien_cache **alien; 946 - LIST_HEAD(list); 947 - 948 - n = get_node(cachep, node); 949 - if (!n) 950 - continue; 951 - 952 - raw_spin_lock_irq(&n->list_lock); 953 - 954 - /* Free limit for this kmem_cache_node */ 955 - n->free_limit -= cachep->batchcount; 956 - 957 - /* cpu is dead; no one can alloc from it. 
*/ 958 - nc = per_cpu_ptr(cachep->cpu_cache, cpu); 959 - free_block(cachep, nc->entry, nc->avail, node, &list); 960 - nc->avail = 0; 961 - 962 - if (!cpumask_empty(mask)) { 963 - raw_spin_unlock_irq(&n->list_lock); 964 - goto free_slab; 965 - } 966 - 967 - shared = n->shared; 968 - if (shared) { 969 - free_block(cachep, shared->entry, 970 - shared->avail, node, &list); 971 - n->shared = NULL; 972 - } 973 - 974 - alien = n->alien; 975 - n->alien = NULL; 976 - 977 - raw_spin_unlock_irq(&n->list_lock); 978 - 979 - kfree(shared); 980 - if (alien) { 981 - drain_alien_cache(cachep, alien); 982 - free_alien_cache(alien); 983 - } 984 - 985 - free_slab: 986 - slabs_destroy(cachep, &list); 987 - } 988 - /* 989 - * In the previous loop, all the objects were freed to 990 - * the respective cache's slabs, now we can go ahead and 991 - * shrink each nodelist to its limit. 992 - */ 993 - list_for_each_entry(cachep, &slab_caches, list) { 994 - n = get_node(cachep, node); 995 - if (!n) 996 - continue; 997 - drain_freelist(cachep, n, INT_MAX); 998 - } 999 - } 1000 - 1001 - static int cpuup_prepare(long cpu) 1002 - { 1003 - struct kmem_cache *cachep; 1004 - int node = cpu_to_mem(cpu); 1005 - int err; 1006 - 1007 - /* 1008 - * We need to do this right in the beginning since 1009 - * alloc_arraycache's are going to use this list. 
1010 - * kmalloc_node allows us to add the slab to the right 1011 - * kmem_cache_node and not this cpu's kmem_cache_node 1012 - */ 1013 - err = init_cache_node_node(node); 1014 - if (err < 0) 1015 - goto bad; 1016 - 1017 - /* 1018 - * Now we can go ahead with allocating the shared arrays and 1019 - * array caches 1020 - */ 1021 - list_for_each_entry(cachep, &slab_caches, list) { 1022 - err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false); 1023 - if (err) 1024 - goto bad; 1025 - } 1026 - 1027 - return 0; 1028 - bad: 1029 - cpuup_canceled(cpu); 1030 - return -ENOMEM; 1031 - } 1032 - 1033 - int slab_prepare_cpu(unsigned int cpu) 1034 - { 1035 - int err; 1036 - 1037 - mutex_lock(&slab_mutex); 1038 - err = cpuup_prepare(cpu); 1039 - mutex_unlock(&slab_mutex); 1040 - return err; 1041 - } 1042 - 1043 - /* 1044 - * This is called for a failed online attempt and for a successful 1045 - * offline. 1046 - * 1047 - * Even if all the cpus of a node are down, we don't free the 1048 - * kmem_cache_node of any cache. This is to avoid a race between cpu_down, and 1049 - * a kmalloc allocation from another cpu for memory from the node of 1050 - * the cpu going down. The kmem_cache_node structure is usually allocated from 1051 - * kmem_cache_create() and gets destroyed at kmem_cache_destroy(). 1052 - */ 1053 - int slab_dead_cpu(unsigned int cpu) 1054 - { 1055 - mutex_lock(&slab_mutex); 1056 - cpuup_canceled(cpu); 1057 - mutex_unlock(&slab_mutex); 1058 - return 0; 1059 - } 1060 - #endif 1061 - 1062 - static int slab_online_cpu(unsigned int cpu) 1063 - { 1064 - start_cpu_timer(cpu); 1065 - return 0; 1066 - } 1067 - 1068 - static int slab_offline_cpu(unsigned int cpu) 1069 - { 1070 - /* 1071 - * Shutdown cache reaper. Note that the slab_mutex is held so 1072 - * that if cache_reap() is invoked it cannot do anything 1073 - * expensive but will only modify reap_work and reschedule the 1074 - * timer. 
1075 - */ 1076 - cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); 1077 - /* Now the cache_reaper is guaranteed to be not running. */ 1078 - per_cpu(slab_reap_work, cpu).work.func = NULL; 1079 - return 0; 1080 - } 1081 - 1082 - #if defined(CONFIG_NUMA) 1083 - /* 1084 - * Drains freelist for a node on each slab cache, used for memory hot-remove. 1085 - * Returns -EBUSY if all objects cannot be drained so that the node is not 1086 - * removed. 1087 - * 1088 - * Must hold slab_mutex. 1089 - */ 1090 - static int __meminit drain_cache_node_node(int node) 1091 - { 1092 - struct kmem_cache *cachep; 1093 - int ret = 0; 1094 - 1095 - list_for_each_entry(cachep, &slab_caches, list) { 1096 - struct kmem_cache_node *n; 1097 - 1098 - n = get_node(cachep, node); 1099 - if (!n) 1100 - continue; 1101 - 1102 - drain_freelist(cachep, n, INT_MAX); 1103 - 1104 - if (!list_empty(&n->slabs_full) || 1105 - !list_empty(&n->slabs_partial)) { 1106 - ret = -EBUSY; 1107 - break; 1108 - } 1109 - } 1110 - return ret; 1111 - } 1112 - 1113 - static int __meminit slab_memory_callback(struct notifier_block *self, 1114 - unsigned long action, void *arg) 1115 - { 1116 - struct memory_notify *mnb = arg; 1117 - int ret = 0; 1118 - int nid; 1119 - 1120 - nid = mnb->status_change_nid; 1121 - if (nid < 0) 1122 - goto out; 1123 - 1124 - switch (action) { 1125 - case MEM_GOING_ONLINE: 1126 - mutex_lock(&slab_mutex); 1127 - ret = init_cache_node_node(nid); 1128 - mutex_unlock(&slab_mutex); 1129 - break; 1130 - case MEM_GOING_OFFLINE: 1131 - mutex_lock(&slab_mutex); 1132 - ret = drain_cache_node_node(nid); 1133 - mutex_unlock(&slab_mutex); 1134 - break; 1135 - case MEM_ONLINE: 1136 - case MEM_OFFLINE: 1137 - case MEM_CANCEL_ONLINE: 1138 - case MEM_CANCEL_OFFLINE: 1139 - break; 1140 - } 1141 - out: 1142 - return notifier_from_errno(ret); 1143 - } 1144 - #endif /* CONFIG_NUMA */ 1145 - 1146 - /* 1147 - * swap the static kmem_cache_node with kmalloced memory 1148 - */ 1149 - static void __init 
init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, 1150 - int nodeid) 1151 - { 1152 - struct kmem_cache_node *ptr; 1153 - 1154 - ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); 1155 - BUG_ON(!ptr); 1156 - 1157 - memcpy(ptr, list, sizeof(struct kmem_cache_node)); 1158 - /* 1159 - * Do not assume that spinlocks can be initialized via memcpy: 1160 - */ 1161 - raw_spin_lock_init(&ptr->list_lock); 1162 - 1163 - MAKE_ALL_LISTS(cachep, ptr, nodeid); 1164 - cachep->node[nodeid] = ptr; 1165 - } 1166 - 1167 - /* 1168 - * For setting up all the kmem_cache_node for cache whose buffer_size is same as 1169 - * size of kmem_cache_node. 1170 - */ 1171 - static void __init set_up_node(struct kmem_cache *cachep, int index) 1172 - { 1173 - int node; 1174 - 1175 - for_each_online_node(node) { 1176 - cachep->node[node] = &init_kmem_cache_node[index + node]; 1177 - cachep->node[node]->next_reap = jiffies + 1178 - REAPTIMEOUT_NODE + 1179 - ((unsigned long)cachep) % REAPTIMEOUT_NODE; 1180 - } 1181 - } 1182 - 1183 - /* 1184 - * Initialisation. Called after the page allocator have been initialised and 1185 - * before smp_init(). 1186 - */ 1187 - void __init kmem_cache_init(void) 1188 - { 1189 - int i; 1190 - 1191 - kmem_cache = &kmem_cache_boot; 1192 - 1193 - if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1) 1194 - use_alien_caches = 0; 1195 - 1196 - for (i = 0; i < NUM_INIT_LISTS; i++) 1197 - kmem_cache_node_init(&init_kmem_cache_node[i]); 1198 - 1199 - /* 1200 - * Fragmentation resistance on low memory - only use bigger 1201 - * page orders on machines with more than 32MB of memory if 1202 - * not overridden on the command line. 
1203 - */ 1204 - if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT) 1205 - slab_max_order = SLAB_MAX_ORDER_HI; 1206 - 1207 - /* Bootstrap is tricky, because several objects are allocated 1208 - * from caches that do not exist yet: 1209 - * 1) initialize the kmem_cache cache: it contains the struct 1210 - * kmem_cache structures of all caches, except kmem_cache itself: 1211 - * kmem_cache is statically allocated. 1212 - * Initially an __init data area is used for the head array and the 1213 - * kmem_cache_node structures, it's replaced with a kmalloc allocated 1214 - * array at the end of the bootstrap. 1215 - * 2) Create the first kmalloc cache. 1216 - * The struct kmem_cache for the new cache is allocated normally. 1217 - * An __init data area is used for the head array. 1218 - * 3) Create the remaining kmalloc caches, with minimally sized 1219 - * head arrays. 1220 - * 4) Replace the __init data head arrays for kmem_cache and the first 1221 - * kmalloc cache with kmalloc allocated arrays. 1222 - * 5) Replace the __init data for kmem_cache_node for kmem_cache and 1223 - * the other cache's with kmalloc allocated memory. 1224 - * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1225 - */ 1226 - 1227 - /* 1) create the kmem_cache */ 1228 - 1229 - /* 1230 - * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1231 - */ 1232 - create_boot_cache(kmem_cache, "kmem_cache", 1233 - offsetof(struct kmem_cache, node) + 1234 - nr_node_ids * sizeof(struct kmem_cache_node *), 1235 - SLAB_HWCACHE_ALIGN, 0, 0); 1236 - list_add(&kmem_cache->list, &slab_caches); 1237 - slab_state = PARTIAL; 1238 - 1239 - /* 1240 - * Initialize the caches that provide memory for the kmem_cache_node 1241 - * structures first. Without this, further allocations will bug. 
1242 - */ 1243 - new_kmalloc_cache(INDEX_NODE, KMALLOC_NORMAL, ARCH_KMALLOC_FLAGS); 1244 - slab_state = PARTIAL_NODE; 1245 - setup_kmalloc_cache_index_table(); 1246 - 1247 - /* 5) Replace the bootstrap kmem_cache_node */ 1248 - { 1249 - int nid; 1250 - 1251 - for_each_online_node(nid) { 1252 - init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); 1253 - 1254 - init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE], 1255 - &init_kmem_cache_node[SIZE_NODE + nid], nid); 1256 - } 1257 - } 1258 - 1259 - create_kmalloc_caches(ARCH_KMALLOC_FLAGS); 1260 - } 1261 - 1262 - void __init kmem_cache_init_late(void) 1263 - { 1264 - struct kmem_cache *cachep; 1265 - 1266 - /* 6) resize the head arrays to their final sizes */ 1267 - mutex_lock(&slab_mutex); 1268 - list_for_each_entry(cachep, &slab_caches, list) 1269 - if (enable_cpucache(cachep, GFP_NOWAIT)) 1270 - BUG(); 1271 - mutex_unlock(&slab_mutex); 1272 - 1273 - /* Done! */ 1274 - slab_state = FULL; 1275 - 1276 - #ifdef CONFIG_NUMA 1277 - /* 1278 - * Register a memory hotplug callback that initializes and frees 1279 - * node. 1280 - */ 1281 - hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 1282 - #endif 1283 - 1284 - /* 1285 - * The reap timers are started later, with a module init call: That part 1286 - * of the kernel is not yet operational. 
1287 - */ 1288 - } 1289 - 1290 - static int __init cpucache_init(void) 1291 - { 1292 - int ret; 1293 - 1294 - /* 1295 - * Register the timers that return unneeded pages to the page allocator 1296 - */ 1297 - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SLAB online", 1298 - slab_online_cpu, slab_offline_cpu); 1299 - WARN_ON(ret < 0); 1300 - 1301 - return 0; 1302 - } 1303 - __initcall(cpucache_init); 1304 - 1305 - static noinline void 1306 - slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) 1307 - { 1308 - #if DEBUG 1309 - struct kmem_cache_node *n; 1310 - unsigned long flags; 1311 - int node; 1312 - static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, 1313 - DEFAULT_RATELIMIT_BURST); 1314 - 1315 - if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) 1316 - return; 1317 - 1318 - pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", 1319 - nodeid, gfpflags, &gfpflags); 1320 - pr_warn(" cache: %s, object size: %d, order: %d\n", 1321 - cachep->name, cachep->size, cachep->gfporder); 1322 - 1323 - for_each_kmem_cache_node(cachep, node, n) { 1324 - unsigned long total_slabs, free_slabs, free_objs; 1325 - 1326 - raw_spin_lock_irqsave(&n->list_lock, flags); 1327 - total_slabs = n->total_slabs; 1328 - free_slabs = n->free_slabs; 1329 - free_objs = n->free_objects; 1330 - raw_spin_unlock_irqrestore(&n->list_lock, flags); 1331 - 1332 - pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", 1333 - node, total_slabs - free_slabs, total_slabs, 1334 - (total_slabs * cachep->num) - free_objs, 1335 - total_slabs * cachep->num); 1336 - } 1337 - #endif 1338 - } 1339 - 1340 - /* 1341 - * Interface to system's page allocator. No need to hold the 1342 - * kmem_cache_node ->list_lock. 1343 - * 1344 - * If we requested dmaable memory, we will get it. Even if we 1345 - * did not request dmaable memory, we might get it, but that 1346 - * would be relatively rare and ignorable. 
1347 - */ 1348 - static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, 1349 - int nodeid) 1350 - { 1351 - struct folio *folio; 1352 - struct slab *slab; 1353 - 1354 - flags |= cachep->allocflags; 1355 - 1356 - folio = (struct folio *) __alloc_pages_node(nodeid, flags, cachep->gfporder); 1357 - if (!folio) { 1358 - slab_out_of_memory(cachep, flags, nodeid); 1359 - return NULL; 1360 - } 1361 - 1362 - slab = folio_slab(folio); 1363 - 1364 - account_slab(slab, cachep->gfporder, cachep, flags); 1365 - __folio_set_slab(folio); 1366 - /* Make the flag visible before any changes to folio->mapping */ 1367 - smp_wmb(); 1368 - /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ 1369 - if (sk_memalloc_socks() && folio_is_pfmemalloc(folio)) 1370 - slab_set_pfmemalloc(slab); 1371 - 1372 - return slab; 1373 - } 1374 - 1375 - /* 1376 - * Interface to system's page release. 1377 - */ 1378 - static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab) 1379 - { 1380 - int order = cachep->gfporder; 1381 - struct folio *folio = slab_folio(slab); 1382 - 1383 - BUG_ON(!folio_test_slab(folio)); 1384 - __slab_clear_pfmemalloc(slab); 1385 - page_mapcount_reset(&folio->page); 1386 - folio->mapping = NULL; 1387 - /* Make the mapping reset visible before clearing the flag */ 1388 - smp_wmb(); 1389 - __folio_clear_slab(folio); 1390 - 1391 - mm_account_reclaimed_pages(1 << order); 1392 - unaccount_slab(slab, order, cachep); 1393 - __free_pages(&folio->page, order); 1394 - } 1395 - 1396 - static void kmem_rcu_free(struct rcu_head *head) 1397 - { 1398 - struct kmem_cache *cachep; 1399 - struct slab *slab; 1400 - 1401 - slab = container_of(head, struct slab, rcu_head); 1402 - cachep = slab->slab_cache; 1403 - 1404 - kmem_freepages(cachep, slab); 1405 - } 1406 - 1407 - #if DEBUG 1408 - static inline bool is_debug_pagealloc_cache(struct kmem_cache *cachep) 1409 - { 1410 - return debug_pagealloc_enabled_static() && OFF_SLAB(cachep) && 1411 - 
((cachep->size % PAGE_SIZE) == 0); 1412 - } 1413 - 1414 - #ifdef CONFIG_DEBUG_PAGEALLOC 1415 - static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map) 1416 - { 1417 - if (!is_debug_pagealloc_cache(cachep)) 1418 - return; 1419 - 1420 - __kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); 1421 - } 1422 - 1423 - #else 1424 - static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, 1425 - int map) {} 1426 - 1427 - #endif 1428 - 1429 - static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 1430 - { 1431 - int size = cachep->object_size; 1432 - addr = &((char *)addr)[obj_offset(cachep)]; 1433 - 1434 - memset(addr, val, size); 1435 - *(unsigned char *)(addr + size - 1) = POISON_END; 1436 - } 1437 - 1438 - static void dump_line(char *data, int offset, int limit) 1439 - { 1440 - int i; 1441 - unsigned char error = 0; 1442 - int bad_count = 0; 1443 - 1444 - pr_err("%03x: ", offset); 1445 - for (i = 0; i < limit; i++) { 1446 - if (data[offset + i] != POISON_FREE) { 1447 - error = data[offset + i]; 1448 - bad_count++; 1449 - } 1450 - } 1451 - print_hex_dump(KERN_CONT, "", 0, 16, 1, 1452 - &data[offset], limit, 1); 1453 - 1454 - if (bad_count == 1) { 1455 - error ^= POISON_FREE; 1456 - if (!(error & (error - 1))) { 1457 - pr_err("Single bit error detected. 
Probably bad RAM.\n"); 1458 - #ifdef CONFIG_X86 1459 - pr_err("Run memtest86+ or a similar memory test tool.\n"); 1460 - #else 1461 - pr_err("Run a memory test tool.\n"); 1462 - #endif 1463 - } 1464 - } 1465 - } 1466 - #endif 1467 - 1468 - #if DEBUG 1469 - 1470 - static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) 1471 - { 1472 - int i, size; 1473 - char *realobj; 1474 - 1475 - if (cachep->flags & SLAB_RED_ZONE) { 1476 - pr_err("Redzone: 0x%llx/0x%llx\n", 1477 - *dbg_redzone1(cachep, objp), 1478 - *dbg_redzone2(cachep, objp)); 1479 - } 1480 - 1481 - if (cachep->flags & SLAB_STORE_USER) 1482 - pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp)); 1483 - realobj = (char *)objp + obj_offset(cachep); 1484 - size = cachep->object_size; 1485 - for (i = 0; i < size && lines; i += 16, lines--) { 1486 - int limit; 1487 - limit = 16; 1488 - if (i + limit > size) 1489 - limit = size - i; 1490 - dump_line(realobj, i, limit); 1491 - } 1492 - } 1493 - 1494 - static void check_poison_obj(struct kmem_cache *cachep, void *objp) 1495 - { 1496 - char *realobj; 1497 - int size, i; 1498 - int lines = 0; 1499 - 1500 - if (is_debug_pagealloc_cache(cachep)) 1501 - return; 1502 - 1503 - realobj = (char *)objp + obj_offset(cachep); 1504 - size = cachep->object_size; 1505 - 1506 - for (i = 0; i < size; i++) { 1507 - char exp = POISON_FREE; 1508 - if (i == size - 1) 1509 - exp = POISON_END; 1510 - if (realobj[i] != exp) { 1511 - int limit; 1512 - /* Mismatch ! 
*/ 1513 - /* Print header */ 1514 - if (lines == 0) { 1515 - pr_err("Slab corruption (%s): %s start=%px, len=%d\n", 1516 - print_tainted(), cachep->name, 1517 - realobj, size); 1518 - print_objinfo(cachep, objp, 0); 1519 - } 1520 - /* Hexdump the affected line */ 1521 - i = (i / 16) * 16; 1522 - limit = 16; 1523 - if (i + limit > size) 1524 - limit = size - i; 1525 - dump_line(realobj, i, limit); 1526 - i += 16; 1527 - lines++; 1528 - /* Limit to 5 lines */ 1529 - if (lines > 5) 1530 - break; 1531 - } 1532 - } 1533 - if (lines != 0) { 1534 - /* Print some data about the neighboring objects, if they 1535 - * exist: 1536 - */ 1537 - struct slab *slab = virt_to_slab(objp); 1538 - unsigned int objnr; 1539 - 1540 - objnr = obj_to_index(cachep, slab, objp); 1541 - if (objnr) { 1542 - objp = index_to_obj(cachep, slab, objnr - 1); 1543 - realobj = (char *)objp + obj_offset(cachep); 1544 - pr_err("Prev obj: start=%px, len=%d\n", realobj, size); 1545 - print_objinfo(cachep, objp, 2); 1546 - } 1547 - if (objnr + 1 < cachep->num) { 1548 - objp = index_to_obj(cachep, slab, objnr + 1); 1549 - realobj = (char *)objp + obj_offset(cachep); 1550 - pr_err("Next obj: start=%px, len=%d\n", realobj, size); 1551 - print_objinfo(cachep, objp, 2); 1552 - } 1553 - } 1554 - } 1555 - #endif 1556 - 1557 - #if DEBUG 1558 - static void slab_destroy_debugcheck(struct kmem_cache *cachep, 1559 - struct slab *slab) 1560 - { 1561 - int i; 1562 - 1563 - if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) { 1564 - poison_obj(cachep, slab->freelist - obj_offset(cachep), 1565 - POISON_FREE); 1566 - } 1567 - 1568 - for (i = 0; i < cachep->num; i++) { 1569 - void *objp = index_to_obj(cachep, slab, i); 1570 - 1571 - if (cachep->flags & SLAB_POISON) { 1572 - check_poison_obj(cachep, objp); 1573 - slab_kernel_map(cachep, objp, 1); 1574 - } 1575 - if (cachep->flags & SLAB_RED_ZONE) { 1576 - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1577 - slab_error(cachep, "start of a freed object was 
overwritten"); 1578 - if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1579 - slab_error(cachep, "end of a freed object was overwritten"); 1580 - } 1581 - } 1582 - } 1583 - #else 1584 - static void slab_destroy_debugcheck(struct kmem_cache *cachep, 1585 - struct slab *slab) 1586 - { 1587 - } 1588 - #endif 1589 - 1590 - /** 1591 - * slab_destroy - destroy and release all objects in a slab 1592 - * @cachep: cache pointer being destroyed 1593 - * @slab: slab being destroyed 1594 - * 1595 - * Destroy all the objs in a slab, and release the mem back to the system. 1596 - * Before calling the slab must have been unlinked from the cache. The 1597 - * kmem_cache_node ->list_lock is not held/needed. 1598 - */ 1599 - static void slab_destroy(struct kmem_cache *cachep, struct slab *slab) 1600 - { 1601 - void *freelist; 1602 - 1603 - freelist = slab->freelist; 1604 - slab_destroy_debugcheck(cachep, slab); 1605 - if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU)) 1606 - call_rcu(&slab->rcu_head, kmem_rcu_free); 1607 - else 1608 - kmem_freepages(cachep, slab); 1609 - 1610 - /* 1611 - * From now on, we don't use freelist 1612 - * although actual page can be freed in rcu context 1613 - */ 1614 - if (OFF_SLAB(cachep)) 1615 - kfree(freelist); 1616 - } 1617 - 1618 - /* 1619 - * Update the size of the caches before calling slabs_destroy as it may 1620 - * recursively call kfree. 1621 - */ 1622 - static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) 1623 - { 1624 - struct slab *slab, *n; 1625 - 1626 - list_for_each_entry_safe(slab, n, list, slab_list) { 1627 - list_del(&slab->slab_list); 1628 - slab_destroy(cachep, slab); 1629 - } 1630 - } 1631 - 1632 - /** 1633 - * calculate_slab_order - calculate size (page order) of slabs 1634 - * @cachep: pointer to the cache that is being created 1635 - * @size: size of objects to be created in this cache. 1636 - * @flags: slab allocation flags 1637 - * 1638 - * Also calculates the number of objects per slab. 
1639 - * 1640 - * This could be made much more intelligent. For now, try to avoid using 1641 - * high order pages for slabs. When the gfp() functions are more friendly 1642 - * towards high-order requests, this should be changed. 1643 - * 1644 - * Return: number of left-over bytes in a slab 1645 - */ 1646 - static size_t calculate_slab_order(struct kmem_cache *cachep, 1647 - size_t size, slab_flags_t flags) 1648 - { 1649 - size_t left_over = 0; 1650 - int gfporder; 1651 - 1652 - for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { 1653 - unsigned int num; 1654 - size_t remainder; 1655 - 1656 - num = cache_estimate(gfporder, size, flags, &remainder); 1657 - if (!num) 1658 - continue; 1659 - 1660 - /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ 1661 - if (num > SLAB_OBJ_MAX_NUM) 1662 - break; 1663 - 1664 - if (flags & CFLGS_OFF_SLAB) { 1665 - struct kmem_cache *freelist_cache; 1666 - size_t freelist_size; 1667 - size_t freelist_cache_size; 1668 - 1669 - freelist_size = num * sizeof(freelist_idx_t); 1670 - if (freelist_size > KMALLOC_MAX_CACHE_SIZE) { 1671 - freelist_cache_size = PAGE_SIZE << get_order(freelist_size); 1672 - } else { 1673 - freelist_cache = kmalloc_slab(freelist_size, 0u, _RET_IP_); 1674 - if (!freelist_cache) 1675 - continue; 1676 - freelist_cache_size = freelist_cache->size; 1677 - 1678 - /* 1679 - * Needed to avoid possible looping condition 1680 - * in cache_grow_begin() 1681 - */ 1682 - if (OFF_SLAB(freelist_cache)) 1683 - continue; 1684 - } 1685 - 1686 - /* check if off slab has enough benefit */ 1687 - if (freelist_cache_size > cachep->size / 2) 1688 - continue; 1689 - } 1690 - 1691 - /* Found something acceptable - save it away */ 1692 - cachep->num = num; 1693 - cachep->gfporder = gfporder; 1694 - left_over = remainder; 1695 - 1696 - /* 1697 - * A VFS-reclaimable slab tends to have most allocations 1698 - * as GFP_NOFS and we really don't want to have to be allocating 1699 - * higher-order pages when we are unable 
to shrink dcache. 1700 - */ 1701 - if (flags & SLAB_RECLAIM_ACCOUNT) 1702 - break; 1703 - 1704 - /* 1705 - * Large number of objects is good, but very large slabs are 1706 - * currently bad for the gfp()s. 1707 - */ 1708 - if (gfporder >= slab_max_order) 1709 - break; 1710 - 1711 - /* 1712 - * Acceptable internal fragmentation? 1713 - */ 1714 - if (left_over * 8 <= (PAGE_SIZE << gfporder)) 1715 - break; 1716 - } 1717 - return left_over; 1718 - } 1719 - 1720 - static struct array_cache __percpu *alloc_kmem_cache_cpus( 1721 - struct kmem_cache *cachep, int entries, int batchcount) 1722 - { 1723 - int cpu; 1724 - size_t size; 1725 - struct array_cache __percpu *cpu_cache; 1726 - 1727 - size = sizeof(void *) * entries + sizeof(struct array_cache); 1728 - cpu_cache = __alloc_percpu(size, sizeof(void *)); 1729 - 1730 - if (!cpu_cache) 1731 - return NULL; 1732 - 1733 - for_each_possible_cpu(cpu) { 1734 - init_arraycache(per_cpu_ptr(cpu_cache, cpu), 1735 - entries, batchcount); 1736 - } 1737 - 1738 - return cpu_cache; 1739 - } 1740 - 1741 - static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 1742 - { 1743 - if (slab_state >= FULL) 1744 - return enable_cpucache(cachep, gfp); 1745 - 1746 - cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1); 1747 - if (!cachep->cpu_cache) 1748 - return 1; 1749 - 1750 - if (slab_state == DOWN) { 1751 - /* Creation of first cache (kmem_cache). 
/* SLAB performs no cache-specific flag filtering: pass @flags through. */
slab_flags_t kmem_cache_flags(unsigned int object_size,
	slab_flags_t flags, const char *name)
{
	return flags;
}

/*
 * Try to reuse an existing compatible cache instead of creating a new
 * one.  On a merge hit the found cache's refcount is bumped and its
 * object_size widened to cover @size; returns NULL when no mergeable
 * cache exists.
 */
struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
		   slab_flags_t flags, void (*ctor)(void *))
{
	struct kmem_cache *cachep;

	cachep = find_mergeable(size, align, flags, name, ctor);
	if (cachep) {
		cachep->refcount++;

		/*
		 * Adjust the object sizes so that we clear
		 * the complete object on kzalloc.
		 */
		cachep->object_size = max_t(int, cachep->object_size, size);
	}
	return cachep;
}

/*
 * Try the OBJFREELIST layout, where the freelist index array lives inside
 * one of the slab's own (free) objects.  Returns true and fills in
 * cachep->num/gfporder/colour on success, false if the layout can't be used.
 */
static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
			size_t size, slab_flags_t flags)
{
	size_t left;

	cachep->num = 0;

	/*
	 * If slab auto-initialization on free is enabled, store the freelist
	 * off-slab, so that its contents don't end up in one of the allocated
	 * objects.
	 */
	if (unlikely(slab_want_init_on_free(cachep)))
		return false;

	if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU)
		return false;

	left = calculate_slab_order(cachep, size,
			flags | CFLGS_OBJFREELIST_SLAB);
	if (!cachep->num)
		return false;

	/* The freelist must fit within a single object. */
	if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
		return false;

	cachep->colour = left / cachep->colour_off;

	return true;
}
/*
 * Try the off-slab layout, where the freelist array is allocated from a
 * kmalloc cache separate from the slab itself.  Returns true and fills in
 * cachep->num/gfporder/colour on success.
 */
static bool set_off_slab_cache(struct kmem_cache *cachep,
			size_t size, slab_flags_t flags)
{
	size_t left;

	cachep->num = 0;

	/*
	 * Always use on-slab management when SLAB_NOLEAKTRACE
	 * to avoid recursive calls into kmemleak.
	 */
	if (flags & SLAB_NOLEAKTRACE)
		return false;

	/*
	 * Size is large, assume best to place the slab management obj
	 * off-slab (should allow better packing of objs).
	 */
	left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
	if (!cachep->num)
		return false;

	/*
	 * If the slab has been placed off-slab, and we have enough space then
	 * move it on-slab. This is at the expense of any extra colouring.
	 */
	if (left >= cachep->num * sizeof(freelist_idx_t))
		return false;

	cachep->colour = left / cachep->colour_off;

	return true;
}

/*
 * Fallback layout: keep the freelist array on the slab itself, in the
 * bytes after the last object.  Returns false only if no order fits.
 */
static bool set_on_slab_cache(struct kmem_cache *cachep,
			size_t size, slab_flags_t flags)
{
	size_t left;

	cachep->num = 0;

	left = calculate_slab_order(cachep, size, flags);
	if (!cachep->num)
		return false;

	cachep->colour = left / cachep->colour_off;

	return true;
}

/*
 * __kmem_cache_create - Create a cache.
 * @cachep: cache management descriptor
 * @flags: SLAB flags
 *
 * Returns zero on success, nonzero on failure.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
{
	size_t ralign = BYTES_PER_WORD;
	gfp_t gfp;
	int err;
	unsigned int size = cachep->size;

#if DEBUG
#if FORCED_DEBUG
	/*
	 * Enable redzoning and last user accounting, except for caches with
	 * large objects, if the increased size would increase the object size
	 * above the next power of two: caches with object sizes just above a
	 * power of two have a significant amount of internal fragmentation.
	 */
	if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
						2 * sizeof(unsigned long long)))
		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
	if (!(flags & SLAB_TYPESAFE_BY_RCU))
		flags |= SLAB_POISON;
#endif
#endif

	/*
	 * Check that size is in terms of words.  This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	size = ALIGN(size, BYTES_PER_WORD);

	if (flags & SLAB_RED_ZONE) {
		ralign = REDZONE_ALIGN;
		/* If redzoning, ensure that the second redzone is suitably
		 * aligned, by adjusting the object size accordingly. */
		size = ALIGN(size, REDZONE_ALIGN);
	}

	/* 3) caller mandated alignment */
	if (ralign < cachep->align) {
		ralign = cachep->align;
	}
	/* disable debug if necessary */
	if (ralign > __alignof__(unsigned long long))
		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
	/*
	 * 4) Store it.
	 */
	cachep->align = ralign;
	cachep->colour_off = cache_line_size();
	/* Offset must be a multiple of the alignment. */
	if (cachep->colour_off < cachep->align)
		cachep->colour_off = cachep->align;

	if (slab_is_available())
		gfp = GFP_KERNEL;
	else
		gfp = GFP_NOWAIT;

#if DEBUG

	/*
	 * Both debugging options require word-alignment which is calculated
	 * into align above.
	 */
	if (flags & SLAB_RED_ZONE) {
		/* add space for red zone words */
		cachep->obj_offset += sizeof(unsigned long long);
		size += 2 * sizeof(unsigned long long);
	}
	if (flags & SLAB_STORE_USER) {
		/* user store requires one word storage behind the end of
		 * the real object. But if the second red zone needs to be
		 * aligned to 64 bits, we must allow that much space.
		 */
		if (flags & SLAB_RED_ZONE)
			size += REDZONE_ALIGN;
		else
			size += BYTES_PER_WORD;
	}
#endif

	kasan_cache_create(cachep, &size, &flags);

	size = ALIGN(size, cachep->align);
	/*
	 * We should restrict the number of objects in a slab to implement
	 * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition.
	 */
	if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
		size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);

#if DEBUG
	/*
	 * To activate debug pagealloc, off-slab management is necessary
	 * requirement. In early phase of initialization, small sized slab
	 * doesn't get initialized so it would not be possible. So, we need
	 * to check size >= 256. It guarantees that all necessary small
	 * sized slab is initialized in current slab initialization sequence.
	 */
	if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) &&
		size >= 256 && cachep->object_size > cache_line_size()) {
		if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
			size_t tmp_size = ALIGN(size, PAGE_SIZE);

			if (set_off_slab_cache(cachep, tmp_size, flags)) {
				flags |= CFLGS_OFF_SLAB;
				cachep->obj_offset += tmp_size - size;
				size = tmp_size;
				goto done;
			}
		}
	}
#endif

	/* Layout preference: objfreelist, then off-slab, then on-slab. */
	if (set_objfreelist_slab_cache(cachep, size, flags)) {
		flags |= CFLGS_OBJFREELIST_SLAB;
		goto done;
	}

	if (set_off_slab_cache(cachep, size, flags)) {
		flags |= CFLGS_OFF_SLAB;
		goto done;
	}

	if (set_on_slab_cache(cachep, size, flags))
		goto done;

	return -E2BIG;

done:
	cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
	cachep->flags = flags;
	cachep->allocflags = __GFP_COMP;
	if (flags & SLAB_CACHE_DMA)
		cachep->allocflags |= GFP_DMA;
	if (flags & SLAB_CACHE_DMA32)
		cachep->allocflags |= GFP_DMA32;
	if (flags & SLAB_RECLAIM_ACCOUNT)
		cachep->allocflags |= __GFP_RECLAIMABLE;
	cachep->size = size;
	cachep->reciprocal_buffer_size = reciprocal_value(size);

#if DEBUG
	/*
	 * If we're going to use the generic kernel_map_pages()
	 * poisoning, then it's going to smash the contents of
	 * the redzone and userword anyhow, so switch them off.
	 */
	if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
		(cachep->flags & SLAB_POISON) &&
		is_debug_pagealloc_cache(cachep))
		cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif

	err = setup_cpu_cache(cachep, gfp);
	if (err) {
		__kmem_cache_release(cachep);
		return err;
	}

	return 0;
}

/* Debug-only sanity assertions; compiled out when !DEBUG. */
#if DEBUG
static void check_irq_off(void)
{
	BUG_ON(!irqs_disabled());
}

static void check_irq_on(void)
{
	BUG_ON(irqs_disabled());
}

static void check_mutex_acquired(void)
{
	BUG_ON(!mutex_is_locked(&slab_mutex));
}

static void check_spinlock_acquired(struct kmem_cache *cachep)
{
#ifdef CONFIG_SMP
	check_irq_off();
	assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock);
#endif
}

static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
{
#ifdef CONFIG_SMP
	check_irq_off();
	assert_raw_spin_locked(&get_node(cachep, node)->list_lock);
#endif
}

#else
#define check_irq_off() do { } while(0)
#define check_irq_on() do { } while(0)
#define check_mutex_acquired() do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
#define check_spinlock_acquired_node(x, y) do { } while(0)
#endif

/*
 * Free some entries of @ac back to the node lists; caller holds the
 * node's list_lock.  If @free_all, drop everything, otherwise free about
 * a fifth of the limit, bounded by half of what is currently available.
 */
static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
				int node, bool free_all, struct list_head *list)
{
	int tofree;

	if (!ac || !ac->avail)
		return;

	tofree = free_all ? ac->avail : (ac->limit + 4) / 5;
	if (tofree > ac->avail)
		tofree = (ac->avail + 1) / 2;

	free_block(cachep, ac->entry, tofree, node, list);
	ac->avail -= tofree;
	memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail);
}
/*
 * IPI handler: flush the calling CPU's array cache for @arg (a
 * struct kmem_cache *) back to the per-node lists.  Runs with IRQs off.
 */
static void do_drain(void *arg)
{
	struct kmem_cache *cachep = arg;
	struct array_cache *ac;
	int node = numa_mem_id();
	struct kmem_cache_node *n;
	LIST_HEAD(list);

	check_irq_off();
	ac = cpu_cache_get(cachep);
	n = get_node(cachep, node);
	raw_spin_lock(&n->list_lock);
	free_block(cachep, ac->entry, ac->avail, node, &list);
	raw_spin_unlock(&n->list_lock);
	ac->avail = 0;
	slabs_destroy(cachep, &list);
}

/*
 * Return every cached object of @cachep — per-CPU arrays, alien caches
 * and per-node shared arrays — to the slab lists.
 */
static void drain_cpu_caches(struct kmem_cache *cachep)
{
	struct kmem_cache_node *n;
	int node;
	LIST_HEAD(list);

	on_each_cpu(do_drain, cachep, 1);
	check_irq_on();
	for_each_kmem_cache_node(cachep, node, n)
		if (n->alien)
			drain_alien_cache(cachep, n->alien);

	for_each_kmem_cache_node(cachep, node, n) {
		raw_spin_lock_irq(&n->list_lock);
		drain_array_locked(cachep, n->shared, node, true, &list);
		raw_spin_unlock_irq(&n->list_lock);

		slabs_destroy(cachep, &list);
	}
}
/*
 * Remove slabs from the list of free slabs.
 * Specify the number of slabs to drain in tofree.
 *
 * Returns the actual number of slabs released.
 */
static int drain_freelist(struct kmem_cache *cache,
			struct kmem_cache_node *n, int tofree)
{
	struct list_head *p;
	int nr_freed;
	struct slab *slab;

	nr_freed = 0;
	while (nr_freed < tofree && !list_empty(&n->slabs_free)) {

		/* Re-take the lock each iteration so we can cond_resched(). */
		raw_spin_lock_irq(&n->list_lock);
		p = n->slabs_free.prev;
		if (p == &n->slabs_free) {
			raw_spin_unlock_irq(&n->list_lock);
			goto out;
		}

		slab = list_entry(p, struct slab, slab_list);
		list_del(&slab->slab_list);
		n->free_slabs--;
		n->total_slabs--;
		/*
		 * Safe to drop the lock. The slab is no longer linked
		 * to the cache.
		 */
		n->free_objects -= cache->num;
		raw_spin_unlock_irq(&n->list_lock);
		slab_destroy(cache, slab);
		nr_freed++;

		cond_resched();
	}
out:
	return nr_freed;
}

/* True iff no node of @s holds any full or partial slab. */
bool __kmem_cache_empty(struct kmem_cache *s)
{
	int node;
	struct kmem_cache_node *n;

	for_each_kmem_cache_node(s, node, n)
		if (!list_empty(&n->slabs_full) ||
		    !list_empty(&n->slabs_partial))
			return false;
	return true;
}

/*
 * Shrink @cachep: drain all cached objects and release every free slab.
 * Returns 0 if the cache is now completely empty, 1 otherwise.
 */
int __kmem_cache_shrink(struct kmem_cache *cachep)
{
	int ret = 0;
	int node;
	struct kmem_cache_node *n;

	drain_cpu_caches(cachep);

	check_irq_on();
	for_each_kmem_cache_node(cachep, node, n) {
		drain_freelist(cachep, n, INT_MAX);

		ret += !list_empty(&n->slabs_full) ||
			!list_empty(&n->slabs_partial);
	}
	return (ret ? 1 : 0);
}
/* Shutdown is just a full shrink; nonzero return means objects remain. */
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
	return __kmem_cache_shrink(cachep);
}

/* Release all per-CPU and per-node management structures of @cachep. */
void __kmem_cache_release(struct kmem_cache *cachep)
{
	int i;
	struct kmem_cache_node *n;

	cache_random_seq_destroy(cachep);

	free_percpu(cachep->cpu_cache);

	/* NUMA: free the node structures */
	for_each_kmem_cache_node(cachep, i, n) {
		kfree(n->shared);
		free_alien_cache(n->alien);
		kfree(n);
		cachep->node[i] = NULL;
	}
}

/*
 * Get the memory for a slab management obj.
 *
 * For a slab cache when the slab descriptor is off-slab, the
 * slab descriptor can't come from the same cache which is being created,
 * Because if it is the case, that means we defer the creation of
 * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
 * And we eventually call down to __kmem_cache_create(), which
 * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one.
 * This is a "chicken-and-egg" problem.
 *
 * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
 * which are all initialized during kmem_cache_init().
 */
static void *alloc_slabmgmt(struct kmem_cache *cachep,
				   struct slab *slab, int colour_off,
				   gfp_t local_flags, int nodeid)
{
	void *freelist;
	void *addr = slab_address(slab);

	/* Objects start after the colour offset. */
	slab->s_mem = addr + colour_off;
	slab->active = 0;

	if (OBJFREELIST_SLAB(cachep))
		freelist = NULL;
	else if (OFF_SLAB(cachep)) {
		/* Slab management obj is off-slab. */
		freelist = kmalloc_node(cachep->freelist_size,
					local_flags, nodeid);
	} else {
		/* We will use last bytes at the slab for freelist */
		freelist = addr + (PAGE_SIZE << cachep->gfporder) -
				cachep->freelist_size;
	}

	return freelist;
}
/* Read freelist slot @idx of @slab. */
static inline freelist_idx_t get_free_obj(struct slab *slab, unsigned int idx)
{
	return ((freelist_idx_t *) slab->freelist)[idx];
}

/* Write @val into freelist slot @idx of @slab. */
static inline void set_free_obj(struct slab *slab,
					unsigned int idx, freelist_idx_t val)
{
	((freelist_idx_t *)(slab->freelist))[idx] = val;
}

/*
 * Debug-only initialization of every object in @slab: set up user/redzone
 * words, run the constructor, verify the redzones and poison free objects.
 * Compiles to nothing when !DEBUG.
 */
static void cache_init_objs_debug(struct kmem_cache *cachep, struct slab *slab)
{
#if DEBUG
	int i;

	for (i = 0; i < cachep->num; i++) {
		void *objp = index_to_obj(cachep, slab, i);

		if (cachep->flags & SLAB_STORE_USER)
			*dbg_userword(cachep, objp) = NULL;

		if (cachep->flags & SLAB_RED_ZONE) {
			*dbg_redzone1(cachep, objp) = RED_INACTIVE;
			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
		}
		/*
		 * Constructors are not allowed to allocate memory from the same
		 * cache which they are a constructor for. Otherwise, deadlock.
		 * They must also be threaded.
		 */
		if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
			kasan_unpoison_object_data(cachep,
						   objp + obj_offset(cachep));
			cachep->ctor(objp + obj_offset(cachep));
			kasan_poison_object_data(
				cachep, objp + obj_offset(cachep));
		}

		if (cachep->flags & SLAB_RED_ZONE) {
			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the end of an object");
			if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
				slab_error(cachep, "constructor overwrote the start of an object");
		}
		/* need to poison the objs? */
		if (cachep->flags & SLAB_POISON) {
			poison_obj(cachep, objp, POISON_FREE);
			slab_kernel_map(cachep, objp, 0);
		}
	}
#endif
}
#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Hold information during a freelist initialization */
struct freelist_init_state {
	unsigned int pos;	/* next index to consume from @list */
	unsigned int *list;	/* cache's pre-computed random sequence */
	unsigned int count;	/* number of entries in @list */
};

/*
 * Initialize the state based on the randomization method available.
 * return true if the pre-computed list is available, false otherwise.
 */
static bool freelist_state_initialize(struct freelist_init_state *state,
				struct kmem_cache *cachep,
				unsigned int count)
{
	bool ret;
	if (!cachep->random_seq) {
		ret = false;
	} else {
		state->list = cachep->random_seq;
		state->count = count;
		/* Start at a random position so slabs differ from each other. */
		state->pos = get_random_u32_below(count);
		ret = true;
	}
	return ret;
}

/* Get the next entry on the list and randomize it using a random shift */
static freelist_idx_t next_random_slot(struct freelist_init_state *state)
{
	if (state->pos >= state->count)
		state->pos = 0;
	return state->list[state->pos++];
}

/* Swap two freelist entries */
static void swap_free_obj(struct slab *slab, unsigned int a, unsigned int b)
{
	swap(((freelist_idx_t *) slab->freelist)[a],
		((freelist_idx_t *) slab->freelist)[b]);
}
/*
 * Shuffle the freelist initialization state based on pre-computed lists.
 * return true if the list was successfully shuffled, false otherwise.
 */
static bool shuffle_freelist(struct kmem_cache *cachep, struct slab *slab)
{
	unsigned int objfreelist = 0, i, rand, count = cachep->num;
	struct freelist_init_state state;
	bool precomputed;

	/* Nothing to shuffle with fewer than two objects. */
	if (count < 2)
		return false;

	precomputed = freelist_state_initialize(&state, cachep, count);

	/* Take a random entry as the objfreelist */
	if (OBJFREELIST_SLAB(cachep)) {
		if (!precomputed)
			objfreelist = count - 1;
		else
			objfreelist = next_random_slot(&state);
		slab->freelist = index_to_obj(cachep, slab, objfreelist) +
						obj_offset(cachep);
		count--;
	}

	/*
	 * On early boot, generate the list dynamically.
	 * Later use a pre-computed list for speed.
	 */
	if (!precomputed) {
		for (i = 0; i < count; i++)
			set_free_obj(slab, i, i);

		/* Fisher-Yates shuffle */
		for (i = count - 1; i > 0; i--) {
			rand = get_random_u32_below(i + 1);
			swap_free_obj(slab, i, rand);
		}
	} else {
		for (i = 0; i < count; i++)
			set_free_obj(slab, i, next_random_slot(&state));
	}

	if (OBJFREELIST_SLAB(cachep))
		set_free_obj(slab, cachep->num - 1, objfreelist);

	return true;
}
#else
static inline bool shuffle_freelist(struct kmem_cache *cachep,
				struct slab *slab)
{
	return false;
}
#endif /* CONFIG_SLAB_FREELIST_RANDOM */

/*
 * Initialize all objects and the freelist of a freshly allocated @slab,
 * optionally randomizing the freelist order.
 */
static void cache_init_objs(struct kmem_cache *cachep,
			    struct slab *slab)
{
	int i;
	void *objp;
	bool shuffled;

	cache_init_objs_debug(cachep, slab);

	/* Try to randomize the freelist if enabled */
	shuffled = shuffle_freelist(cachep, slab);

	if (!shuffled && OBJFREELIST_SLAB(cachep)) {
		slab->freelist = index_to_obj(cachep, slab, cachep->num - 1) +
						obj_offset(cachep);
	}

	for (i = 0; i < cachep->num; i++) {
		objp = index_to_obj(cachep, slab, i);
		objp = kasan_init_slab_obj(cachep, objp);

		/* constructor could break poison info */
		if (DEBUG == 0 && cachep->ctor) {
			kasan_unpoison_object_data(cachep, objp);
			cachep->ctor(objp);
			kasan_poison_object_data(cachep, objp);
		}

		if (!shuffled)
			set_free_obj(slab, i, i);
	}
}
obj_offset(cachep); 2467 - } 2468 - 2469 - for (i = 0; i < cachep->num; i++) { 2470 - objp = index_to_obj(cachep, slab, i); 2471 - objp = kasan_init_slab_obj(cachep, objp); 2472 - 2473 - /* constructor could break poison info */ 2474 - if (DEBUG == 0 && cachep->ctor) { 2475 - kasan_unpoison_object_data(cachep, objp); 2476 - cachep->ctor(objp); 2477 - kasan_poison_object_data(cachep, objp); 2478 - } 2479 - 2480 - if (!shuffled) 2481 - set_free_obj(slab, i, i); 2482 - } 2483 - } 2484 - 2485 - static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slab) 2486 - { 2487 - void *objp; 2488 - 2489 - objp = index_to_obj(cachep, slab, get_free_obj(slab, slab->active)); 2490 - slab->active++; 2491 - 2492 - return objp; 2493 - } 2494 - 2495 - static void slab_put_obj(struct kmem_cache *cachep, 2496 - struct slab *slab, void *objp) 2497 - { 2498 - unsigned int objnr = obj_to_index(cachep, slab, objp); 2499 - #if DEBUG 2500 - unsigned int i; 2501 - 2502 - /* Verify double free bug */ 2503 - for (i = slab->active; i < cachep->num; i++) { 2504 - if (get_free_obj(slab, i) == objnr) { 2505 - pr_err("slab: double free detected in cache '%s', objp %px\n", 2506 - cachep->name, objp); 2507 - BUG(); 2508 - } 2509 - } 2510 - #endif 2511 - slab->active--; 2512 - if (!slab->freelist) 2513 - slab->freelist = objp + obj_offset(cachep); 2514 - 2515 - set_free_obj(slab, slab->active, objnr); 2516 - } 2517 - 2518 - /* 2519 - * Grow (by 1) the number of slabs within a cache. This is called by 2520 - * kmem_cache_alloc() when there are no active objs left in a cache. 2521 - */ 2522 - static struct slab *cache_grow_begin(struct kmem_cache *cachep, 2523 - gfp_t flags, int nodeid) 2524 - { 2525 - void *freelist; 2526 - size_t offset; 2527 - gfp_t local_flags; 2528 - int slab_node; 2529 - struct kmem_cache_node *n; 2530 - struct slab *slab; 2531 - 2532 - /* 2533 - * Be lazy and only check for valid flags here, keeping it out of the 2534 - * critical path in kmem_cache_alloc(). 
2535 - */ 2536 - if (unlikely(flags & GFP_SLAB_BUG_MASK)) 2537 - flags = kmalloc_fix_flags(flags); 2538 - 2539 - WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); 2540 - local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 2541 - 2542 - check_irq_off(); 2543 - if (gfpflags_allow_blocking(local_flags)) 2544 - local_irq_enable(); 2545 - 2546 - /* 2547 - * Get mem for the objs. Attempt to allocate a physical page from 2548 - * 'nodeid'. 2549 - */ 2550 - slab = kmem_getpages(cachep, local_flags, nodeid); 2551 - if (!slab) 2552 - goto failed; 2553 - 2554 - slab_node = slab_nid(slab); 2555 - n = get_node(cachep, slab_node); 2556 - 2557 - /* Get colour for the slab, and cal the next value. */ 2558 - n->colour_next++; 2559 - if (n->colour_next >= cachep->colour) 2560 - n->colour_next = 0; 2561 - 2562 - offset = n->colour_next; 2563 - if (offset >= cachep->colour) 2564 - offset = 0; 2565 - 2566 - offset *= cachep->colour_off; 2567 - 2568 - /* 2569 - * Call kasan_poison_slab() before calling alloc_slabmgmt(), so 2570 - * page_address() in the latter returns a non-tagged pointer, 2571 - * as it should be for slab pages. 2572 - */ 2573 - kasan_poison_slab(slab); 2574 - 2575 - /* Get slab management. 
*/ 2576 - freelist = alloc_slabmgmt(cachep, slab, offset, 2577 - local_flags & ~GFP_CONSTRAINT_MASK, slab_node); 2578 - if (OFF_SLAB(cachep) && !freelist) 2579 - goto opps1; 2580 - 2581 - slab->slab_cache = cachep; 2582 - slab->freelist = freelist; 2583 - 2584 - cache_init_objs(cachep, slab); 2585 - 2586 - if (gfpflags_allow_blocking(local_flags)) 2587 - local_irq_disable(); 2588 - 2589 - return slab; 2590 - 2591 - opps1: 2592 - kmem_freepages(cachep, slab); 2593 - failed: 2594 - if (gfpflags_allow_blocking(local_flags)) 2595 - local_irq_disable(); 2596 - return NULL; 2597 - } 2598 - 2599 - static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab) 2600 - { 2601 - struct kmem_cache_node *n; 2602 - void *list = NULL; 2603 - 2604 - check_irq_off(); 2605 - 2606 - if (!slab) 2607 - return; 2608 - 2609 - INIT_LIST_HEAD(&slab->slab_list); 2610 - n = get_node(cachep, slab_nid(slab)); 2611 - 2612 - raw_spin_lock(&n->list_lock); 2613 - n->total_slabs++; 2614 - if (!slab->active) { 2615 - list_add_tail(&slab->slab_list, &n->slabs_free); 2616 - n->free_slabs++; 2617 - } else 2618 - fixup_slab_list(cachep, n, slab, &list); 2619 - 2620 - STATS_INC_GROWN(cachep); 2621 - n->free_objects += cachep->num - slab->active; 2622 - raw_spin_unlock(&n->list_lock); 2623 - 2624 - fixup_objfreelist_debug(cachep, &list); 2625 - } 2626 - 2627 - #if DEBUG 2628 - 2629 - /* 2630 - * Perform extra freeing checks: 2631 - * - detect bad pointers. 2632 - * - POISON/RED_ZONE checking 2633 - */ 2634 - static void kfree_debugcheck(const void *objp) 2635 - { 2636 - if (!virt_addr_valid(objp)) { 2637 - pr_err("kfree_debugcheck: out of range ptr %lxh\n", 2638 - (unsigned long)objp); 2639 - BUG(); 2640 - } 2641 - } 2642 - 2643 - static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) 2644 - { 2645 - unsigned long long redzone1, redzone2; 2646 - 2647 - redzone1 = *dbg_redzone1(cache, obj); 2648 - redzone2 = *dbg_redzone2(cache, obj); 2649 - 2650 - /* 2651 - * Redzone is ok. 
/*
 * Debug-time checks on a freed object: verify cache membership, redzones
 * and index consistency, record the caller and re-poison the object.
 * Returns the pointer adjusted back to the start of the debug header.
 */
static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
				   unsigned long caller)
{
	unsigned int objnr;
	struct slab *slab;

	BUG_ON(virt_to_cache(objp) != cachep);

	objp -= obj_offset(cachep);
	kfree_debugcheck(objp);
	slab = virt_to_slab(objp);

	if (cachep->flags & SLAB_RED_ZONE) {
		verify_redzone_free(cachep, objp);
		*dbg_redzone1(cachep, objp) = RED_INACTIVE;
		*dbg_redzone2(cachep, objp) = RED_INACTIVE;
	}
	if (cachep->flags & SLAB_STORE_USER)
		*dbg_userword(cachep, objp) = (void *)caller;

	objnr = obj_to_index(cachep, slab, objp);

	BUG_ON(objnr >= cachep->num);
	BUG_ON(objp != index_to_obj(cachep, slab, objnr));

	if (cachep->flags & SLAB_POISON) {
		poison_obj(cachep, objp, POISON_FREE);
		slab_kernel_map(cachep, objp, 0);
	}
	return objp;
}

#else
#define kfree_debugcheck(x) do { } while(0)
#define cache_free_debugcheck(x, objp, z) (objp)
#endif

/*
 * Poison the objfreelist objects queued on @list by fixup_slab_list();
 * done here, after the node's list_lock has been dropped.
 */
static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
						void **list)
{
#if DEBUG
	void *next = *list;
	void *objp;

	while (next) {
		objp = next - obj_offset(cachep);
		next = *(void **)next;
		poison_obj(cachep, objp, POISON_FREE);
	}
#endif
}

/*
 * Move @slab to the full or partial list of @n according to its number
 * of active objects.  Caller holds the node's list_lock.
 */
static inline void fixup_slab_list(struct kmem_cache *cachep,
				struct kmem_cache_node *n, struct slab *slab,
				void **list)
{
	/* move slabp to correct slabp list: */
	list_del(&slab->slab_list);
	if (slab->active == cachep->num) {
		list_add(&slab->slab_list, &n->slabs_full);
		if (OBJFREELIST_SLAB(cachep)) {
#if DEBUG
			/* Poisoning will be done without holding the lock */
			if (cachep->flags & SLAB_POISON) {
				void **objp = slab->freelist;

				*objp = *list;
				*list = objp;
			}
#endif
			slab->freelist = NULL;
		}
	} else
		list_add(&slab->slab_list, &n->slabs_partial);
}
void **list) 2720 - { 2721 - /* move slabp to correct slabp list: */ 2722 - list_del(&slab->slab_list); 2723 - if (slab->active == cachep->num) { 2724 - list_add(&slab->slab_list, &n->slabs_full); 2725 - if (OBJFREELIST_SLAB(cachep)) { 2726 - #if DEBUG 2727 - /* Poisoning will be done without holding the lock */ 2728 - if (cachep->flags & SLAB_POISON) { 2729 - void **objp = slab->freelist; 2730 - 2731 - *objp = *list; 2732 - *list = objp; 2733 - } 2734 - #endif 2735 - slab->freelist = NULL; 2736 - } 2737 - } else 2738 - list_add(&slab->slab_list, &n->slabs_partial); 2739 - } 2740 - 2741 - /* Try to find non-pfmemalloc slab if needed */ 2742 - static noinline struct slab *get_valid_first_slab(struct kmem_cache_node *n, 2743 - struct slab *slab, bool pfmemalloc) 2744 - { 2745 - if (!slab) 2746 - return NULL; 2747 - 2748 - if (pfmemalloc) 2749 - return slab; 2750 - 2751 - if (!slab_test_pfmemalloc(slab)) 2752 - return slab; 2753 - 2754 - /* No need to keep pfmemalloc slab if we have enough free objects */ 2755 - if (n->free_objects > n->free_limit) { 2756 - slab_clear_pfmemalloc(slab); 2757 - return slab; 2758 - } 2759 - 2760 - /* Move pfmemalloc slab to the end of list to speed up next search */ 2761 - list_del(&slab->slab_list); 2762 - if (!slab->active) { 2763 - list_add_tail(&slab->slab_list, &n->slabs_free); 2764 - n->free_slabs++; 2765 - } else 2766 - list_add_tail(&slab->slab_list, &n->slabs_partial); 2767 - 2768 - list_for_each_entry(slab, &n->slabs_partial, slab_list) { 2769 - if (!slab_test_pfmemalloc(slab)) 2770 - return slab; 2771 - } 2772 - 2773 - n->free_touched = 1; 2774 - list_for_each_entry(slab, &n->slabs_free, slab_list) { 2775 - if (!slab_test_pfmemalloc(slab)) { 2776 - n->free_slabs--; 2777 - return slab; 2778 - } 2779 - } 2780 - 2781 - return NULL; 2782 - } 2783 - 2784 - static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) 2785 - { 2786 - struct slab *slab; 2787 - 2788 - assert_raw_spin_locked(&n->list_lock); 2789 - slab 
= list_first_entry_or_null(&n->slabs_partial, struct slab, 2790 - slab_list); 2791 - if (!slab) { 2792 - n->free_touched = 1; 2793 - slab = list_first_entry_or_null(&n->slabs_free, struct slab, 2794 - slab_list); 2795 - if (slab) 2796 - n->free_slabs--; 2797 - } 2798 - 2799 - if (sk_memalloc_socks()) 2800 - slab = get_valid_first_slab(n, slab, pfmemalloc); 2801 - 2802 - return slab; 2803 - } 2804 - 2805 - static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, 2806 - struct kmem_cache_node *n, gfp_t flags) 2807 - { 2808 - struct slab *slab; 2809 - void *obj; 2810 - void *list = NULL; 2811 - 2812 - if (!gfp_pfmemalloc_allowed(flags)) 2813 - return NULL; 2814 - 2815 - raw_spin_lock(&n->list_lock); 2816 - slab = get_first_slab(n, true); 2817 - if (!slab) { 2818 - raw_spin_unlock(&n->list_lock); 2819 - return NULL; 2820 - } 2821 - 2822 - obj = slab_get_obj(cachep, slab); 2823 - n->free_objects--; 2824 - 2825 - fixup_slab_list(cachep, n, slab, &list); 2826 - 2827 - raw_spin_unlock(&n->list_lock); 2828 - fixup_objfreelist_debug(cachep, &list); 2829 - 2830 - return obj; 2831 - } 2832 - 2833 - /* 2834 - * Slab list should be fixed up by fixup_slab_list() for existing slab 2835 - * or cache_grow_end() for new slab 2836 - */ 2837 - static __always_inline int alloc_block(struct kmem_cache *cachep, 2838 - struct array_cache *ac, struct slab *slab, int batchcount) 2839 - { 2840 - /* 2841 - * There must be at least one object available for 2842 - * allocation. 
2843 - */ 2844 - BUG_ON(slab->active >= cachep->num); 2845 - 2846 - while (slab->active < cachep->num && batchcount--) { 2847 - STATS_INC_ALLOCED(cachep); 2848 - STATS_INC_ACTIVE(cachep); 2849 - STATS_SET_HIGH(cachep); 2850 - 2851 - ac->entry[ac->avail++] = slab_get_obj(cachep, slab); 2852 - } 2853 - 2854 - return batchcount; 2855 - } 2856 - 2857 - static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 2858 - { 2859 - int batchcount; 2860 - struct kmem_cache_node *n; 2861 - struct array_cache *ac, *shared; 2862 - int node; 2863 - void *list = NULL; 2864 - struct slab *slab; 2865 - 2866 - check_irq_off(); 2867 - node = numa_mem_id(); 2868 - 2869 - ac = cpu_cache_get(cachep); 2870 - batchcount = ac->batchcount; 2871 - if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2872 - /* 2873 - * If there was little recent activity on this cache, then 2874 - * perform only a partial refill. Otherwise we could generate 2875 - * refill bouncing. 2876 - */ 2877 - batchcount = BATCHREFILL_LIMIT; 2878 - } 2879 - n = get_node(cachep, node); 2880 - 2881 - BUG_ON(ac->avail > 0 || !n); 2882 - shared = READ_ONCE(n->shared); 2883 - if (!n->free_objects && (!shared || !shared->avail)) 2884 - goto direct_grow; 2885 - 2886 - raw_spin_lock(&n->list_lock); 2887 - shared = READ_ONCE(n->shared); 2888 - 2889 - /* See if we can refill from the shared array */ 2890 - if (shared && transfer_objects(ac, shared, batchcount)) { 2891 - shared->touched = 1; 2892 - goto alloc_done; 2893 - } 2894 - 2895 - while (batchcount > 0) { 2896 - /* Get slab alloc is to come from. 
*/ 2897 - slab = get_first_slab(n, false); 2898 - if (!slab) 2899 - goto must_grow; 2900 - 2901 - check_spinlock_acquired(cachep); 2902 - 2903 - batchcount = alloc_block(cachep, ac, slab, batchcount); 2904 - fixup_slab_list(cachep, n, slab, &list); 2905 - } 2906 - 2907 - must_grow: 2908 - n->free_objects -= ac->avail; 2909 - alloc_done: 2910 - raw_spin_unlock(&n->list_lock); 2911 - fixup_objfreelist_debug(cachep, &list); 2912 - 2913 - direct_grow: 2914 - if (unlikely(!ac->avail)) { 2915 - /* Check if we can use obj in pfmemalloc slab */ 2916 - if (sk_memalloc_socks()) { 2917 - void *obj = cache_alloc_pfmemalloc(cachep, n, flags); 2918 - 2919 - if (obj) 2920 - return obj; 2921 - } 2922 - 2923 - slab = cache_grow_begin(cachep, gfp_exact_node(flags), node); 2924 - 2925 - /* 2926 - * cache_grow_begin() can reenable interrupts, 2927 - * then ac could change. 2928 - */ 2929 - ac = cpu_cache_get(cachep); 2930 - if (!ac->avail && slab) 2931 - alloc_block(cachep, ac, slab, batchcount); 2932 - cache_grow_end(cachep, slab); 2933 - 2934 - if (!ac->avail) 2935 - return NULL; 2936 - } 2937 - ac->touched = 1; 2938 - 2939 - return ac->entry[--ac->avail]; 2940 - } 2941 - 2942 - #if DEBUG 2943 - static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 2944 - gfp_t flags, void *objp, unsigned long caller) 2945 - { 2946 - WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); 2947 - if (!objp || is_kfence_address(objp)) 2948 - return objp; 2949 - if (cachep->flags & SLAB_POISON) { 2950 - check_poison_obj(cachep, objp); 2951 - slab_kernel_map(cachep, objp, 1); 2952 - poison_obj(cachep, objp, POISON_INUSE); 2953 - } 2954 - if (cachep->flags & SLAB_STORE_USER) 2955 - *dbg_userword(cachep, objp) = (void *)caller; 2956 - 2957 - if (cachep->flags & SLAB_RED_ZONE) { 2958 - if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || 2959 - *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2960 - slab_error(cachep, "double free, or memory outside object was overwritten"); 2961 - pr_err("%px: 
redzone 1:0x%llx, redzone 2:0x%llx\n", 2962 - objp, *dbg_redzone1(cachep, objp), 2963 - *dbg_redzone2(cachep, objp)); 2964 - } 2965 - *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2966 - *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2967 - } 2968 - 2969 - objp += obj_offset(cachep); 2970 - if (cachep->ctor && cachep->flags & SLAB_POISON) 2971 - cachep->ctor(objp); 2972 - if ((unsigned long)objp & (arch_slab_minalign() - 1)) { 2973 - pr_err("0x%px: not aligned to arch_slab_minalign()=%u\n", objp, 2974 - arch_slab_minalign()); 2975 - } 2976 - return objp; 2977 - } 2978 - #else 2979 - #define cache_alloc_debugcheck_after(a, b, objp, d) (objp) 2980 - #endif 2981 - 2982 - static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 2983 - { 2984 - void *objp; 2985 - struct array_cache *ac; 2986 - 2987 - check_irq_off(); 2988 - 2989 - ac = cpu_cache_get(cachep); 2990 - if (likely(ac->avail)) { 2991 - ac->touched = 1; 2992 - objp = ac->entry[--ac->avail]; 2993 - 2994 - STATS_INC_ALLOCHIT(cachep); 2995 - goto out; 2996 - } 2997 - 2998 - STATS_INC_ALLOCMISS(cachep); 2999 - objp = cache_alloc_refill(cachep, flags); 3000 - /* 3001 - * the 'ac' may be updated by cache_alloc_refill(), 3002 - * and kmemleak_erase() requires its correct value. 3003 - */ 3004 - ac = cpu_cache_get(cachep); 3005 - 3006 - out: 3007 - /* 3008 - * To avoid a false negative, if an object that is in one of the 3009 - * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3010 - * treat the array pointers as a reference to the object. 3011 - */ 3012 - if (objp) 3013 - kmemleak_erase(&ac->entry[ac->avail]); 3014 - return objp; 3015 - } 3016 - 3017 - #ifdef CONFIG_NUMA 3018 - static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 3019 - 3020 - /* 3021 - * Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set. 
3022 - * 3023 - * If we are in_interrupt, then process context, including cpusets and 3024 - * mempolicy, may not apply and should not be used for allocation policy. 3025 - */ 3026 - static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 3027 - { 3028 - int nid_alloc, nid_here; 3029 - 3030 - if (in_interrupt() || (flags & __GFP_THISNODE)) 3031 - return NULL; 3032 - nid_alloc = nid_here = numa_mem_id(); 3033 - if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3034 - nid_alloc = cpuset_slab_spread_node(); 3035 - else if (current->mempolicy) 3036 - nid_alloc = mempolicy_slab_node(); 3037 - if (nid_alloc != nid_here) 3038 - return ____cache_alloc_node(cachep, flags, nid_alloc); 3039 - return NULL; 3040 - } 3041 - 3042 - /* 3043 - * Fallback function if there was no memory available and no objects on a 3044 - * certain node and fall back is permitted. First we scan all the 3045 - * available node for available objects. If that fails then we 3046 - * perform an allocation without specifying a node. This allows the page 3047 - * allocator to do its reclaim / fallback magic. We then insert the 3048 - * slab into the proper nodelist and then allocate from it. 3049 - */ 3050 - static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3051 - { 3052 - struct zonelist *zonelist; 3053 - struct zoneref *z; 3054 - struct zone *zone; 3055 - enum zone_type highest_zoneidx = gfp_zone(flags); 3056 - void *obj = NULL; 3057 - struct slab *slab; 3058 - int nid; 3059 - unsigned int cpuset_mems_cookie; 3060 - 3061 - if (flags & __GFP_THISNODE) 3062 - return NULL; 3063 - 3064 - retry_cpuset: 3065 - cpuset_mems_cookie = read_mems_allowed_begin(); 3066 - zonelist = node_zonelist(mempolicy_slab_node(), flags); 3067 - 3068 - retry: 3069 - /* 3070 - * Look through allowed nodes for objects available 3071 - * from existing per node queues. 
3072 - */ 3073 - for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { 3074 - nid = zone_to_nid(zone); 3075 - 3076 - if (cpuset_zone_allowed(zone, flags) && 3077 - get_node(cache, nid) && 3078 - get_node(cache, nid)->free_objects) { 3079 - obj = ____cache_alloc_node(cache, 3080 - gfp_exact_node(flags), nid); 3081 - if (obj) 3082 - break; 3083 - } 3084 - } 3085 - 3086 - if (!obj) { 3087 - /* 3088 - * This allocation will be performed within the constraints 3089 - * of the current cpuset / memory policy requirements. 3090 - * We may trigger various forms of reclaim on the allowed 3091 - * set and go into memory reserves if necessary. 3092 - */ 3093 - slab = cache_grow_begin(cache, flags, numa_mem_id()); 3094 - cache_grow_end(cache, slab); 3095 - if (slab) { 3096 - nid = slab_nid(slab); 3097 - obj = ____cache_alloc_node(cache, 3098 - gfp_exact_node(flags), nid); 3099 - 3100 - /* 3101 - * Another processor may allocate the objects in 3102 - * the slab since we are not holding any locks. 
3103 - */ 3104 - if (!obj) 3105 - goto retry; 3106 - } 3107 - } 3108 - 3109 - if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) 3110 - goto retry_cpuset; 3111 - return obj; 3112 - } 3113 - 3114 - /* 3115 - * An interface to enable slab creation on nodeid 3116 - */ 3117 - static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3118 - int nodeid) 3119 - { 3120 - struct slab *slab; 3121 - struct kmem_cache_node *n; 3122 - void *obj = NULL; 3123 - void *list = NULL; 3124 - 3125 - VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); 3126 - n = get_node(cachep, nodeid); 3127 - BUG_ON(!n); 3128 - 3129 - check_irq_off(); 3130 - raw_spin_lock(&n->list_lock); 3131 - slab = get_first_slab(n, false); 3132 - if (!slab) 3133 - goto must_grow; 3134 - 3135 - check_spinlock_acquired_node(cachep, nodeid); 3136 - 3137 - STATS_INC_NODEALLOCS(cachep); 3138 - STATS_INC_ACTIVE(cachep); 3139 - STATS_SET_HIGH(cachep); 3140 - 3141 - BUG_ON(slab->active == cachep->num); 3142 - 3143 - obj = slab_get_obj(cachep, slab); 3144 - n->free_objects--; 3145 - 3146 - fixup_slab_list(cachep, n, slab, &list); 3147 - 3148 - raw_spin_unlock(&n->list_lock); 3149 - fixup_objfreelist_debug(cachep, &list); 3150 - return obj; 3151 - 3152 - must_grow: 3153 - raw_spin_unlock(&n->list_lock); 3154 - slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); 3155 - if (slab) { 3156 - /* This slab isn't counted yet so don't update free_objects */ 3157 - obj = slab_get_obj(cachep, slab); 3158 - } 3159 - cache_grow_end(cachep, slab); 3160 - 3161 - return obj ? 
obj : fallback_alloc(cachep, flags); 3162 - } 3163 - 3164 - static __always_inline void * 3165 - __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3166 - { 3167 - void *objp = NULL; 3168 - int slab_node = numa_mem_id(); 3169 - 3170 - if (nodeid == NUMA_NO_NODE) { 3171 - if (current->mempolicy || cpuset_do_slab_mem_spread()) { 3172 - objp = alternate_node_alloc(cachep, flags); 3173 - if (objp) 3174 - goto out; 3175 - } 3176 - /* 3177 - * Use the locally cached objects if possible. 3178 - * However ____cache_alloc does not allow fallback 3179 - * to other nodes. It may fail while we still have 3180 - * objects on other nodes available. 3181 - */ 3182 - objp = ____cache_alloc(cachep, flags); 3183 - nodeid = slab_node; 3184 - } else if (nodeid == slab_node) { 3185 - objp = ____cache_alloc(cachep, flags); 3186 - } else if (!get_node(cachep, nodeid)) { 3187 - /* Node not bootstrapped yet */ 3188 - objp = fallback_alloc(cachep, flags); 3189 - goto out; 3190 - } 3191 - 3192 - /* 3193 - * We may just have run out of memory on the local node. 
3194 - * ____cache_alloc_node() knows how to locate memory on other nodes 3195 - */ 3196 - if (!objp) 3197 - objp = ____cache_alloc_node(cachep, flags, nodeid); 3198 - out: 3199 - return objp; 3200 - } 3201 - #else 3202 - 3203 - static __always_inline void * 3204 - __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused) 3205 - { 3206 - return ____cache_alloc(cachep, flags); 3207 - } 3208 - 3209 - #endif /* CONFIG_NUMA */ 3210 - 3211 - static __always_inline void * 3212 - slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, 3213 - int nodeid, size_t orig_size, unsigned long caller) 3214 - { 3215 - unsigned long save_flags; 3216 - void *objp; 3217 - struct obj_cgroup *objcg = NULL; 3218 - bool init = false; 3219 - 3220 - flags &= gfp_allowed_mask; 3221 - cachep = slab_pre_alloc_hook(cachep, lru, &objcg, 1, flags); 3222 - if (unlikely(!cachep)) 3223 - return NULL; 3224 - 3225 - objp = kfence_alloc(cachep, orig_size, flags); 3226 - if (unlikely(objp)) 3227 - goto out; 3228 - 3229 - local_irq_save(save_flags); 3230 - objp = __do_cache_alloc(cachep, flags, nodeid); 3231 - local_irq_restore(save_flags); 3232 - objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3233 - prefetchw(objp); 3234 - init = slab_want_init_on_alloc(flags, cachep); 3235 - 3236 - out: 3237 - slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init, 3238 - cachep->object_size); 3239 - return objp; 3240 - } 3241 - 3242 - static __always_inline void * 3243 - slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, 3244 - size_t orig_size, unsigned long caller) 3245 - { 3246 - return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size, 3247 - caller); 3248 - } 3249 - 3250 - /* 3251 - * Caller needs to acquire correct kmem_cache_node's list_lock 3252 - * @list: List of detached free slabs should be freed by caller 3253 - */ 3254 - static void free_block(struct kmem_cache *cachep, void **objpp, 3255 - int 
nr_objects, int node, struct list_head *list) 3256 - { 3257 - int i; 3258 - struct kmem_cache_node *n = get_node(cachep, node); 3259 - struct slab *slab; 3260 - 3261 - n->free_objects += nr_objects; 3262 - 3263 - for (i = 0; i < nr_objects; i++) { 3264 - void *objp; 3265 - struct slab *slab; 3266 - 3267 - objp = objpp[i]; 3268 - 3269 - slab = virt_to_slab(objp); 3270 - list_del(&slab->slab_list); 3271 - check_spinlock_acquired_node(cachep, node); 3272 - slab_put_obj(cachep, slab, objp); 3273 - STATS_DEC_ACTIVE(cachep); 3274 - 3275 - /* fixup slab chains */ 3276 - if (slab->active == 0) { 3277 - list_add(&slab->slab_list, &n->slabs_free); 3278 - n->free_slabs++; 3279 - } else { 3280 - /* Unconditionally move a slab to the end of the 3281 - * partial list on free - maximum time for the 3282 - * other objects to be freed, too. 3283 - */ 3284 - list_add_tail(&slab->slab_list, &n->slabs_partial); 3285 - } 3286 - } 3287 - 3288 - while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) { 3289 - n->free_objects -= cachep->num; 3290 - 3291 - slab = list_last_entry(&n->slabs_free, struct slab, slab_list); 3292 - list_move(&slab->slab_list, list); 3293 - n->free_slabs--; 3294 - n->total_slabs--; 3295 - } 3296 - } 3297 - 3298 - static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) 3299 - { 3300 - int batchcount; 3301 - struct kmem_cache_node *n; 3302 - int node = numa_mem_id(); 3303 - LIST_HEAD(list); 3304 - 3305 - batchcount = ac->batchcount; 3306 - 3307 - check_irq_off(); 3308 - n = get_node(cachep, node); 3309 - raw_spin_lock(&n->list_lock); 3310 - if (n->shared) { 3311 - struct array_cache *shared_array = n->shared; 3312 - int max = shared_array->limit - shared_array->avail; 3313 - if (max) { 3314 - if (batchcount > max) 3315 - batchcount = max; 3316 - memcpy(&(shared_array->entry[shared_array->avail]), 3317 - ac->entry, sizeof(void *) * batchcount); 3318 - shared_array->avail += batchcount; 3319 - goto free_done; 3320 - } 3321 - } 
3322 - 3323 - free_block(cachep, ac->entry, batchcount, node, &list); 3324 - free_done: 3325 - #if STATS 3326 - { 3327 - int i = 0; 3328 - struct slab *slab; 3329 - 3330 - list_for_each_entry(slab, &n->slabs_free, slab_list) { 3331 - BUG_ON(slab->active); 3332 - 3333 - i++; 3334 - } 3335 - STATS_SET_FREEABLE(cachep, i); 3336 - } 3337 - #endif 3338 - raw_spin_unlock(&n->list_lock); 3339 - ac->avail -= batchcount; 3340 - memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); 3341 - slabs_destroy(cachep, &list); 3342 - } 3343 - 3344 - /* 3345 - * Release an obj back to its cache. If the obj has a constructed state, it must 3346 - * be in this state _before_ it is released. Called with disabled ints. 3347 - */ 3348 - static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, 3349 - unsigned long caller) 3350 - { 3351 - bool init; 3352 - 3353 - memcg_slab_free_hook(cachep, virt_to_slab(objp), &objp, 1); 3354 - 3355 - if (is_kfence_address(objp)) { 3356 - kmemleak_free_recursive(objp, cachep->flags); 3357 - __kfence_free(objp); 3358 - return; 3359 - } 3360 - 3361 - /* 3362 - * As memory initialization might be integrated into KASAN, 3363 - * kasan_slab_free and initialization memset must be 3364 - * kept together to avoid discrepancies in behavior. 3365 - */ 3366 - init = slab_want_init_on_free(cachep); 3367 - if (init && !kasan_has_integrated_init()) 3368 - memset(objp, 0, cachep->object_size); 3369 - /* KASAN might put objp into memory quarantine, delaying its reuse. */ 3370 - if (kasan_slab_free(cachep, objp, init)) 3371 - return; 3372 - 3373 - /* Use KCSAN to help debug racy use-after-free. 
*/ 3374 - if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU)) 3375 - __kcsan_check_access(objp, cachep->object_size, 3376 - KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); 3377 - 3378 - ___cache_free(cachep, objp, caller); 3379 - } 3380 - 3381 - void ___cache_free(struct kmem_cache *cachep, void *objp, 3382 - unsigned long caller) 3383 - { 3384 - struct array_cache *ac = cpu_cache_get(cachep); 3385 - 3386 - check_irq_off(); 3387 - kmemleak_free_recursive(objp, cachep->flags); 3388 - objp = cache_free_debugcheck(cachep, objp, caller); 3389 - 3390 - /* 3391 - * Skip calling cache_free_alien() when the platform is not numa. 3392 - * This will avoid cache misses that happen while accessing slabp (which 3393 - * is per page memory reference) to get nodeid. Instead use a global 3394 - * variable to skip the call, which is mostly likely to be present in 3395 - * the cache. 3396 - */ 3397 - if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) 3398 - return; 3399 - 3400 - if (ac->avail < ac->limit) { 3401 - STATS_INC_FREEHIT(cachep); 3402 - } else { 3403 - STATS_INC_FREEMISS(cachep); 3404 - cache_flusharray(cachep, ac); 3405 - } 3406 - 3407 - if (sk_memalloc_socks()) { 3408 - struct slab *slab = virt_to_slab(objp); 3409 - 3410 - if (unlikely(slab_test_pfmemalloc(slab))) { 3411 - cache_free_pfmemalloc(cachep, slab, objp); 3412 - return; 3413 - } 3414 - } 3415 - 3416 - __free_one(ac, objp); 3417 - } 3418 - 3419 - static __always_inline 3420 - void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, 3421 - gfp_t flags) 3422 - { 3423 - void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_); 3424 - 3425 - trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE); 3426 - 3427 - return ret; 3428 - } 3429 - 3430 - void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3431 - { 3432 - return __kmem_cache_alloc_lru(cachep, NULL, flags); 3433 - } 3434 - EXPORT_SYMBOL(kmem_cache_alloc); 3435 - 3436 - void *kmem_cache_alloc_lru(struct 
kmem_cache *cachep, struct list_lru *lru, 3437 - gfp_t flags) 3438 - { 3439 - return __kmem_cache_alloc_lru(cachep, lru, flags); 3440 - } 3441 - EXPORT_SYMBOL(kmem_cache_alloc_lru); 3442 - 3443 - static __always_inline void 3444 - cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, 3445 - size_t size, void **p, unsigned long caller) 3446 - { 3447 - size_t i; 3448 - 3449 - for (i = 0; i < size; i++) 3450 - p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); 3451 - } 3452 - 3453 - int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 3454 - void **p) 3455 - { 3456 - struct obj_cgroup *objcg = NULL; 3457 - unsigned long irqflags; 3458 - size_t i; 3459 - 3460 - s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); 3461 - if (!s) 3462 - return 0; 3463 - 3464 - local_irq_save(irqflags); 3465 - for (i = 0; i < size; i++) { 3466 - void *objp = kfence_alloc(s, s->object_size, flags) ?: 3467 - __do_cache_alloc(s, flags, NUMA_NO_NODE); 3468 - 3469 - if (unlikely(!objp)) 3470 - goto error; 3471 - p[i] = objp; 3472 - } 3473 - local_irq_restore(irqflags); 3474 - 3475 - cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); 3476 - 3477 - /* 3478 - * memcg and kmem_cache debug support and memory initialization. 3479 - * Done outside of the IRQ disabled section. 3480 - */ 3481 - slab_post_alloc_hook(s, objcg, flags, size, p, 3482 - slab_want_init_on_alloc(flags, s), s->object_size); 3483 - /* FIXME: Trace call missing. Christoph would like a bulk variant */ 3484 - return size; 3485 - error: 3486 - local_irq_restore(irqflags); 3487 - cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); 3488 - slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); 3489 - kmem_cache_free_bulk(s, i, p); 3490 - return 0; 3491 - } 3492 - EXPORT_SYMBOL(kmem_cache_alloc_bulk); 3493 - 3494 - /** 3495 - * kmem_cache_alloc_node - Allocate an object on the specified node 3496 - * @cachep: The cache to allocate from. 
3497 - * @flags: See kmalloc(). 3498 - * @nodeid: node number of the target node. 3499 - * 3500 - * Identical to kmem_cache_alloc but it will allocate memory on the given 3501 - * node, which can improve the performance for cpu bound structures. 3502 - * 3503 - * Fallback to other node is possible if __GFP_THISNODE is not set. 3504 - * 3505 - * Return: pointer to the new object or %NULL in case of error 3506 - */ 3507 - void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3508 - { 3509 - void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); 3510 - 3511 - trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid); 3512 - 3513 - return ret; 3514 - } 3515 - EXPORT_SYMBOL(kmem_cache_alloc_node); 3516 - 3517 - void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3518 - int nodeid, size_t orig_size, 3519 - unsigned long caller) 3520 - { 3521 - return slab_alloc_node(cachep, NULL, flags, nodeid, 3522 - orig_size, caller); 3523 - } 3524 - 3525 - #ifdef CONFIG_PRINTK 3526 - void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) 3527 - { 3528 - struct kmem_cache *cachep; 3529 - unsigned int objnr; 3530 - void *objp; 3531 - 3532 - kpp->kp_ptr = object; 3533 - kpp->kp_slab = slab; 3534 - cachep = slab->slab_cache; 3535 - kpp->kp_slab_cache = cachep; 3536 - objp = object - obj_offset(cachep); 3537 - kpp->kp_data_offset = obj_offset(cachep); 3538 - slab = virt_to_slab(objp); 3539 - objnr = obj_to_index(cachep, slab, objp); 3540 - objp = index_to_obj(cachep, slab, objnr); 3541 - kpp->kp_objp = objp; 3542 - if (DEBUG && cachep->flags & SLAB_STORE_USER) 3543 - kpp->kp_ret = *dbg_userword(cachep, objp); 3544 - } 3545 - #endif 3546 - 3547 - static __always_inline 3548 - void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp, 3549 - unsigned long caller) 3550 - { 3551 - unsigned long flags; 3552 - 3553 - local_irq_save(flags); 3554 - debug_check_no_locks_freed(objp, 
cachep->object_size); 3555 - if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3556 - debug_check_no_obj_freed(objp, cachep->object_size); 3557 - __cache_free(cachep, objp, caller); 3558 - local_irq_restore(flags); 3559 - } 3560 - 3561 - void __kmem_cache_free(struct kmem_cache *cachep, void *objp, 3562 - unsigned long caller) 3563 - { 3564 - __do_kmem_cache_free(cachep, objp, caller); 3565 - } 3566 - 3567 - /** 3568 - * kmem_cache_free - Deallocate an object 3569 - * @cachep: The cache the allocation was from. 3570 - * @objp: The previously allocated object. 3571 - * 3572 - * Free an object which was previously allocated from this 3573 - * cache. 3574 - */ 3575 - void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3576 - { 3577 - cachep = cache_from_obj(cachep, objp); 3578 - if (!cachep) 3579 - return; 3580 - 3581 - trace_kmem_cache_free(_RET_IP_, objp, cachep); 3582 - __do_kmem_cache_free(cachep, objp, _RET_IP_); 3583 - } 3584 - EXPORT_SYMBOL(kmem_cache_free); 3585 - 3586 - void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) 3587 - { 3588 - unsigned long flags; 3589 - 3590 - local_irq_save(flags); 3591 - for (int i = 0; i < size; i++) { 3592 - void *objp = p[i]; 3593 - struct kmem_cache *s; 3594 - 3595 - if (!orig_s) { 3596 - struct folio *folio = virt_to_folio(objp); 3597 - 3598 - /* called via kfree_bulk */ 3599 - if (!folio_test_slab(folio)) { 3600 - local_irq_restore(flags); 3601 - free_large_kmalloc(folio, objp); 3602 - local_irq_save(flags); 3603 - continue; 3604 - } 3605 - s = folio_slab(folio)->slab_cache; 3606 - } else { 3607 - s = cache_from_obj(orig_s, objp); 3608 - } 3609 - 3610 - if (!s) 3611 - continue; 3612 - 3613 - debug_check_no_locks_freed(objp, s->object_size); 3614 - if (!(s->flags & SLAB_DEBUG_OBJECTS)) 3615 - debug_check_no_obj_freed(objp, s->object_size); 3616 - 3617 - __cache_free(s, objp, _RET_IP_); 3618 - } 3619 - local_irq_restore(flags); 3620 - 3621 - /* FIXME: add tracing */ 3622 - } 3623 - 
EXPORT_SYMBOL(kmem_cache_free_bulk); 3624 - 3625 - /* 3626 - * This initializes kmem_cache_node or resizes various caches for all nodes. 3627 - */ 3628 - static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp) 3629 - { 3630 - int ret; 3631 - int node; 3632 - struct kmem_cache_node *n; 3633 - 3634 - for_each_online_node(node) { 3635 - ret = setup_kmem_cache_node(cachep, node, gfp, true); 3636 - if (ret) 3637 - goto fail; 3638 - 3639 - } 3640 - 3641 - return 0; 3642 - 3643 - fail: 3644 - if (!cachep->list.next) { 3645 - /* Cache is not active yet. Roll back what we did */ 3646 - node--; 3647 - while (node >= 0) { 3648 - n = get_node(cachep, node); 3649 - if (n) { 3650 - kfree(n->shared); 3651 - free_alien_cache(n->alien); 3652 - kfree(n); 3653 - cachep->node[node] = NULL; 3654 - } 3655 - node--; 3656 - } 3657 - } 3658 - return -ENOMEM; 3659 - } 3660 - 3661 - /* Always called with the slab_mutex held */ 3662 - static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3663 - int batchcount, int shared, gfp_t gfp) 3664 - { 3665 - struct array_cache __percpu *cpu_cache, *prev; 3666 - int cpu; 3667 - 3668 - cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount); 3669 - if (!cpu_cache) 3670 - return -ENOMEM; 3671 - 3672 - prev = cachep->cpu_cache; 3673 - cachep->cpu_cache = cpu_cache; 3674 - /* 3675 - * Without a previous cpu_cache there's no need to synchronize remote 3676 - * cpus, so skip the IPIs. 
3677 - */ 3678 - if (prev) 3679 - kick_all_cpus_sync(); 3680 - 3681 - check_irq_on(); 3682 - cachep->batchcount = batchcount; 3683 - cachep->limit = limit; 3684 - cachep->shared = shared; 3685 - 3686 - if (!prev) 3687 - goto setup_node; 3688 - 3689 - for_each_online_cpu(cpu) { 3690 - LIST_HEAD(list); 3691 - int node; 3692 - struct kmem_cache_node *n; 3693 - struct array_cache *ac = per_cpu_ptr(prev, cpu); 3694 - 3695 - node = cpu_to_mem(cpu); 3696 - n = get_node(cachep, node); 3697 - raw_spin_lock_irq(&n->list_lock); 3698 - free_block(cachep, ac->entry, ac->avail, node, &list); 3699 - raw_spin_unlock_irq(&n->list_lock); 3700 - slabs_destroy(cachep, &list); 3701 - } 3702 - free_percpu(prev); 3703 - 3704 - setup_node: 3705 - return setup_kmem_cache_nodes(cachep, gfp); 3706 - } 3707 - 3708 - /* Called with slab_mutex held always */ 3709 - static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 3710 - { 3711 - int err; 3712 - int limit = 0; 3713 - int shared = 0; 3714 - int batchcount = 0; 3715 - 3716 - err = cache_random_seq_create(cachep, cachep->num, gfp); 3717 - if (err) 3718 - goto end; 3719 - 3720 - /* 3721 - * The head array serves three purposes: 3722 - * - create a LIFO ordering, i.e. return objects that are cache-warm 3723 - * - reduce the number of spinlock operations. 3724 - * - reduce the number of linked list operations on the slab and 3725 - * bufctl chains: array operations are cheaper. 3726 - * The numbers are guessed, we should auto-tune as described by 3727 - * Bonwick. 3728 - */ 3729 - if (cachep->size > 131072) 3730 - limit = 1; 3731 - else if (cachep->size > PAGE_SIZE) 3732 - limit = 8; 3733 - else if (cachep->size > 1024) 3734 - limit = 24; 3735 - else if (cachep->size > 256) 3736 - limit = 54; 3737 - else 3738 - limit = 120; 3739 - 3740 - /* 3741 - * CPU bound tasks (e.g. network routing) can exhibit cpu bound 3742 - * allocation behaviour: Most allocs on one cpu, most free operations 3743 - * on another cpu. 
For these cases, an efficient object passing between 3744 - * cpus is necessary. This is provided by a shared array. The array 3745 - * replaces Bonwick's magazine layer. 3746 - * On uniprocessor, it's functionally equivalent (but less efficient) 3747 - * to a larger limit. Thus disabled by default. 3748 - */ 3749 - shared = 0; 3750 - if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) 3751 - shared = 8; 3752 - 3753 - #if DEBUG 3754 - /* 3755 - * With debugging enabled, large batchcount lead to excessively long 3756 - * periods with disabled local interrupts. Limit the batchcount 3757 - */ 3758 - if (limit > 32) 3759 - limit = 32; 3760 - #endif 3761 - batchcount = (limit + 1) / 2; 3762 - err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); 3763 - end: 3764 - if (err) 3765 - pr_err("enable_cpucache failed for %s, error %d\n", 3766 - cachep->name, -err); 3767 - return err; 3768 - } 3769 - 3770 - /* 3771 - * Drain an array if it contains any elements taking the node lock only if 3772 - * necessary. Note that the node listlock also protects the array_cache 3773 - * if drain_array() is used on the shared array. 3774 - */ 3775 - static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, 3776 - struct array_cache *ac, int node) 3777 - { 3778 - LIST_HEAD(list); 3779 - 3780 - /* ac from n->shared can be freed if we don't hold the slab_mutex. */ 3781 - check_mutex_acquired(); 3782 - 3783 - if (!ac || !ac->avail) 3784 - return; 3785 - 3786 - if (ac->touched) { 3787 - ac->touched = 0; 3788 - return; 3789 - } 3790 - 3791 - raw_spin_lock_irq(&n->list_lock); 3792 - drain_array_locked(cachep, ac, node, false, &list); 3793 - raw_spin_unlock_irq(&n->list_lock); 3794 - 3795 - slabs_destroy(cachep, &list); 3796 - } 3797 - 3798 - /** 3799 - * cache_reap - Reclaim memory from caches. 3800 - * @w: work descriptor 3801 - * 3802 - * Called from workqueue/eventd every few seconds. 3803 - * Purpose: 3804 - * - clear the per-cpu caches for this CPU. 
3805 - * - return freeable pages to the main free memory pool. 3806 - * 3807 - * If we cannot acquire the cache chain mutex then just give up - we'll try 3808 - * again on the next iteration. 3809 - */ 3810 - static void cache_reap(struct work_struct *w) 3811 - { 3812 - struct kmem_cache *searchp; 3813 - struct kmem_cache_node *n; 3814 - int node = numa_mem_id(); 3815 - struct delayed_work *work = to_delayed_work(w); 3816 - 3817 - if (!mutex_trylock(&slab_mutex)) 3818 - /* Give up. Setup the next iteration. */ 3819 - goto out; 3820 - 3821 - list_for_each_entry(searchp, &slab_caches, list) { 3822 - check_irq_on(); 3823 - 3824 - /* 3825 - * We only take the node lock if absolutely necessary and we 3826 - * have established with reasonable certainty that 3827 - * we can do some work if the lock was obtained. 3828 - */ 3829 - n = get_node(searchp, node); 3830 - 3831 - reap_alien(searchp, n); 3832 - 3833 - drain_array(searchp, n, cpu_cache_get(searchp), node); 3834 - 3835 - /* 3836 - * These are racy checks but it does not matter 3837 - * if we skip one check or scan twice. 
3838 - */ 3839 - if (time_after(n->next_reap, jiffies)) 3840 - goto next; 3841 - 3842 - n->next_reap = jiffies + REAPTIMEOUT_NODE; 3843 - 3844 - drain_array(searchp, n, n->shared, node); 3845 - 3846 - if (n->free_touched) 3847 - n->free_touched = 0; 3848 - else { 3849 - int freed; 3850 - 3851 - freed = drain_freelist(searchp, n, (n->free_limit + 3852 - 5 * searchp->num - 1) / (5 * searchp->num)); 3853 - STATS_ADD_REAPED(searchp, freed); 3854 - } 3855 - next: 3856 - cond_resched(); 3857 - } 3858 - check_irq_on(); 3859 - mutex_unlock(&slab_mutex); 3860 - next_reap_node(); 3861 - out: 3862 - /* Set up the next iteration */ 3863 - schedule_delayed_work_on(smp_processor_id(), work, 3864 - round_jiffies_relative(REAPTIMEOUT_AC)); 3865 - } 3866 - 3867 - void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) 3868 - { 3869 - unsigned long active_objs, num_objs, active_slabs; 3870 - unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0; 3871 - unsigned long free_slabs = 0; 3872 - int node; 3873 - struct kmem_cache_node *n; 3874 - 3875 - for_each_kmem_cache_node(cachep, node, n) { 3876 - check_irq_on(); 3877 - raw_spin_lock_irq(&n->list_lock); 3878 - 3879 - total_slabs += n->total_slabs; 3880 - free_slabs += n->free_slabs; 3881 - free_objs += n->free_objects; 3882 - 3883 - if (n->shared) 3884 - shared_avail += n->shared->avail; 3885 - 3886 - raw_spin_unlock_irq(&n->list_lock); 3887 - } 3888 - num_objs = total_slabs * cachep->num; 3889 - active_slabs = total_slabs - free_slabs; 3890 - active_objs = num_objs - free_objs; 3891 - 3892 - sinfo->active_objs = active_objs; 3893 - sinfo->num_objs = num_objs; 3894 - sinfo->active_slabs = active_slabs; 3895 - sinfo->num_slabs = total_slabs; 3896 - sinfo->shared_avail = shared_avail; 3897 - sinfo->limit = cachep->limit; 3898 - sinfo->batchcount = cachep->batchcount; 3899 - sinfo->shared = cachep->shared; 3900 - sinfo->objects_per_slab = cachep->num; 3901 - sinfo->cache_order = cachep->gfporder; 3902 - } 3903 - 
3904 - void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) 3905 - { 3906 - #if STATS 3907 - { /* node stats */ 3908 - unsigned long high = cachep->high_mark; 3909 - unsigned long allocs = cachep->num_allocations; 3910 - unsigned long grown = cachep->grown; 3911 - unsigned long reaped = cachep->reaped; 3912 - unsigned long errors = cachep->errors; 3913 - unsigned long max_freeable = cachep->max_freeable; 3914 - unsigned long node_allocs = cachep->node_allocs; 3915 - unsigned long node_frees = cachep->node_frees; 3916 - unsigned long overflows = cachep->node_overflow; 3917 - 3918 - seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu", 3919 - allocs, high, grown, 3920 - reaped, errors, max_freeable, node_allocs, 3921 - node_frees, overflows); 3922 - } 3923 - /* cpu stats */ 3924 - { 3925 - unsigned long allochit = atomic_read(&cachep->allochit); 3926 - unsigned long allocmiss = atomic_read(&cachep->allocmiss); 3927 - unsigned long freehit = atomic_read(&cachep->freehit); 3928 - unsigned long freemiss = atomic_read(&cachep->freemiss); 3929 - 3930 - seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 3931 - allochit, allocmiss, freehit, freemiss); 3932 - } 3933 - #endif 3934 - } 3935 - 3936 - #define MAX_SLABINFO_WRITE 128 3937 - /** 3938 - * slabinfo_write - Tuning for the slab allocator 3939 - * @file: unused 3940 - * @buffer: user buffer 3941 - * @count: data length 3942 - * @ppos: unused 3943 - * 3944 - * Return: %0 on success, negative error code otherwise. 
3945 - */ 3946 - ssize_t slabinfo_write(struct file *file, const char __user *buffer, 3947 - size_t count, loff_t *ppos) 3948 - { 3949 - char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 3950 - int limit, batchcount, shared, res; 3951 - struct kmem_cache *cachep; 3952 - 3953 - if (count > MAX_SLABINFO_WRITE) 3954 - return -EINVAL; 3955 - if (copy_from_user(&kbuf, buffer, count)) 3956 - return -EFAULT; 3957 - kbuf[MAX_SLABINFO_WRITE] = '\0'; 3958 - 3959 - tmp = strchr(kbuf, ' '); 3960 - if (!tmp) 3961 - return -EINVAL; 3962 - *tmp = '\0'; 3963 - tmp++; 3964 - if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 3965 - return -EINVAL; 3966 - 3967 - /* Find the cache in the chain of caches. */ 3968 - mutex_lock(&slab_mutex); 3969 - res = -EINVAL; 3970 - list_for_each_entry(cachep, &slab_caches, list) { 3971 - if (!strcmp(cachep->name, kbuf)) { 3972 - if (limit < 1 || batchcount < 1 || 3973 - batchcount > limit || shared < 0) { 3974 - res = 0; 3975 - } else { 3976 - res = do_tune_cpucache(cachep, limit, 3977 - batchcount, shared, 3978 - GFP_KERNEL); 3979 - } 3980 - break; 3981 - } 3982 - } 3983 - mutex_unlock(&slab_mutex); 3984 - if (res >= 0) 3985 - res = count; 3986 - return res; 3987 - } 3988 - 3989 - #ifdef CONFIG_HARDENED_USERCOPY 3990 - /* 3991 - * Rejects incorrectly sized objects and objects that are to be copied 3992 - * to/from userspace but do not fall entirely within the containing slab 3993 - * cache's usercopy region. 3994 - * 3995 - * Returns NULL if check passes, otherwise const char * to name of cache 3996 - * to indicate an error. 3997 - */ 3998 - void __check_heap_object(const void *ptr, unsigned long n, 3999 - const struct slab *slab, bool to_user) 4000 - { 4001 - struct kmem_cache *cachep; 4002 - unsigned int objnr; 4003 - unsigned long offset; 4004 - 4005 - ptr = kasan_reset_tag(ptr); 4006 - 4007 - /* Find and validate object. 
*/ 4008 - cachep = slab->slab_cache; 4009 - objnr = obj_to_index(cachep, slab, (void *)ptr); 4010 - BUG_ON(objnr >= cachep->num); 4011 - 4012 - /* Find offset within object. */ 4013 - if (is_kfence_address(ptr)) 4014 - offset = ptr - kfence_object_start(ptr); 4015 - else 4016 - offset = ptr - index_to_obj(cachep, slab, objnr) - obj_offset(cachep); 4017 - 4018 - /* Allow address range falling entirely within usercopy region. */ 4019 - if (offset >= cachep->useroffset && 4020 - offset - cachep->useroffset <= cachep->usersize && 4021 - n <= cachep->useroffset - offset + cachep->usersize) 4022 - return; 4023 - 4024 - usercopy_abort("SLAB object", cachep->name, to_user, offset, n); 4025 - } 4026 - #endif /* CONFIG_HARDENED_USERCOPY */
+169 -382
mm/slab.h
··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 #ifndef MM_SLAB_H 3 3 #define MM_SLAB_H 4 + 5 + #include <linux/reciprocal_div.h> 6 + #include <linux/list_lru.h> 7 + #include <linux/local_lock.h> 8 + #include <linux/random.h> 9 + #include <linux/kobject.h> 10 + #include <linux/sched/mm.h> 11 + #include <linux/memcontrol.h> 12 + #include <linux/kfence.h> 13 + #include <linux/kasan.h> 14 + 4 15 /* 5 16 * Internal slab definitions 6 17 */ 7 - void __init kmem_cache_init(void); 8 18 9 19 #ifdef CONFIG_64BIT 10 20 # ifdef system_has_cmpxchg128 ··· 52 42 struct slab { 53 43 unsigned long __page_flags; 54 44 55 - #if defined(CONFIG_SLAB) 56 - 57 - struct kmem_cache *slab_cache; 58 - union { 59 - struct { 60 - struct list_head slab_list; 61 - void *freelist; /* array of free object indexes */ 62 - void *s_mem; /* first object */ 63 - }; 64 - struct rcu_head rcu_head; 65 - }; 66 - unsigned int active; 67 - 68 - #elif defined(CONFIG_SLUB) 69 - 70 45 struct kmem_cache *slab_cache; 71 46 union { 72 47 struct { ··· 86 91 }; 87 92 unsigned int __unused; 88 93 89 - #else 90 - #error "Unexpected slab allocator configured" 91 - #endif 92 - 93 94 atomic_t __page_refcount; 94 95 #ifdef CONFIG_MEMCG 95 96 unsigned long memcg_data; ··· 102 111 #endif 103 112 #undef SLAB_MATCH 104 113 static_assert(sizeof(struct slab) <= sizeof(struct page)); 105 - #if defined(system_has_freelist_aba) && defined(CONFIG_SLUB) 114 + #if defined(system_has_freelist_aba) 106 115 static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t))); 107 116 #endif 108 117 ··· 219 228 return PAGE_SIZE << slab_order(slab); 220 229 } 221 230 222 - #ifdef CONFIG_SLAB 223 - #include <linux/slab_def.h> 231 + #ifdef CONFIG_SLUB_CPU_PARTIAL 232 + #define slub_percpu_partial(c) ((c)->partial) 233 + 234 + #define slub_set_percpu_partial(c, p) \ 235 + ({ \ 236 + slub_percpu_partial(c) = (p)->next; \ 237 + }) 238 + 239 + #define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) 240 + 
#else 241 + #define slub_percpu_partial(c) NULL 242 + 243 + #define slub_set_percpu_partial(c, p) 244 + 245 + #define slub_percpu_partial_read_once(c) NULL 246 + #endif // CONFIG_SLUB_CPU_PARTIAL 247 + 248 + /* 249 + * Word size structure that can be atomically updated or read and that 250 + * contains both the order and the number of objects that a slab of the 251 + * given order would contain. 252 + */ 253 + struct kmem_cache_order_objects { 254 + unsigned int x; 255 + }; 256 + 257 + /* 258 + * Slab cache management. 259 + */ 260 + struct kmem_cache { 261 + #ifndef CONFIG_SLUB_TINY 262 + struct kmem_cache_cpu __percpu *cpu_slab; 263 + #endif 264 + /* Used for retrieving partial slabs, etc. */ 265 + slab_flags_t flags; 266 + unsigned long min_partial; 267 + unsigned int size; /* Object size including metadata */ 268 + unsigned int object_size; /* Object size without metadata */ 269 + struct reciprocal_value reciprocal_size; 270 + unsigned int offset; /* Free pointer offset */ 271 + #ifdef CONFIG_SLUB_CPU_PARTIAL 272 + /* Number of per cpu partial objects to keep around */ 273 + unsigned int cpu_partial; 274 + /* Number of per cpu partial slabs to keep around */ 275 + unsigned int cpu_partial_slabs; 276 + #endif 277 + struct kmem_cache_order_objects oo; 278 + 279 + /* Allocation and freeing of slabs */ 280 + struct kmem_cache_order_objects min; 281 + gfp_t allocflags; /* gfp flags to use on each alloc */ 282 + int refcount; /* Refcount for slab cache destroy */ 283 + void (*ctor)(void *object); /* Object constructor */ 284 + unsigned int inuse; /* Offset to metadata */ 285 + unsigned int align; /* Alignment */ 286 + unsigned int red_left_pad; /* Left redzone padding size */ 287 + const char *name; /* Name (only for display!) 
*/ 288 + struct list_head list; /* List of slab caches */ 289 + #ifdef CONFIG_SYSFS 290 + struct kobject kobj; /* For sysfs */ 291 + #endif 292 + #ifdef CONFIG_SLAB_FREELIST_HARDENED 293 + unsigned long random; 224 294 #endif 225 295 226 - #ifdef CONFIG_SLUB 227 - #include <linux/slub_def.h> 296 + #ifdef CONFIG_NUMA 297 + /* 298 + * Defragmentation by allocating from a remote node. 299 + */ 300 + unsigned int remote_node_defrag_ratio; 228 301 #endif 229 302 230 - #include <linux/memcontrol.h> 231 - #include <linux/fault-inject.h> 232 - #include <linux/kasan.h> 233 - #include <linux/kmemleak.h> 234 - #include <linux/random.h> 235 - #include <linux/sched/mm.h> 236 - #include <linux/list_lru.h> 303 + #ifdef CONFIG_SLAB_FREELIST_RANDOM 304 + unsigned int *random_seq; 305 + #endif 306 + 307 + #ifdef CONFIG_KASAN_GENERIC 308 + struct kasan_cache kasan_info; 309 + #endif 310 + 311 + #ifdef CONFIG_HARDENED_USERCOPY 312 + unsigned int useroffset; /* Usercopy region offset */ 313 + unsigned int usersize; /* Usercopy region size */ 314 + #endif 315 + 316 + struct kmem_cache_node *node[MAX_NUMNODES]; 317 + }; 318 + 319 + #if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) 320 + #define SLAB_SUPPORTS_SYSFS 321 + void sysfs_slab_unlink(struct kmem_cache *s); 322 + void sysfs_slab_release(struct kmem_cache *s); 323 + #else 324 + static inline void sysfs_slab_unlink(struct kmem_cache *s) { } 325 + static inline void sysfs_slab_release(struct kmem_cache *s) { } 326 + #endif 327 + 328 + void *fixup_red_left(struct kmem_cache *s, void *p); 329 + 330 + static inline void *nearest_obj(struct kmem_cache *cache, 331 + const struct slab *slab, void *x) 332 + { 333 + void *object = x - (x - slab_address(slab)) % cache->size; 334 + void *last_object = slab_address(slab) + 335 + (slab->objects - 1) * cache->size; 336 + void *result = (unlikely(object > last_object)) ? 
last_object : object; 337 + 338 + result = fixup_red_left(cache, result); 339 + return result; 340 + } 341 + 342 + /* Determine object index from a given position */ 343 + static inline unsigned int __obj_to_index(const struct kmem_cache *cache, 344 + void *addr, void *obj) 345 + { 346 + return reciprocal_divide(kasan_reset_tag(obj) - addr, 347 + cache->reciprocal_size); 348 + } 349 + 350 + static inline unsigned int obj_to_index(const struct kmem_cache *cache, 351 + const struct slab *slab, void *obj) 352 + { 353 + if (is_kfence_address(obj)) 354 + return 0; 355 + return __obj_to_index(cache, slab_address(slab), obj); 356 + } 357 + 358 + static inline int objs_per_slab(const struct kmem_cache *cache, 359 + const struct slab *slab) 360 + { 361 + return slab->objects; 362 + } 237 363 238 364 /* 239 365 * State of the slab allocator. ··· 389 281 void setup_kmalloc_cache_index_table(void); 390 282 void create_kmalloc_caches(slab_flags_t); 391 283 392 - /* Find the kmalloc slab corresponding for a certain size */ 393 - struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller); 284 + extern u8 kmalloc_size_index[24]; 394 285 395 - void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, 396 - int node, size_t orig_size, 397 - unsigned long caller); 398 - void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller); 286 + static inline unsigned int size_index_elem(unsigned int bytes) 287 + { 288 + return (bytes - 1) / 8; 289 + } 290 + 291 + /* 292 + * Find the kmem_cache structure that serves a given size of 293 + * allocation 294 + * 295 + * This assumes size is larger than zero and not larger than 296 + * KMALLOC_MAX_CACHE_SIZE and the caller must check that. 
297 + */ 298 + static inline struct kmem_cache * 299 + kmalloc_slab(size_t size, gfp_t flags, unsigned long caller) 300 + { 301 + unsigned int index; 302 + 303 + if (size <= 192) 304 + index = kmalloc_size_index[size_index_elem(size)]; 305 + else 306 + index = fls(size - 1); 307 + 308 + return kmalloc_caches[kmalloc_type(flags, caller)][index]; 309 + } 399 310 400 311 gfp_t kmalloc_fix_flags(gfp_t flags); 401 312 402 313 /* Functions provided by the slab allocators */ 403 314 int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); 404 315 316 + void __init kmem_cache_init(void); 405 317 void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type, 406 318 slab_flags_t flags); 407 319 extern void create_boot_cache(struct kmem_cache *, const char *name, ··· 448 320 SLAB_CACHE_DMA32 | SLAB_PANIC | \ 449 321 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) 450 322 451 - #if defined(CONFIG_DEBUG_SLAB) 452 - #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) 453 - #elif defined(CONFIG_SLUB_DEBUG) 323 + #ifdef CONFIG_SLUB_DEBUG 454 324 #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ 455 325 SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) 456 326 #else 457 327 #define SLAB_DEBUG_FLAGS (0) 458 328 #endif 459 329 460 - #if defined(CONFIG_SLAB) 461 - #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ 462 - SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ 463 - SLAB_ACCOUNT | SLAB_NO_MERGE) 464 - #elif defined(CONFIG_SLUB) 465 330 #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ 466 331 SLAB_TEMPORARY | SLAB_ACCOUNT | \ 467 332 SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) 468 - #else 469 - #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) 470 - #endif 471 333 472 334 /* Common flags available with current configuration */ 473 335 #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) ··· 504 386 void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); 505 387 
ssize_t slabinfo_write(struct file *file, const char __user *buffer, 506 388 size_t count, loff_t *ppos); 507 - 508 - static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) 509 - { 510 - return (s->flags & SLAB_RECLAIM_ACCOUNT) ? 511 - NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; 512 - } 513 389 514 390 #ifdef CONFIG_SLUB_DEBUG 515 391 #ifdef CONFIG_SLUB_DEBUG_ON ··· 564 452 gfp_t gfp, bool new_slab); 565 453 void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, 566 454 enum node_stat_item idx, int nr); 567 - 568 - static inline void memcg_free_slab_cgroups(struct slab *slab) 569 - { 570 - kfree(slab_objcgs(slab)); 571 - slab->memcg_data = 0; 572 - } 573 - 574 - static inline size_t obj_full_size(struct kmem_cache *s) 575 - { 576 - /* 577 - * For each accounted object there is an extra space which is used 578 - * to store obj_cgroup membership. Charge it too. 579 - */ 580 - return s->size + sizeof(struct obj_cgroup *); 581 - } 582 - 583 - /* 584 - * Returns false if the allocation should fail. 585 - */ 586 - static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, 587 - struct list_lru *lru, 588 - struct obj_cgroup **objcgp, 589 - size_t objects, gfp_t flags) 590 - { 591 - struct obj_cgroup *objcg; 592 - 593 - if (!memcg_kmem_online()) 594 - return true; 595 - 596 - if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) 597 - return true; 598 - 599 - /* 600 - * The obtained objcg pointer is safe to use within the current scope, 601 - * defined by current task or set_active_memcg() pair. 602 - * obj_cgroup_get() is used to get a permanent reference. 
603 - */ 604 - objcg = current_obj_cgroup(); 605 - if (!objcg) 606 - return true; 607 - 608 - if (lru) { 609 - int ret; 610 - struct mem_cgroup *memcg; 611 - 612 - memcg = get_mem_cgroup_from_objcg(objcg); 613 - ret = memcg_list_lru_alloc(memcg, lru, flags); 614 - css_put(&memcg->css); 615 - 616 - if (ret) 617 - return false; 618 - } 619 - 620 - if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) 621 - return false; 622 - 623 - *objcgp = objcg; 624 - return true; 625 - } 626 - 627 - static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, 628 - struct obj_cgroup *objcg, 629 - gfp_t flags, size_t size, 630 - void **p) 631 - { 632 - struct slab *slab; 633 - unsigned long off; 634 - size_t i; 635 - 636 - if (!memcg_kmem_online() || !objcg) 637 - return; 638 - 639 - for (i = 0; i < size; i++) { 640 - if (likely(p[i])) { 641 - slab = virt_to_slab(p[i]); 642 - 643 - if (!slab_objcgs(slab) && 644 - memcg_alloc_slab_cgroups(slab, s, flags, 645 - false)) { 646 - obj_cgroup_uncharge(objcg, obj_full_size(s)); 647 - continue; 648 - } 649 - 650 - off = obj_to_index(s, slab, p[i]); 651 - obj_cgroup_get(objcg); 652 - slab_objcgs(slab)[off] = objcg; 653 - mod_objcg_state(objcg, slab_pgdat(slab), 654 - cache_vmstat_idx(s), obj_full_size(s)); 655 - } else { 656 - obj_cgroup_uncharge(objcg, obj_full_size(s)); 657 - } 658 - } 659 - } 660 - 661 - static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, 662 - void **p, int objects) 663 - { 664 - struct obj_cgroup **objcgs; 665 - int i; 666 - 667 - if (!memcg_kmem_online()) 668 - return; 669 - 670 - objcgs = slab_objcgs(slab); 671 - if (!objcgs) 672 - return; 673 - 674 - for (i = 0; i < objects; i++) { 675 - struct obj_cgroup *objcg; 676 - unsigned int off; 677 - 678 - off = obj_to_index(s, slab, p[i]); 679 - objcg = objcgs[off]; 680 - if (!objcg) 681 - continue; 682 - 683 - objcgs[off] = NULL; 684 - obj_cgroup_uncharge(objcg, obj_full_size(s)); 685 - mod_objcg_state(objcg, 
slab_pgdat(slab), cache_vmstat_idx(s), 686 - -obj_full_size(s)); 687 - obj_cgroup_put(objcg); 688 - } 689 - } 690 - 691 455 #else /* CONFIG_MEMCG_KMEM */ 692 456 static inline struct obj_cgroup **slab_objcgs(struct slab *slab) 693 - { 694 - return NULL; 695 - } 696 - 697 - static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) 698 457 { 699 458 return NULL; 700 459 } ··· 576 593 { 577 594 return 0; 578 595 } 579 - 580 - static inline void memcg_free_slab_cgroups(struct slab *slab) 581 - { 582 - } 583 - 584 - static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, 585 - struct list_lru *lru, 586 - struct obj_cgroup **objcgp, 587 - size_t objects, gfp_t flags) 588 - { 589 - return true; 590 - } 591 - 592 - static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, 593 - struct obj_cgroup *objcg, 594 - gfp_t flags, size_t size, 595 - void **p) 596 - { 597 - } 598 - 599 - static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, 600 - void **p, int objects) 601 - { 602 - } 603 596 #endif /* CONFIG_MEMCG_KMEM */ 604 - 605 - static inline struct kmem_cache *virt_to_cache(const void *obj) 606 - { 607 - struct slab *slab; 608 - 609 - slab = virt_to_slab(obj); 610 - if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", 611 - __func__)) 612 - return NULL; 613 - return slab->slab_cache; 614 - } 615 - 616 - static __always_inline void account_slab(struct slab *slab, int order, 617 - struct kmem_cache *s, gfp_t gfp) 618 - { 619 - if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) 620 - memcg_alloc_slab_cgroups(slab, s, gfp, true); 621 - 622 - mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), 623 - PAGE_SIZE << order); 624 - } 625 - 626 - static __always_inline void unaccount_slab(struct slab *slab, int order, 627 - struct kmem_cache *s) 628 - { 629 - if (memcg_kmem_online()) 630 - memcg_free_slab_cgroups(slab); 631 - 632 - mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), 633 - -(PAGE_SIZE << order)); 
634 - } 635 - 636 - static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) 637 - { 638 - struct kmem_cache *cachep; 639 - 640 - if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && 641 - !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) 642 - return s; 643 - 644 - cachep = virt_to_cache(x); 645 - if (WARN(cachep && cachep != s, 646 - "%s: Wrong slab cache. %s but object is from %s\n", 647 - __func__, s->name, cachep->name)) 648 - print_tracking(cachep, x); 649 - return cachep; 650 - } 651 - 652 - void free_large_kmalloc(struct folio *folio, void *object); 653 597 654 598 size_t __ksize(const void *objp); 655 599 656 600 static inline size_t slab_ksize(const struct kmem_cache *s) 657 601 { 658 - #ifndef CONFIG_SLUB 659 - return s->object_size; 660 - 661 - #else /* CONFIG_SLUB */ 662 - # ifdef CONFIG_SLUB_DEBUG 602 + #ifdef CONFIG_SLUB_DEBUG 663 603 /* 664 604 * Debugging requires use of the padding between object 665 605 * and whatever may come after it. 666 606 */ 667 607 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 668 608 return s->object_size; 669 - # endif 609 + #endif 670 610 if (s->flags & SLAB_KASAN) 671 611 return s->object_size; 672 612 /* ··· 603 697 * Else we can use all the padding etc for the allocation 604 698 */ 605 699 return s->size; 606 - #endif 607 700 } 608 701 609 - static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, 610 - struct list_lru *lru, 611 - struct obj_cgroup **objcgp, 612 - size_t size, gfp_t flags) 613 - { 614 - flags &= gfp_allowed_mask; 615 - 616 - might_alloc(flags); 617 - 618 - if (should_failslab(s, flags)) 619 - return NULL; 620 - 621 - if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)) 622 - return NULL; 623 - 624 - return s; 625 - } 626 - 627 - static inline void slab_post_alloc_hook(struct kmem_cache *s, 628 - struct obj_cgroup *objcg, gfp_t flags, 629 - size_t size, void **p, bool init, 630 - unsigned int orig_size) 631 - { 632 - unsigned int zero_size = 
s->object_size; 633 - bool kasan_init = init; 634 - size_t i; 635 - 636 - flags &= gfp_allowed_mask; 637 - 638 - /* 639 - * For kmalloc object, the allocated memory size(object_size) is likely 640 - * larger than the requested size(orig_size). If redzone check is 641 - * enabled for the extra space, don't zero it, as it will be redzoned 642 - * soon. The redzone operation for this extra space could be seen as a 643 - * replacement of current poisoning under certain debug option, and 644 - * won't break other sanity checks. 645 - */ 646 - if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) && 647 - (s->flags & SLAB_KMALLOC)) 648 - zero_size = orig_size; 649 - 650 - /* 651 - * When slub_debug is enabled, avoid memory initialization integrated 652 - * into KASAN and instead zero out the memory via the memset below with 653 - * the proper size. Otherwise, KASAN might overwrite SLUB redzones and 654 - * cause false-positive reports. This does not lead to a performance 655 - * penalty on production builds, as slub_debug is not intended to be 656 - * enabled there. 657 - */ 658 - if (__slub_debug_enabled()) 659 - kasan_init = false; 660 - 661 - /* 662 - * As memory initialization might be integrated into KASAN, 663 - * kasan_slab_alloc and initialization memset must be 664 - * kept together to avoid discrepancies in behavior. 665 - * 666 - * As p[i] might get tagged, memset and kmemleak hook come after KASAN. 667 - */ 668 - for (i = 0; i < size; i++) { 669 - p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init); 670 - if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) 671 - memset(p[i], 0, zero_size); 672 - kmemleak_alloc_recursive(p[i], s->object_size, 1, 673 - s->flags, flags); 674 - kmsan_slab_alloc(s, p[i], flags); 675 - } 676 - 677 - memcg_slab_post_alloc_hook(s, objcg, flags, size, p); 678 - } 679 - 680 - /* 681 - * The slab lists for all objects. 
682 - */ 683 - struct kmem_cache_node { 684 - #ifdef CONFIG_SLAB 685 - raw_spinlock_t list_lock; 686 - struct list_head slabs_partial; /* partial list first, better asm code */ 687 - struct list_head slabs_full; 688 - struct list_head slabs_free; 689 - unsigned long total_slabs; /* length of all slab lists */ 690 - unsigned long free_slabs; /* length of free slab list only */ 691 - unsigned long free_objects; 692 - unsigned int free_limit; 693 - unsigned int colour_next; /* Per-node cache coloring */ 694 - struct array_cache *shared; /* shared per node */ 695 - struct alien_cache **alien; /* on other nodes */ 696 - unsigned long next_reap; /* updated without locking */ 697 - int free_touched; /* updated without locking */ 698 - #endif 699 - 700 - #ifdef CONFIG_SLUB 701 - spinlock_t list_lock; 702 - unsigned long nr_partial; 703 - struct list_head partial; 704 702 #ifdef CONFIG_SLUB_DEBUG 705 - atomic_long_t nr_slabs; 706 - atomic_long_t total_objects; 707 - struct list_head full; 708 - #endif 709 - #endif 710 - 711 - }; 712 - 713 - static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 714 - { 715 - return s->node[node]; 716 - } 717 - 718 - /* 719 - * Iterator over all nodes. The body will be executed for each node that has 720 - * a kmem_cache_node structure allocated (which is true for all online nodes) 721 - */ 722 - #define for_each_kmem_cache_node(__s, __node, __n) \ 723 - for (__node = 0; __node < nr_node_ids; __node++) \ 724 - if ((__n = get_node(__s, __node))) 725 - 726 - 727 - #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) 728 703 void dump_unreclaimable_slab(void); 729 704 #else 730 705 static inline void dump_unreclaimable_slab(void)
+8 -223
mm/slab_common.c
··· 21 21 #include <linux/swiotlb.h> 22 22 #include <linux/proc_fs.h> 23 23 #include <linux/debugfs.h> 24 + #include <linux/kmemleak.h> 24 25 #include <linux/kasan.h> 25 26 #include <asm/cacheflush.h> 26 27 #include <asm/tlbflush.h> ··· 72 71 return 1; 73 72 } 74 73 75 - #ifdef CONFIG_SLUB 76 74 __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0); 77 75 __setup_param("slub_merge", slub_merge, setup_slab_merge, 0); 78 - #endif 79 76 80 77 __setup("slab_nomerge", setup_slab_nomerge); 81 78 __setup("slab_merge", setup_slab_merge); ··· 194 195 continue; 195 196 196 197 if (s->size - size >= sizeof(void *)) 197 - continue; 198 - 199 - if (IS_ENABLED(CONFIG_SLAB) && align && 200 - (align > s->align || s->align % align)) 201 198 continue; 202 199 203 200 return s; ··· 665 670 * of two cache sizes there. The size of larger slabs can be determined using 666 671 * fls. 667 672 */ 668 - static u8 size_index[24] __ro_after_init = { 673 + u8 kmalloc_size_index[24] __ro_after_init = { 669 674 3, /* 8 */ 670 675 4, /* 16 */ 671 676 5, /* 24 */ ··· 691 696 2, /* 184 */ 692 697 2 /* 192 */ 693 698 }; 694 - 695 - static inline unsigned int size_index_elem(unsigned int bytes) 696 - { 697 - return (bytes - 1) / 8; 698 - } 699 - 700 - /* 701 - * Find the kmem_cache structure that serves a given size of 702 - * allocation 703 - */ 704 - struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller) 705 - { 706 - unsigned int index; 707 - 708 - if (size <= 192) { 709 - if (!size) 710 - return ZERO_SIZE_PTR; 711 - 712 - index = size_index[size_index_elem(size)]; 713 - } else { 714 - if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE)) 715 - return NULL; 716 - index = fls(size - 1); 717 - } 718 - 719 - return kmalloc_caches[kmalloc_type(flags, caller)][index]; 720 - } 721 699 722 700 size_t kmalloc_size_roundup(size_t size) 723 701 { ··· 816 848 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { 817 849 unsigned int elem = size_index_elem(i); 818 850 819 - if (elem >= 
ARRAY_SIZE(size_index)) 851 + if (elem >= ARRAY_SIZE(kmalloc_size_index)) 820 852 break; 821 - size_index[elem] = KMALLOC_SHIFT_LOW; 853 + kmalloc_size_index[elem] = KMALLOC_SHIFT_LOW; 822 854 } 823 855 824 856 if (KMALLOC_MIN_SIZE >= 64) { ··· 827 859 * is 64 byte. 828 860 */ 829 861 for (i = 64 + 8; i <= 96; i += 8) 830 - size_index[size_index_elem(i)] = 7; 862 + kmalloc_size_index[size_index_elem(i)] = 7; 831 863 832 864 } 833 865 ··· 838 870 * instead. 839 871 */ 840 872 for (i = 128 + 8; i <= 192; i += 8) 841 - size_index[size_index_elem(i)] = 8; 873 + kmalloc_size_index[size_index_elem(i)] = 8; 842 874 } 843 875 } 844 876 ··· 936 968 slab_state = UP; 937 969 } 938 970 939 - void free_large_kmalloc(struct folio *folio, void *object) 940 - { 941 - unsigned int order = folio_order(folio); 942 - 943 - if (WARN_ON_ONCE(order == 0)) 944 - pr_warn_once("object pointer: 0x%p\n", object); 945 - 946 - kmemleak_free(object); 947 - kasan_kfree_large(object); 948 - kmsan_kfree_large(object); 949 - 950 - mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, 951 - -(PAGE_SIZE << order)); 952 - __free_pages(folio_page(folio, 0), order); 953 - } 954 - 955 - static void *__kmalloc_large_node(size_t size, gfp_t flags, int node); 956 - static __always_inline 957 - void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) 958 - { 959 - struct kmem_cache *s; 960 - void *ret; 961 - 962 - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { 963 - ret = __kmalloc_large_node(size, flags, node); 964 - trace_kmalloc(caller, ret, size, 965 - PAGE_SIZE << get_order(size), flags, node); 966 - return ret; 967 - } 968 - 969 - s = kmalloc_slab(size, flags, caller); 970 - 971 - if (unlikely(ZERO_OR_NULL_PTR(s))) 972 - return s; 973 - 974 - ret = __kmem_cache_alloc_node(s, flags, node, size, caller); 975 - ret = kasan_kmalloc(s, ret, size, flags); 976 - trace_kmalloc(caller, ret, size, s->size, flags, node); 977 - return ret; 978 - } 979 - 980 - void 
*__kmalloc_node(size_t size, gfp_t flags, int node) 981 - { 982 - return __do_kmalloc_node(size, flags, node, _RET_IP_); 983 - } 984 - EXPORT_SYMBOL(__kmalloc_node); 985 - 986 - void *__kmalloc(size_t size, gfp_t flags) 987 - { 988 - return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); 989 - } 990 - EXPORT_SYMBOL(__kmalloc); 991 - 992 - void *__kmalloc_node_track_caller(size_t size, gfp_t flags, 993 - int node, unsigned long caller) 994 - { 995 - return __do_kmalloc_node(size, flags, node, caller); 996 - } 997 - EXPORT_SYMBOL(__kmalloc_node_track_caller); 998 - 999 - /** 1000 - * kfree - free previously allocated memory 1001 - * @object: pointer returned by kmalloc() or kmem_cache_alloc() 1002 - * 1003 - * If @object is NULL, no operation is performed. 1004 - */ 1005 - void kfree(const void *object) 1006 - { 1007 - struct folio *folio; 1008 - struct slab *slab; 1009 - struct kmem_cache *s; 1010 - 1011 - trace_kfree(_RET_IP_, object); 1012 - 1013 - if (unlikely(ZERO_OR_NULL_PTR(object))) 1014 - return; 1015 - 1016 - folio = virt_to_folio(object); 1017 - if (unlikely(!folio_test_slab(folio))) { 1018 - free_large_kmalloc(folio, (void *)object); 1019 - return; 1020 - } 1021 - 1022 - slab = folio_slab(folio); 1023 - s = slab->slab_cache; 1024 - __kmem_cache_free(s, (void *)object, _RET_IP_); 1025 - } 1026 - EXPORT_SYMBOL(kfree); 1027 - 1028 971 /** 1029 972 * __ksize -- Report full size of underlying allocation 1030 973 * @object: pointer to the object ··· 972 1093 return slab_ksize(folio_slab(folio)->slab_cache); 973 1094 } 974 1095 975 - void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) 976 - { 977 - void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, 978 - size, _RET_IP_); 979 - 980 - trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); 981 - 982 - ret = kasan_kmalloc(s, ret, size, gfpflags); 983 - return ret; 984 - } 985 - EXPORT_SYMBOL(kmalloc_trace); 986 - 987 - void *kmalloc_node_trace(struct kmem_cache 
*s, gfp_t gfpflags, 988 - int node, size_t size) 989 - { 990 - void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); 991 - 992 - trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node); 993 - 994 - ret = kasan_kmalloc(s, ret, size, gfpflags); 995 - return ret; 996 - } 997 - EXPORT_SYMBOL(kmalloc_node_trace); 998 - 999 1096 gfp_t kmalloc_fix_flags(gfp_t flags) 1000 1097 { 1001 1098 gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; ··· 983 1128 984 1129 return flags; 985 1130 } 986 - 987 - /* 988 - * To avoid unnecessary overhead, we pass through large allocation requests 989 - * directly to the page allocator. We use __GFP_COMP, because we will need to 990 - * know the allocation order to free the pages properly in kfree. 991 - */ 992 - 993 - static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) 994 - { 995 - struct page *page; 996 - void *ptr = NULL; 997 - unsigned int order = get_order(size); 998 - 999 - if (unlikely(flags & GFP_SLAB_BUG_MASK)) 1000 - flags = kmalloc_fix_flags(flags); 1001 - 1002 - flags |= __GFP_COMP; 1003 - page = alloc_pages_node(node, flags, order); 1004 - if (page) { 1005 - ptr = page_address(page); 1006 - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, 1007 - PAGE_SIZE << order); 1008 - } 1009 - 1010 - ptr = kasan_kmalloc_large(ptr, size, flags); 1011 - /* As ptr might get tagged, call kmemleak hook after KASAN. 
*/ 1012 - kmemleak_alloc(ptr, size, 1, flags); 1013 - kmsan_kmalloc_large(ptr, size, flags); 1014 - 1015 - return ptr; 1016 - } 1017 - 1018 - void *kmalloc_large(size_t size, gfp_t flags) 1019 - { 1020 - void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); 1021 - 1022 - trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), 1023 - flags, NUMA_NO_NODE); 1024 - return ret; 1025 - } 1026 - EXPORT_SYMBOL(kmalloc_large); 1027 - 1028 - void *kmalloc_large_node(size_t size, gfp_t flags, int node) 1029 - { 1030 - void *ret = __kmalloc_large_node(size, flags, node); 1031 - 1032 - trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), 1033 - flags, node); 1034 - return ret; 1035 - } 1036 - EXPORT_SYMBOL(kmalloc_large_node); 1037 1131 1038 1132 #ifdef CONFIG_SLAB_FREELIST_RANDOM 1039 1133 /* Randomize a generic freelist */ ··· 1026 1222 } 1027 1223 #endif /* CONFIG_SLAB_FREELIST_RANDOM */ 1028 1224 1029 - #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) 1030 - #ifdef CONFIG_SLAB 1031 - #define SLABINFO_RIGHTS (0600) 1032 - #else 1225 + #ifdef CONFIG_SLUB_DEBUG 1033 1226 #define SLABINFO_RIGHTS (0400) 1034 - #endif 1035 1227 1036 1228 static void print_slabinfo_header(struct seq_file *m) 1037 1229 { ··· 1035 1235 * Output format version, so at least we can change it 1036 1236 * without _too_ many complaints. 
1037 1237 */ 1038 - #ifdef CONFIG_DEBUG_SLAB 1039 - seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 1040 - #else 1041 1238 seq_puts(m, "slabinfo - version: 2.1\n"); 1042 - #endif 1043 1239 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); 1044 1240 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 1045 1241 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 1046 - #ifdef CONFIG_DEBUG_SLAB 1047 - seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); 1048 - seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 1049 - #endif 1050 1242 seq_putc(m, '\n'); 1051 1243 } 1052 1244 ··· 1162 1370 } 1163 1371 module_init(slab_proc_init); 1164 1372 1165 - #endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */ 1373 + #endif /* CONFIG_SLUB_DEBUG */ 1166 1374 1167 1375 static __always_inline __realloc_size(2) void * 1168 1376 __do_krealloc(const void *p, size_t new_size, gfp_t flags) ··· 1280 1488 EXPORT_TRACEPOINT_SYMBOL(kfree); 1281 1489 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); 1282 1490 1283 - int should_failslab(struct kmem_cache *s, gfp_t gfpflags) 1284 - { 1285 - if (__should_failslab(s, gfpflags)) 1286 - return -ENOMEM; 1287 - return 0; 1288 - } 1289 - ALLOW_ERROR_INJECTION(should_failslab, ERRNO);
+869 -272
mm/slub.c
··· 34 34 #include <linux/memory.h> 35 35 #include <linux/math64.h> 36 36 #include <linux/fault-inject.h> 37 + #include <linux/kmemleak.h> 37 38 #include <linux/stacktrace.h> 38 39 #include <linux/prefetch.h> 39 40 #include <linux/memcontrol.h> ··· 77 76 * 78 77 * Frozen slabs 79 78 * 80 - * If a slab is frozen then it is exempt from list management. It is not 81 - * on any list except per cpu partial list. The processor that froze the 79 + * If a slab is frozen then it is exempt from list management. It is 80 + * the cpu slab which is actively allocated from by the processor that 81 + * froze it and it is not on any list. The processor that froze the 82 82 * slab is the one who can perform list operations on the slab. Other 83 83 * processors may put objects onto the freelist but the processor that 84 84 * froze the slab is the only one that can retrieve the objects from the 85 85 * slab's freelist. 86 + * 87 + * CPU partial slabs 88 + * 89 + * The partially empty slabs cached on the CPU partial list are used 90 + * for performance reasons, which speeds up the allocation process. 91 + * These slabs are not frozen, but are also exempt from list management, 92 + * by clearing the PG_workingset flag when moving out of the node 93 + * partial list. Please see __slab_free() for more details. 
94 + * 95 + * To sum up, the current scheme is: 96 + * - node partial slab: PG_Workingset && !frozen 97 + * - cpu partial slab: !PG_Workingset && !frozen 98 + * - cpu slab: !PG_Workingset && frozen 99 + * - full slab: !PG_Workingset && !frozen 86 100 * 87 101 * list_lock 88 102 * ··· 220 204 221 205 /* Structure holding parameters for get_partial() call chain */ 222 206 struct partial_context { 223 - struct slab **slab; 224 207 gfp_t flags; 225 208 unsigned int orig_size; 209 + void *object; 226 210 }; 227 211 228 212 static inline bool kmem_cache_debug(struct kmem_cache *s) ··· 346 330 static inline void debugfs_slab_add(struct kmem_cache *s) { } 347 331 #endif 348 332 333 + enum stat_item { 334 + ALLOC_FASTPATH, /* Allocation from cpu slab */ 335 + ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ 336 + FREE_FASTPATH, /* Free to cpu slab */ 337 + FREE_SLOWPATH, /* Freeing not to cpu slab */ 338 + FREE_FROZEN, /* Freeing to frozen slab */ 339 + FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ 340 + FREE_REMOVE_PARTIAL, /* Freeing removes last object */ 341 + ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ 342 + ALLOC_SLAB, /* Cpu slab acquired from page allocator */ 343 + ALLOC_REFILL, /* Refill cpu slab from slab freelist */ 344 + ALLOC_NODE_MISMATCH, /* Switching cpu slab */ 345 + FREE_SLAB, /* Slab freed to the page allocator */ 346 + CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ 347 + DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ 348 + DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ 349 + DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ 350 + DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ 351 + DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ 352 + DEACTIVATE_BYPASS, /* Implicit deactivation */ 353 + ORDER_FALLBACK, /* Number of times fallback was necessary */ 354 + CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */ 355 + 
CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */ 356 + CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ 357 + CPU_PARTIAL_FREE, /* Refill cpu partial on free */ 358 + CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ 359 + CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ 360 + NR_SLUB_STAT_ITEMS 361 + }; 362 + 363 + #ifndef CONFIG_SLUB_TINY 364 + /* 365 + * When changing the layout, make sure freelist and tid are still compatible 366 + * with this_cpu_cmpxchg_double() alignment requirements. 367 + */ 368 + struct kmem_cache_cpu { 369 + union { 370 + struct { 371 + void **freelist; /* Pointer to next available object */ 372 + unsigned long tid; /* Globally unique transaction id */ 373 + }; 374 + freelist_aba_t freelist_tid; 375 + }; 376 + struct slab *slab; /* The slab from which we are allocating */ 377 + #ifdef CONFIG_SLUB_CPU_PARTIAL 378 + struct slab *partial; /* Partially allocated frozen slabs */ 379 + #endif 380 + local_lock_t lock; /* Protects the fields above */ 381 + #ifdef CONFIG_SLUB_STATS 382 + unsigned int stat[NR_SLUB_STAT_ITEMS]; 383 + #endif 384 + }; 385 + #endif /* CONFIG_SLUB_TINY */ 386 + 349 387 static inline void stat(const struct kmem_cache *s, enum stat_item si) 350 388 { 351 389 #ifdef CONFIG_SLUB_STATS ··· 410 340 raw_cpu_inc(s->cpu_slab->stat[si]); 411 341 #endif 412 342 } 343 + 344 + static inline 345 + void stat_add(const struct kmem_cache *s, enum stat_item si, int v) 346 + { 347 + #ifdef CONFIG_SLUB_STATS 348 + raw_cpu_add(s->cpu_slab->stat[si], v); 349 + #endif 350 + } 351 + 352 + /* 353 + * The slab lists for all objects. 
354 + */ 355 + struct kmem_cache_node { 356 + spinlock_t list_lock; 357 + unsigned long nr_partial; 358 + struct list_head partial; 359 + #ifdef CONFIG_SLUB_DEBUG 360 + atomic_long_t nr_slabs; 361 + atomic_long_t total_objects; 362 + struct list_head full; 363 + #endif 364 + }; 365 + 366 + static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 367 + { 368 + return s->node[node]; 369 + } 370 + 371 + /* 372 + * Iterator over all nodes. The body will be executed for each node that has 373 + * a kmem_cache_node structure allocated (which is true for all online nodes) 374 + */ 375 + #define for_each_kmem_cache_node(__s, __node, __n) \ 376 + for (__node = 0; __node < nr_node_ids; __node++) \ 377 + if ((__n = get_node(__s, __node))) 413 378 414 379 /* 415 380 * Tracks for which NUMA nodes we have kmem_cache_nodes allocated. ··· 627 522 struct page *page = slab_page(slab); 628 523 629 524 VM_BUG_ON_PAGE(PageTail(page), page); 630 - __bit_spin_unlock(PG_locked, &page->flags); 525 + bit_spin_unlock(PG_locked, &page->flags); 631 526 } 632 527 633 528 static inline bool ··· 1864 1759 #endif 1865 1760 #endif /* CONFIG_SLUB_DEBUG */ 1866 1761 1762 + static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) 1763 + { 1764 + return (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1765 + NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; 1766 + } 1767 + 1768 + #ifdef CONFIG_MEMCG_KMEM 1769 + static inline void memcg_free_slab_cgroups(struct slab *slab) 1770 + { 1771 + kfree(slab_objcgs(slab)); 1772 + slab->memcg_data = 0; 1773 + } 1774 + 1775 + static inline size_t obj_full_size(struct kmem_cache *s) 1776 + { 1777 + /* 1778 + * For each accounted object there is an extra space which is used 1779 + * to store obj_cgroup membership. Charge it too. 1780 + */ 1781 + return s->size + sizeof(struct obj_cgroup *); 1782 + } 1783 + 1784 + /* 1785 + * Returns false if the allocation should fail. 
1786 + */ 1787 + static bool __memcg_slab_pre_alloc_hook(struct kmem_cache *s, 1788 + struct list_lru *lru, 1789 + struct obj_cgroup **objcgp, 1790 + size_t objects, gfp_t flags) 1791 + { 1792 + /* 1793 + * The obtained objcg pointer is safe to use within the current scope, 1794 + * defined by current task or set_active_memcg() pair. 1795 + * obj_cgroup_get() is used to get a permanent reference. 1796 + */ 1797 + struct obj_cgroup *objcg = current_obj_cgroup(); 1798 + if (!objcg) 1799 + return true; 1800 + 1801 + if (lru) { 1802 + int ret; 1803 + struct mem_cgroup *memcg; 1804 + 1805 + memcg = get_mem_cgroup_from_objcg(objcg); 1806 + ret = memcg_list_lru_alloc(memcg, lru, flags); 1807 + css_put(&memcg->css); 1808 + 1809 + if (ret) 1810 + return false; 1811 + } 1812 + 1813 + if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) 1814 + return false; 1815 + 1816 + *objcgp = objcg; 1817 + return true; 1818 + } 1819 + 1820 + /* 1821 + * Returns false if the allocation should fail. 
1822 + */ 1823 + static __fastpath_inline 1824 + bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru, 1825 + struct obj_cgroup **objcgp, size_t objects, 1826 + gfp_t flags) 1827 + { 1828 + if (!memcg_kmem_online()) 1829 + return true; 1830 + 1831 + if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))) 1832 + return true; 1833 + 1834 + return likely(__memcg_slab_pre_alloc_hook(s, lru, objcgp, objects, 1835 + flags)); 1836 + } 1837 + 1838 + static void __memcg_slab_post_alloc_hook(struct kmem_cache *s, 1839 + struct obj_cgroup *objcg, 1840 + gfp_t flags, size_t size, 1841 + void **p) 1842 + { 1843 + struct slab *slab; 1844 + unsigned long off; 1845 + size_t i; 1846 + 1847 + flags &= gfp_allowed_mask; 1848 + 1849 + for (i = 0; i < size; i++) { 1850 + if (likely(p[i])) { 1851 + slab = virt_to_slab(p[i]); 1852 + 1853 + if (!slab_objcgs(slab) && 1854 + memcg_alloc_slab_cgroups(slab, s, flags, false)) { 1855 + obj_cgroup_uncharge(objcg, obj_full_size(s)); 1856 + continue; 1857 + } 1858 + 1859 + off = obj_to_index(s, slab, p[i]); 1860 + obj_cgroup_get(objcg); 1861 + slab_objcgs(slab)[off] = objcg; 1862 + mod_objcg_state(objcg, slab_pgdat(slab), 1863 + cache_vmstat_idx(s), obj_full_size(s)); 1864 + } else { 1865 + obj_cgroup_uncharge(objcg, obj_full_size(s)); 1866 + } 1867 + } 1868 + } 1869 + 1870 + static __fastpath_inline 1871 + void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, 1872 + gfp_t flags, size_t size, void **p) 1873 + { 1874 + if (likely(!memcg_kmem_online() || !objcg)) 1875 + return; 1876 + 1877 + return __memcg_slab_post_alloc_hook(s, objcg, flags, size, p); 1878 + } 1879 + 1880 + static void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, 1881 + void **p, int objects, 1882 + struct obj_cgroup **objcgs) 1883 + { 1884 + for (int i = 0; i < objects; i++) { 1885 + struct obj_cgroup *objcg; 1886 + unsigned int off; 1887 + 1888 + off = obj_to_index(s, slab, p[i]); 1889 + objcg = 
objcgs[off]; 1890 + if (!objcg) 1891 + continue; 1892 + 1893 + objcgs[off] = NULL; 1894 + obj_cgroup_uncharge(objcg, obj_full_size(s)); 1895 + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), 1896 + -obj_full_size(s)); 1897 + obj_cgroup_put(objcg); 1898 + } 1899 + } 1900 + 1901 + static __fastpath_inline 1902 + void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, 1903 + int objects) 1904 + { 1905 + struct obj_cgroup **objcgs; 1906 + 1907 + if (!memcg_kmem_online()) 1908 + return; 1909 + 1910 + objcgs = slab_objcgs(slab); 1911 + if (likely(!objcgs)) 1912 + return; 1913 + 1914 + __memcg_slab_free_hook(s, slab, p, objects, objcgs); 1915 + } 1916 + 1917 + static inline 1918 + void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, 1919 + struct obj_cgroup *objcg) 1920 + { 1921 + if (objcg) 1922 + obj_cgroup_uncharge(objcg, objects * obj_full_size(s)); 1923 + } 1924 + #else /* CONFIG_MEMCG_KMEM */ 1925 + static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) 1926 + { 1927 + return NULL; 1928 + } 1929 + 1930 + static inline void memcg_free_slab_cgroups(struct slab *slab) 1931 + { 1932 + } 1933 + 1934 + static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, 1935 + struct list_lru *lru, 1936 + struct obj_cgroup **objcgp, 1937 + size_t objects, gfp_t flags) 1938 + { 1939 + return true; 1940 + } 1941 + 1942 + static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, 1943 + struct obj_cgroup *objcg, 1944 + gfp_t flags, size_t size, 1945 + void **p) 1946 + { 1947 + } 1948 + 1949 + static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, 1950 + void **p, int objects) 1951 + { 1952 + } 1953 + 1954 + static inline 1955 + void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, 1956 + struct obj_cgroup *objcg) 1957 + { 1958 + } 1959 + #endif /* CONFIG_MEMCG_KMEM */ 1960 + 1867 1961 /* 1868 1962 * Hooks for other subsystems that check memory allocations. 
In a typical 1869 1963 * production configuration these hooks all should produce no code at all. 1964 + * 1965 + * Returns true if freeing of the object can proceed, false if its reuse 1966 + * was delayed by KASAN quarantine, or it was returned to KFENCE. 1870 1967 */ 1871 - static __always_inline bool slab_free_hook(struct kmem_cache *s, 1872 - void *x, bool init) 1968 + static __always_inline 1969 + bool slab_free_hook(struct kmem_cache *s, void *x, bool init) 1873 1970 { 1874 1971 kmemleak_free_recursive(x, s->flags); 1875 1972 kmsan_slab_free(s, x); ··· 2086 1779 __kcsan_check_access(x, s->object_size, 2087 1780 KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); 2088 1781 1782 + if (kfence_free(x)) 1783 + return false; 1784 + 2089 1785 /* 2090 1786 * As memory initialization might be integrated into KASAN, 2091 1787 * kasan_slab_free and initialization memset's must be ··· 2097 1787 * The initialization memset's clear the object and the metadata, 2098 1788 * but don't touch the SLAB redzone. 2099 1789 */ 2100 - if (init) { 1790 + if (unlikely(init)) { 2101 1791 int rsize; 2102 1792 2103 1793 if (!kasan_has_integrated_init()) ··· 2107 1797 s->size - s->inuse - rsize); 2108 1798 } 2109 1799 /* KASAN might put x into memory quarantine, delaying its reuse. */ 2110 - return kasan_slab_free(s, x, init); 1800 + return !kasan_slab_free(s, x, init); 2111 1801 } 2112 1802 2113 1803 static inline bool slab_free_freelist_hook(struct kmem_cache *s, ··· 2117 1807 2118 1808 void *object; 2119 1809 void *next = *head; 2120 - void *old_tail = *tail ? 
*tail : *head; 1810 + void *old_tail = *tail; 1811 + bool init; 2121 1812 2122 1813 if (is_kfence_address(next)) { 2123 1814 slab_free_hook(s, next, false); 2124 - return true; 1815 + return false; 2125 1816 } 2126 1817 2127 1818 /* Head and tail of the reconstructed freelist */ 2128 1819 *head = NULL; 2129 1820 *tail = NULL; 2130 1821 1822 + init = slab_want_init_on_free(s); 1823 + 2131 1824 do { 2132 1825 object = next; 2133 1826 next = get_freepointer(s, object); 2134 1827 2135 1828 /* If object's reuse doesn't have to be delayed */ 2136 - if (!slab_free_hook(s, object, slab_want_init_on_free(s))) { 1829 + if (likely(slab_free_hook(s, object, init))) { 2137 1830 /* Move object to the new freelist */ 2138 1831 set_freepointer(s, object, *head); 2139 1832 *head = object; ··· 2150 1837 --(*cnt); 2151 1838 } 2152 1839 } while (object != old_tail); 2153 - 2154 - if (*head == *tail) 2155 - *tail = NULL; 2156 1840 2157 1841 return *head != NULL; 2158 1842 } ··· 2303 1993 } 2304 1994 #endif /* CONFIG_SLAB_FREELIST_RANDOM */ 2305 1995 1996 + static __always_inline void account_slab(struct slab *slab, int order, 1997 + struct kmem_cache *s, gfp_t gfp) 1998 + { 1999 + if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) 2000 + memcg_alloc_slab_cgroups(slab, s, gfp, true); 2001 + 2002 + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), 2003 + PAGE_SIZE << order); 2004 + } 2005 + 2006 + static __always_inline void unaccount_slab(struct slab *slab, int order, 2007 + struct kmem_cache *s) 2008 + { 2009 + if (memcg_kmem_online()) 2010 + memcg_free_slab_cgroups(slab); 2011 + 2012 + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), 2013 + -(PAGE_SIZE << order)); 2014 + } 2015 + 2306 2016 static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 2307 2017 { 2308 2018 struct slab *slab; ··· 2447 2117 } 2448 2118 2449 2119 /* 2120 + * SLUB reuses PG_workingset bit to keep track of whether it's on 2121 + * the per-node partial list. 
2122 + */ 2123 + static inline bool slab_test_node_partial(const struct slab *slab) 2124 + { 2125 + return folio_test_workingset((struct folio *)slab_folio(slab)); 2126 + } 2127 + 2128 + static inline void slab_set_node_partial(struct slab *slab) 2129 + { 2130 + set_bit(PG_workingset, folio_flags(slab_folio(slab), 0)); 2131 + } 2132 + 2133 + static inline void slab_clear_node_partial(struct slab *slab) 2134 + { 2135 + clear_bit(PG_workingset, folio_flags(slab_folio(slab), 0)); 2136 + } 2137 + 2138 + /* 2450 2139 * Management of partially allocated slabs. 2451 2140 */ 2452 2141 static inline void ··· 2476 2127 list_add_tail(&slab->slab_list, &n->partial); 2477 2128 else 2478 2129 list_add(&slab->slab_list, &n->partial); 2130 + slab_set_node_partial(slab); 2479 2131 } 2480 2132 2481 2133 static inline void add_partial(struct kmem_cache_node *n, ··· 2491 2141 { 2492 2142 lockdep_assert_held(&n->list_lock); 2493 2143 list_del(&slab->slab_list); 2144 + slab_clear_node_partial(slab); 2494 2145 n->nr_partial--; 2495 2146 } 2496 2147 2497 2148 /* 2498 - * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a 2149 + * Called only for kmem_cache_debug() caches instead of remove_partial(), with a 2499 2150 * slab from the n->partial list. Remove only a single object from the slab, do 2500 2151 * the alloc_debug_processing() checks and leave the slab on the list, or move 2501 2152 * it to full list if it was the last free object. ··· 2564 2213 return object; 2565 2214 } 2566 2215 2567 - /* 2568 - * Remove slab from the partial list, freeze it and 2569 - * return the pointer to the freelist. 2570 - * 2571 - * Returns a list of objects or NULL if it fails. 
2572 - */ 2573 - static inline void *acquire_slab(struct kmem_cache *s, 2574 - struct kmem_cache_node *n, struct slab *slab, 2575 - int mode) 2576 - { 2577 - void *freelist; 2578 - unsigned long counters; 2579 - struct slab new; 2580 - 2581 - lockdep_assert_held(&n->list_lock); 2582 - 2583 - /* 2584 - * Zap the freelist and set the frozen bit. 2585 - * The old freelist is the list of objects for the 2586 - * per cpu allocation list. 2587 - */ 2588 - freelist = slab->freelist; 2589 - counters = slab->counters; 2590 - new.counters = counters; 2591 - if (mode) { 2592 - new.inuse = slab->objects; 2593 - new.freelist = NULL; 2594 - } else { 2595 - new.freelist = freelist; 2596 - } 2597 - 2598 - VM_BUG_ON(new.frozen); 2599 - new.frozen = 1; 2600 - 2601 - if (!__slab_update_freelist(s, slab, 2602 - freelist, counters, 2603 - new.freelist, new.counters, 2604 - "acquire_slab")) 2605 - return NULL; 2606 - 2607 - remove_partial(n, slab); 2608 - WARN_ON(!freelist); 2609 - return freelist; 2610 - } 2611 - 2612 2216 #ifdef CONFIG_SLUB_CPU_PARTIAL 2613 2217 static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); 2614 2218 #else ··· 2575 2269 /* 2576 2270 * Try to allocate a partial slab from a specific node. 
2577 2271 */ 2578 - static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, 2579 - struct partial_context *pc) 2272 + static struct slab *get_partial_node(struct kmem_cache *s, 2273 + struct kmem_cache_node *n, 2274 + struct partial_context *pc) 2580 2275 { 2581 - struct slab *slab, *slab2; 2582 - void *object = NULL; 2276 + struct slab *slab, *slab2, *partial = NULL; 2583 2277 unsigned long flags; 2584 2278 unsigned int partial_slabs = 0; 2585 2279 ··· 2594 2288 2595 2289 spin_lock_irqsave(&n->list_lock, flags); 2596 2290 list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { 2597 - void *t; 2598 - 2599 2291 if (!pfmemalloc_match(slab, pc->flags)) 2600 2292 continue; 2601 2293 2602 2294 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 2603 - object = alloc_single_from_partial(s, n, slab, 2295 + void *object = alloc_single_from_partial(s, n, slab, 2604 2296 pc->orig_size); 2605 - if (object) 2297 + if (object) { 2298 + partial = slab; 2299 + pc->object = object; 2606 2300 break; 2301 + } 2607 2302 continue; 2608 2303 } 2609 2304 2610 - t = acquire_slab(s, n, slab, object == NULL); 2611 - if (!t) 2612 - break; 2305 + remove_partial(n, slab); 2613 2306 2614 - if (!object) { 2615 - *pc->slab = slab; 2307 + if (!partial) { 2308 + partial = slab; 2616 2309 stat(s, ALLOC_FROM_PARTIAL); 2617 - object = t; 2618 2310 } else { 2619 2311 put_cpu_partial(s, slab, 0); 2620 2312 stat(s, CPU_PARTIAL_NODE); ··· 2628 2324 2629 2325 } 2630 2326 spin_unlock_irqrestore(&n->list_lock, flags); 2631 - return object; 2327 + return partial; 2632 2328 } 2633 2329 2634 2330 /* 2635 2331 * Get a slab from somewhere. Search in increasing NUMA distances. 
2636 2332 */ 2637 - static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) 2333 + static struct slab *get_any_partial(struct kmem_cache *s, 2334 + struct partial_context *pc) 2638 2335 { 2639 2336 #ifdef CONFIG_NUMA 2640 2337 struct zonelist *zonelist; 2641 2338 struct zoneref *z; 2642 2339 struct zone *zone; 2643 2340 enum zone_type highest_zoneidx = gfp_zone(pc->flags); 2644 - void *object; 2341 + struct slab *slab; 2645 2342 unsigned int cpuset_mems_cookie; 2646 2343 2647 2344 /* ··· 2677 2372 2678 2373 if (n && cpuset_zone_allowed(zone, pc->flags) && 2679 2374 n->nr_partial > s->min_partial) { 2680 - object = get_partial_node(s, n, pc); 2681 - if (object) { 2375 + slab = get_partial_node(s, n, pc); 2376 + if (slab) { 2682 2377 /* 2683 2378 * Don't check read_mems_allowed_retry() 2684 2379 * here - if mems_allowed was updated in ··· 2686 2381 * between allocation and the cpuset 2687 2382 * update 2688 2383 */ 2689 - return object; 2384 + return slab; 2690 2385 } 2691 2386 } 2692 2387 } ··· 2698 2393 /* 2699 2394 * Get a partial slab, lock it and return it. 
2700 2395 */ 2701 - static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc) 2396 + static struct slab *get_partial(struct kmem_cache *s, int node, 2397 + struct partial_context *pc) 2702 2398 { 2703 - void *object; 2399 + struct slab *slab; 2704 2400 int searchnode = node; 2705 2401 2706 2402 if (node == NUMA_NO_NODE) 2707 2403 searchnode = numa_mem_id(); 2708 2404 2709 - object = get_partial_node(s, get_node(s, searchnode), pc); 2710 - if (object || node != NUMA_NO_NODE) 2711 - return object; 2405 + slab = get_partial_node(s, get_node(s, searchnode), pc); 2406 + if (slab || node != NUMA_NO_NODE) 2407 + return slab; 2712 2408 2713 2409 return get_any_partial(s, pc); 2714 2410 } ··· 2798 2492 static void deactivate_slab(struct kmem_cache *s, struct slab *slab, 2799 2493 void *freelist) 2800 2494 { 2801 - enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST }; 2802 2495 struct kmem_cache_node *n = get_node(s, slab_nid(slab)); 2803 2496 int free_delta = 0; 2804 - enum slab_modes mode = M_NONE; 2805 2497 void *nextfree, *freelist_iter, *freelist_tail; 2806 2498 int tail = DEACTIVATE_TO_HEAD; 2807 2499 unsigned long flags = 0; ··· 2837 2533 /* 2838 2534 * Stage two: Unfreeze the slab while splicing the per-cpu 2839 2535 * freelist to the head of slab's freelist. 2840 - * 2841 - * Ensure that the slab is unfrozen while the list presence 2842 - * reflects the actual number of objects during unfreeze. 2843 - * 2844 - * We first perform cmpxchg holding lock and insert to list 2845 - * when it succeed. If there is mismatch then the slab is not 2846 - * unfrozen and number of objects in the slab may have changed. 2847 - * Then release lock and retry cmpxchg again. 
2848 2536 */ 2849 - redo: 2537 + do { 2538 + old.freelist = READ_ONCE(slab->freelist); 2539 + old.counters = READ_ONCE(slab->counters); 2540 + VM_BUG_ON(!old.frozen); 2850 2541 2851 - old.freelist = READ_ONCE(slab->freelist); 2852 - old.counters = READ_ONCE(slab->counters); 2853 - VM_BUG_ON(!old.frozen); 2542 + /* Determine target state of the slab */ 2543 + new.counters = old.counters; 2544 + new.frozen = 0; 2545 + if (freelist_tail) { 2546 + new.inuse -= free_delta; 2547 + set_freepointer(s, freelist_tail, old.freelist); 2548 + new.freelist = freelist; 2549 + } else { 2550 + new.freelist = old.freelist; 2551 + } 2552 + } while (!slab_update_freelist(s, slab, 2553 + old.freelist, old.counters, 2554 + new.freelist, new.counters, 2555 + "unfreezing slab")); 2854 2556 2855 - /* Determine target state of the slab */ 2856 - new.counters = old.counters; 2857 - if (freelist_tail) { 2858 - new.inuse -= free_delta; 2859 - set_freepointer(s, freelist_tail, old.freelist); 2860 - new.freelist = freelist; 2861 - } else 2862 - new.freelist = old.freelist; 2863 - 2864 - new.frozen = 0; 2865 - 2557 + /* 2558 + * Stage three: Manipulate the slab list based on the updated state. 
2559 + */ 2866 2560 if (!new.inuse && n->nr_partial >= s->min_partial) { 2867 - mode = M_FREE; 2868 - } else if (new.freelist) { 2869 - mode = M_PARTIAL; 2870 - /* 2871 - * Taking the spinlock removes the possibility that 2872 - * acquire_slab() will see a slab that is frozen 2873 - */ 2874 - spin_lock_irqsave(&n->list_lock, flags); 2875 - } else { 2876 - mode = M_FULL_NOLIST; 2877 - } 2878 - 2879 - 2880 - if (!slab_update_freelist(s, slab, 2881 - old.freelist, old.counters, 2882 - new.freelist, new.counters, 2883 - "unfreezing slab")) { 2884 - if (mode == M_PARTIAL) 2885 - spin_unlock_irqrestore(&n->list_lock, flags); 2886 - goto redo; 2887 - } 2888 - 2889 - 2890 - if (mode == M_PARTIAL) { 2891 - add_partial(n, slab, tail); 2892 - spin_unlock_irqrestore(&n->list_lock, flags); 2893 - stat(s, tail); 2894 - } else if (mode == M_FREE) { 2895 2561 stat(s, DEACTIVATE_EMPTY); 2896 2562 discard_slab(s, slab); 2897 2563 stat(s, FREE_SLAB); 2898 - } else if (mode == M_FULL_NOLIST) { 2564 + } else if (new.freelist) { 2565 + spin_lock_irqsave(&n->list_lock, flags); 2566 + add_partial(n, slab, tail); 2567 + spin_unlock_irqrestore(&n->list_lock, flags); 2568 + stat(s, tail); 2569 + } else { 2899 2570 stat(s, DEACTIVATE_FULL); 2900 2571 } 2901 2572 } 2902 2573 2903 2574 #ifdef CONFIG_SLUB_CPU_PARTIAL 2904 - static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) 2575 + static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) 2905 2576 { 2906 2577 struct kmem_cache_node *n = NULL, *n2 = NULL; 2907 2578 struct slab *slab, *slab_to_discard = NULL; 2908 2579 unsigned long flags = 0; 2909 2580 2910 2581 while (partial_slab) { 2911 - struct slab new; 2912 - struct slab old; 2913 - 2914 2582 slab = partial_slab; 2915 2583 partial_slab = slab->next; 2916 2584 ··· 2895 2619 spin_lock_irqsave(&n->list_lock, flags); 2896 2620 } 2897 2621 2898 - do { 2899 - 2900 - old.freelist = slab->freelist; 2901 - old.counters = slab->counters; 2902 - 
VM_BUG_ON(!old.frozen); 2903 - 2904 - new.counters = old.counters; 2905 - new.freelist = old.freelist; 2906 - 2907 - new.frozen = 0; 2908 - 2909 - } while (!__slab_update_freelist(s, slab, 2910 - old.freelist, old.counters, 2911 - new.freelist, new.counters, 2912 - "unfreezing slab")); 2913 - 2914 - if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { 2622 + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { 2915 2623 slab->next = slab_to_discard; 2916 2624 slab_to_discard = slab; 2917 2625 } else { ··· 2918 2658 } 2919 2659 2920 2660 /* 2921 - * Unfreeze all the cpu partial slabs. 2661 + * Put all the cpu partial slabs to the node partial list. 2922 2662 */ 2923 - static void unfreeze_partials(struct kmem_cache *s) 2663 + static void put_partials(struct kmem_cache *s) 2924 2664 { 2925 2665 struct slab *partial_slab; 2926 2666 unsigned long flags; ··· 2931 2671 local_unlock_irqrestore(&s->cpu_slab->lock, flags); 2932 2672 2933 2673 if (partial_slab) 2934 - __unfreeze_partials(s, partial_slab); 2674 + __put_partials(s, partial_slab); 2935 2675 } 2936 2676 2937 - static void unfreeze_partials_cpu(struct kmem_cache *s, 2938 - struct kmem_cache_cpu *c) 2677 + static void put_partials_cpu(struct kmem_cache *s, 2678 + struct kmem_cache_cpu *c) 2939 2679 { 2940 2680 struct slab *partial_slab; 2941 2681 ··· 2943 2683 c->partial = NULL; 2944 2684 2945 2685 if (partial_slab) 2946 - __unfreeze_partials(s, partial_slab); 2686 + __put_partials(s, partial_slab); 2947 2687 } 2948 2688 2949 2689 /* 2950 - * Put a slab that was just frozen (in __slab_free|get_partial_node) into a 2951 - * partial slab slot if available. 2690 + * Put a slab into a partial slab slot if available. 2952 2691 * 2953 2692 * If we did not find a slot then simply move all the partials to the 2954 2693 * per node partial list. 
··· 2955 2696 static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) 2956 2697 { 2957 2698 struct slab *oldslab; 2958 - struct slab *slab_to_unfreeze = NULL; 2699 + struct slab *slab_to_put = NULL; 2959 2700 unsigned long flags; 2960 2701 int slabs = 0; 2961 2702 ··· 2970 2711 * per node partial list. Postpone the actual unfreezing 2971 2712 * outside of the critical section. 2972 2713 */ 2973 - slab_to_unfreeze = oldslab; 2714 + slab_to_put = oldslab; 2974 2715 oldslab = NULL; 2975 2716 } else { 2976 2717 slabs = oldslab->slabs; ··· 2986 2727 2987 2728 local_unlock_irqrestore(&s->cpu_slab->lock, flags); 2988 2729 2989 - if (slab_to_unfreeze) { 2990 - __unfreeze_partials(s, slab_to_unfreeze); 2730 + if (slab_to_put) { 2731 + __put_partials(s, slab_to_put); 2991 2732 stat(s, CPU_PARTIAL_DRAIN); 2992 2733 } 2993 2734 } 2994 2735 2995 2736 #else /* CONFIG_SLUB_CPU_PARTIAL */ 2996 2737 2997 - static inline void unfreeze_partials(struct kmem_cache *s) { } 2998 - static inline void unfreeze_partials_cpu(struct kmem_cache *s, 2999 - struct kmem_cache_cpu *c) { } 2738 + static inline void put_partials(struct kmem_cache *s) { } 2739 + static inline void put_partials_cpu(struct kmem_cache *s, 2740 + struct kmem_cache_cpu *c) { } 3000 2741 3001 2742 #endif /* CONFIG_SLUB_CPU_PARTIAL */ 3002 2743 ··· 3038 2779 stat(s, CPUSLAB_FLUSH); 3039 2780 } 3040 2781 3041 - unfreeze_partials_cpu(s, c); 2782 + put_partials_cpu(s, c); 3042 2783 } 3043 2784 3044 2785 struct slub_flush_work { ··· 3066 2807 if (c->slab) 3067 2808 flush_slab(s, c); 3068 2809 3069 - unfreeze_partials(s); 2810 + put_partials(s); 3070 2811 } 3071 2812 3072 2813 static bool has_cpu_slab(int cpu, struct kmem_cache *s) ··· 3333 3074 } 3334 3075 3335 3076 /* 3077 + * Freeze the partial slab and return the pointer to the freelist. 
3078 + */ 3079 + static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) 3080 + { 3081 + struct slab new; 3082 + unsigned long counters; 3083 + void *freelist; 3084 + 3085 + do { 3086 + freelist = slab->freelist; 3087 + counters = slab->counters; 3088 + 3089 + new.counters = counters; 3090 + VM_BUG_ON(new.frozen); 3091 + 3092 + new.inuse = slab->objects; 3093 + new.frozen = 1; 3094 + 3095 + } while (!slab_update_freelist(s, slab, 3096 + freelist, counters, 3097 + NULL, new.counters, 3098 + "freeze_slab")); 3099 + 3100 + return freelist; 3101 + } 3102 + 3103 + /* 3336 3104 * Slow path. The lockless freelist is empty or we need to perform 3337 3105 * debugging duties. 3338 3106 * ··· 3401 3115 node = NUMA_NO_NODE; 3402 3116 goto new_slab; 3403 3117 } 3404 - redo: 3405 3118 3406 3119 if (unlikely(!node_match(slab, node))) { 3407 3120 /* ··· 3476 3191 3477 3192 new_slab: 3478 3193 3479 - if (slub_percpu_partial(c)) { 3194 + #ifdef CONFIG_SLUB_CPU_PARTIAL 3195 + while (slub_percpu_partial(c)) { 3480 3196 local_lock_irqsave(&s->cpu_slab->lock, flags); 3481 3197 if (unlikely(c->slab)) { 3482 3198 local_unlock_irqrestore(&s->cpu_slab->lock, flags); ··· 3489 3203 goto new_objects; 3490 3204 } 3491 3205 3492 - slab = c->slab = slub_percpu_partial(c); 3206 + slab = slub_percpu_partial(c); 3493 3207 slub_set_percpu_partial(c, slab); 3494 3208 local_unlock_irqrestore(&s->cpu_slab->lock, flags); 3495 3209 stat(s, CPU_PARTIAL_ALLOC); 3496 - goto redo; 3210 + 3211 + if (unlikely(!node_match(slab, node) || 3212 + !pfmemalloc_match(slab, gfpflags))) { 3213 + slab->next = NULL; 3214 + __put_partials(s, slab); 3215 + continue; 3216 + } 3217 + 3218 + freelist = freeze_slab(s, slab); 3219 + goto retry_load_slab; 3497 3220 } 3221 + #endif 3498 3222 3499 3223 new_objects: 3500 3224 3501 3225 pc.flags = gfpflags; 3502 - pc.slab = &slab; 3503 3226 pc.orig_size = orig_size; 3504 - freelist = get_partial(s, node, &pc); 3505 - if (freelist) 3506 - goto check_new_slab; 3227 + 
slab = get_partial(s, node, &pc); 3228 + if (slab) { 3229 + if (kmem_cache_debug(s)) { 3230 + freelist = pc.object; 3231 + /* 3232 + * For debug caches here we had to go through 3233 + * alloc_single_from_partial() so just store the 3234 + * tracking info and return the object. 3235 + */ 3236 + if (s->flags & SLAB_STORE_USER) 3237 + set_track(s, freelist, TRACK_ALLOC, addr); 3238 + 3239 + return freelist; 3240 + } 3241 + 3242 + freelist = freeze_slab(s, slab); 3243 + goto retry_load_slab; 3244 + } 3507 3245 3508 3246 slub_put_cpu_ptr(s->cpu_slab); 3509 3247 slab = new_slab(s, gfpflags, node); ··· 3562 3252 slab->frozen = 1; 3563 3253 3564 3254 inc_slabs_node(s, slab_nid(slab), slab->objects); 3565 - 3566 - check_new_slab: 3567 - 3568 - if (kmem_cache_debug(s)) { 3569 - /* 3570 - * For debug caches here we had to go through 3571 - * alloc_single_from_partial() so just store the tracking info 3572 - * and return the object 3573 - */ 3574 - if (s->flags & SLAB_STORE_USER) 3575 - set_track(s, freelist, TRACK_ALLOC, addr); 3576 - 3577 - return freelist; 3578 - } 3579 3255 3580 3256 if (unlikely(!pfmemalloc_match(slab, gfpflags))) { 3581 3257 /* ··· 3705 3409 void *object; 3706 3410 3707 3411 pc.flags = gfpflags; 3708 - pc.slab = &slab; 3709 3412 pc.orig_size = orig_size; 3710 - object = get_partial(s, node, &pc); 3413 + slab = get_partial(s, node, &pc); 3711 3414 3712 - if (object) 3713 - return object; 3415 + if (slab) 3416 + return pc.object; 3714 3417 3715 3418 slab = new_slab(s, gfpflags, node); 3716 3419 if (unlikely(!slab)) { ··· 3735 3440 0, sizeof(void *)); 3736 3441 } 3737 3442 3443 + noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags) 3444 + { 3445 + if (__should_failslab(s, gfpflags)) 3446 + return -ENOMEM; 3447 + return 0; 3448 + } 3449 + ALLOW_ERROR_INJECTION(should_failslab, ERRNO); 3450 + 3451 + static __fastpath_inline 3452 + struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, 3453 + struct list_lru *lru, 3454 + struct obj_cgroup 
**objcgp, 3455 + size_t size, gfp_t flags) 3456 + { 3457 + flags &= gfp_allowed_mask; 3458 + 3459 + might_alloc(flags); 3460 + 3461 + if (unlikely(should_failslab(s, flags))) 3462 + return NULL; 3463 + 3464 + if (unlikely(!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags))) 3465 + return NULL; 3466 + 3467 + return s; 3468 + } 3469 + 3470 + static __fastpath_inline 3471 + void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, 3472 + gfp_t flags, size_t size, void **p, bool init, 3473 + unsigned int orig_size) 3474 + { 3475 + unsigned int zero_size = s->object_size; 3476 + bool kasan_init = init; 3477 + size_t i; 3478 + gfp_t init_flags = flags & gfp_allowed_mask; 3479 + 3480 + /* 3481 + * For kmalloc object, the allocated memory size(object_size) is likely 3482 + * larger than the requested size(orig_size). If redzone check is 3483 + * enabled for the extra space, don't zero it, as it will be redzoned 3484 + * soon. The redzone operation for this extra space could be seen as a 3485 + * replacement of current poisoning under certain debug option, and 3486 + * won't break other sanity checks. 3487 + */ 3488 + if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) && 3489 + (s->flags & SLAB_KMALLOC)) 3490 + zero_size = orig_size; 3491 + 3492 + /* 3493 + * When slub_debug is enabled, avoid memory initialization integrated 3494 + * into KASAN and instead zero out the memory via the memset below with 3495 + * the proper size. Otherwise, KASAN might overwrite SLUB redzones and 3496 + * cause false-positive reports. This does not lead to a performance 3497 + * penalty on production builds, as slub_debug is not intended to be 3498 + * enabled there. 3499 + */ 3500 + if (__slub_debug_enabled()) 3501 + kasan_init = false; 3502 + 3503 + /* 3504 + * As memory initialization might be integrated into KASAN, 3505 + * kasan_slab_alloc and initialization memset must be 3506 + * kept together to avoid discrepancies in behavior. 
3507 + * 3508 + * As p[i] might get tagged, memset and kmemleak hook come after KASAN. 3509 + */ 3510 + for (i = 0; i < size; i++) { 3511 + p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init); 3512 + if (p[i] && init && (!kasan_init || 3513 + !kasan_has_integrated_init())) 3514 + memset(p[i], 0, zero_size); 3515 + kmemleak_alloc_recursive(p[i], s->object_size, 1, 3516 + s->flags, init_flags); 3517 + kmsan_slab_alloc(s, p[i], init_flags); 3518 + } 3519 + 3520 + memcg_slab_post_alloc_hook(s, objcg, flags, size, p); 3521 + } 3522 + 3738 3523 /* 3739 3524 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) 3740 3525 * have the fastpath folded into their functions. So no function call ··· 3833 3458 bool init = false; 3834 3459 3835 3460 s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); 3836 - if (!s) 3461 + if (unlikely(!s)) 3837 3462 return NULL; 3838 3463 3839 3464 object = kfence_alloc(s, orig_size, gfpflags); ··· 3855 3480 return object; 3856 3481 } 3857 3482 3858 - static __fastpath_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, 3859 - gfp_t gfpflags, unsigned long addr, size_t orig_size) 3483 + void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 3860 3484 { 3861 - return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size); 3862 - } 3863 - 3864 - static __fastpath_inline 3865 - void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, 3866 - gfp_t gfpflags) 3867 - { 3868 - void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size); 3485 + void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_, 3486 + s->object_size); 3869 3487 3870 3488 trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); 3871 3489 3872 3490 return ret; 3873 - } 3874 - 3875 - void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 3876 - { 3877 - return __kmem_cache_alloc_lru(s, NULL, gfpflags); 3878 3491 } 3879 3492 EXPORT_SYMBOL(kmem_cache_alloc); 3880 3493 3881 3494 
void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, 3882 3495 gfp_t gfpflags) 3883 3496 { 3884 - return __kmem_cache_alloc_lru(s, lru, gfpflags); 3497 + void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_, 3498 + s->object_size); 3499 + 3500 + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); 3501 + 3502 + return ret; 3885 3503 } 3886 3504 EXPORT_SYMBOL(kmem_cache_alloc_lru); 3887 3505 3888 - void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, 3889 - int node, size_t orig_size, 3890 - unsigned long caller) 3891 - { 3892 - return slab_alloc_node(s, NULL, gfpflags, node, 3893 - caller, orig_size); 3894 - } 3895 - 3506 + /** 3507 + * kmem_cache_alloc_node - Allocate an object on the specified node 3508 + * @s: The cache to allocate from. 3509 + * @gfpflags: See kmalloc(). 3510 + * @node: node number of the target node. 3511 + * 3512 + * Identical to kmem_cache_alloc but it will allocate memory on the given 3513 + * node, which can improve the performance for cpu bound structures. 3514 + * 3515 + * Fallback to other node is possible if __GFP_THISNODE is not set. 3516 + * 3517 + * Return: pointer to the new object or %NULL in case of error 3518 + */ 3896 3519 void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 3897 3520 { 3898 3521 void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); ··· 3900 3527 return ret; 3901 3528 } 3902 3529 EXPORT_SYMBOL(kmem_cache_alloc_node); 3530 + 3531 + /* 3532 + * To avoid unnecessary overhead, we pass through large allocation requests 3533 + * directly to the page allocator. We use __GFP_COMP, because we will need to 3534 + * know the allocation order to free the pages properly in kfree. 
3535 + */ 3536 + static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) 3537 + { 3538 + struct page *page; 3539 + void *ptr = NULL; 3540 + unsigned int order = get_order(size); 3541 + 3542 + if (unlikely(flags & GFP_SLAB_BUG_MASK)) 3543 + flags = kmalloc_fix_flags(flags); 3544 + 3545 + flags |= __GFP_COMP; 3546 + page = alloc_pages_node(node, flags, order); 3547 + if (page) { 3548 + ptr = page_address(page); 3549 + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, 3550 + PAGE_SIZE << order); 3551 + } 3552 + 3553 + ptr = kasan_kmalloc_large(ptr, size, flags); 3554 + /* As ptr might get tagged, call kmemleak hook after KASAN. */ 3555 + kmemleak_alloc(ptr, size, 1, flags); 3556 + kmsan_kmalloc_large(ptr, size, flags); 3557 + 3558 + return ptr; 3559 + } 3560 + 3561 + void *kmalloc_large(size_t size, gfp_t flags) 3562 + { 3563 + void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); 3564 + 3565 + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), 3566 + flags, NUMA_NO_NODE); 3567 + return ret; 3568 + } 3569 + EXPORT_SYMBOL(kmalloc_large); 3570 + 3571 + void *kmalloc_large_node(size_t size, gfp_t flags, int node) 3572 + { 3573 + void *ret = __kmalloc_large_node(size, flags, node); 3574 + 3575 + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), 3576 + flags, node); 3577 + return ret; 3578 + } 3579 + EXPORT_SYMBOL(kmalloc_large_node); 3580 + 3581 + static __always_inline 3582 + void *__do_kmalloc_node(size_t size, gfp_t flags, int node, 3583 + unsigned long caller) 3584 + { 3585 + struct kmem_cache *s; 3586 + void *ret; 3587 + 3588 + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { 3589 + ret = __kmalloc_large_node(size, flags, node); 3590 + trace_kmalloc(caller, ret, size, 3591 + PAGE_SIZE << get_order(size), flags, node); 3592 + return ret; 3593 + } 3594 + 3595 + if (unlikely(!size)) 3596 + return ZERO_SIZE_PTR; 3597 + 3598 + s = kmalloc_slab(size, flags, caller); 3599 + 3600 + ret = slab_alloc_node(s, NULL, flags, node, 
caller, size); 3601 + ret = kasan_kmalloc(s, ret, size, flags); 3602 + trace_kmalloc(caller, ret, size, s->size, flags, node); 3603 + return ret; 3604 + } 3605 + 3606 + void *__kmalloc_node(size_t size, gfp_t flags, int node) 3607 + { 3608 + return __do_kmalloc_node(size, flags, node, _RET_IP_); 3609 + } 3610 + EXPORT_SYMBOL(__kmalloc_node); 3611 + 3612 + void *__kmalloc(size_t size, gfp_t flags) 3613 + { 3614 + return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); 3615 + } 3616 + EXPORT_SYMBOL(__kmalloc); 3617 + 3618 + void *__kmalloc_node_track_caller(size_t size, gfp_t flags, 3619 + int node, unsigned long caller) 3620 + { 3621 + return __do_kmalloc_node(size, flags, node, caller); 3622 + } 3623 + EXPORT_SYMBOL(__kmalloc_node_track_caller); 3624 + 3625 + void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) 3626 + { 3627 + void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, 3628 + _RET_IP_, size); 3629 + 3630 + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); 3631 + 3632 + ret = kasan_kmalloc(s, ret, size, gfpflags); 3633 + return ret; 3634 + } 3635 + EXPORT_SYMBOL(kmalloc_trace); 3636 + 3637 + void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, 3638 + int node, size_t size) 3639 + { 3640 + void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); 3641 + 3642 + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node); 3643 + 3644 + ret = kasan_kmalloc(s, ret, size, gfpflags); 3645 + return ret; 3646 + } 3647 + EXPORT_SYMBOL(kmalloc_node_trace); 3903 3648 3904 3649 static noinline void free_to_partial_list( 3905 3650 struct kmem_cache *s, struct slab *slab, ··· 4099 3608 unsigned long counters; 4100 3609 struct kmem_cache_node *n = NULL; 4101 3610 unsigned long flags; 3611 + bool on_node_partial; 4102 3612 4103 3613 stat(s, FREE_SLOWPATH); 4104 - 4105 - if (kfence_free(head)) 4106 - return; 4107 3614 4108 3615 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4109 3616 
free_to_partial_list(s, slab, head, tail, cnt, addr); ··· 4120 3631 was_frozen = new.frozen; 4121 3632 new.inuse -= cnt; 4122 3633 if ((!new.inuse || !prior) && !was_frozen) { 4123 - 4124 - if (kmem_cache_has_cpu_partial(s) && !prior) { 4125 - 4126 - /* 4127 - * Slab was on no list before and will be 4128 - * partially empty 4129 - * We can defer the list move and instead 4130 - * freeze it. 4131 - */ 4132 - new.frozen = 1; 4133 - 4134 - } else { /* Needs to be taken off a list */ 3634 + /* Needs to be taken off a list */ 3635 + if (!kmem_cache_has_cpu_partial(s) || prior) { 4135 3636 4136 3637 n = get_node(s, slab_nid(slab)); 4137 3638 /* ··· 4134 3655 */ 4135 3656 spin_lock_irqsave(&n->list_lock, flags); 4136 3657 3658 + on_node_partial = slab_test_node_partial(slab); 4137 3659 } 4138 3660 } 4139 3661 ··· 4151 3671 * activity can be necessary. 4152 3672 */ 4153 3673 stat(s, FREE_FROZEN); 4154 - } else if (new.frozen) { 3674 + } else if (kmem_cache_has_cpu_partial(s) && !prior) { 4155 3675 /* 4156 - * If we just froze the slab then put it onto the 3676 + * If we started with a full slab then put it onto the 4157 3677 * per cpu partial list. 4158 3678 */ 4159 3679 put_cpu_partial(s, slab, 1); 4160 3680 stat(s, CPU_PARTIAL_FREE); 4161 3681 } 4162 3682 3683 + return; 3684 + } 3685 + 3686 + /* 3687 + * This slab was partially empty but not on the per-node partial list, 3688 + * in which case we shouldn't manipulate its list, just return. 3689 + */ 3690 + if (prior && !on_node_partial) { 3691 + spin_unlock_irqrestore(&n->list_lock, flags); 4163 3692 return; 4164 3693 } 4165 3694 ··· 4224 3735 struct slab *slab, void *head, void *tail, 4225 3736 int cnt, unsigned long addr) 4226 3737 { 4227 - void *tail_obj = tail ? 
: head; 4228 3738 struct kmem_cache_cpu *c; 4229 3739 unsigned long tid; 4230 3740 void **freelist; ··· 4242 3754 barrier(); 4243 3755 4244 3756 if (unlikely(slab != c->slab)) { 4245 - __slab_free(s, slab, head, tail_obj, cnt, addr); 3757 + __slab_free(s, slab, head, tail, cnt, addr); 4246 3758 return; 4247 3759 } 4248 3760 4249 3761 if (USE_LOCKLESS_FAST_PATH()) { 4250 3762 freelist = READ_ONCE(c->freelist); 4251 3763 4252 - set_freepointer(s, tail_obj, freelist); 3764 + set_freepointer(s, tail, freelist); 4253 3765 4254 3766 if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { 4255 3767 note_cmpxchg_failure("slab_free", s, tid); ··· 4266 3778 tid = c->tid; 4267 3779 freelist = c->freelist; 4268 3780 4269 - set_freepointer(s, tail_obj, freelist); 3781 + set_freepointer(s, tail, freelist); 4270 3782 c->freelist = head; 4271 3783 c->tid = next_tid(tid); 4272 3784 4273 3785 local_unlock(&s->cpu_slab->lock); 4274 3786 } 4275 - stat(s, FREE_FASTPATH); 3787 + stat_add(s, FREE_FASTPATH, cnt); 4276 3788 } 4277 3789 #else /* CONFIG_SLUB_TINY */ 4278 3790 static void do_slab_free(struct kmem_cache *s, 4279 3791 struct slab *slab, void *head, void *tail, 4280 3792 int cnt, unsigned long addr) 4281 3793 { 4282 - void *tail_obj = tail ? 
: head; 4283 - 4284 - __slab_free(s, slab, head, tail_obj, cnt, addr); 3794 + __slab_free(s, slab, head, tail, cnt, addr); 4285 3795 } 4286 3796 #endif /* CONFIG_SLUB_TINY */ 4287 3797 4288 - static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, 4289 - void *head, void *tail, void **p, int cnt, 4290 - unsigned long addr) 3798 + static __fastpath_inline 3799 + void slab_free(struct kmem_cache *s, struct slab *slab, void *object, 3800 + unsigned long addr) 3801 + { 3802 + memcg_slab_free_hook(s, slab, &object, 1); 3803 + 3804 + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) 3805 + do_slab_free(s, slab, object, object, 1, addr); 3806 + } 3807 + 3808 + static __fastpath_inline 3809 + void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, 3810 + void *tail, void **p, int cnt, unsigned long addr) 4291 3811 { 4292 3812 memcg_slab_free_hook(s, slab, p, cnt); 4293 3813 /* 4294 3814 * With KASAN enabled slab_free_freelist_hook modifies the freelist 4295 3815 * to remove objects, whose reuse must be delayed. 
4296 3816 */ 4297 - if (slab_free_freelist_hook(s, &head, &tail, &cnt)) 3817 + if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) 4298 3818 do_slab_free(s, slab, head, tail, cnt, addr); 4299 3819 } 4300 3820 4301 3821 #ifdef CONFIG_KASAN_GENERIC 4302 3822 void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) 4303 3823 { 4304 - do_slab_free(cache, virt_to_slab(x), x, NULL, 1, addr); 3824 + do_slab_free(cache, virt_to_slab(x), x, x, 1, addr); 4305 3825 } 4306 3826 #endif 4307 3827 4308 - void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) 3828 + static inline struct kmem_cache *virt_to_cache(const void *obj) 4309 3829 { 4310 - slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller); 3830 + struct slab *slab; 3831 + 3832 + slab = virt_to_slab(obj); 3833 + if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) 3834 + return NULL; 3835 + return slab->slab_cache; 4311 3836 } 4312 3837 3838 + static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) 3839 + { 3840 + struct kmem_cache *cachep; 3841 + 3842 + if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && 3843 + !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) 3844 + return s; 3845 + 3846 + cachep = virt_to_cache(x); 3847 + if (WARN(cachep && cachep != s, 3848 + "%s: Wrong slab cache. %s but object is from %s\n", 3849 + __func__, s->name, cachep->name)) 3850 + print_tracking(cachep, x); 3851 + return cachep; 3852 + } 3853 + 3854 + /** 3855 + * kmem_cache_free - Deallocate an object 3856 + * @s: The cache the allocation was from. 3857 + * @x: The previously allocated object. 3858 + * 3859 + * Free an object which was previously allocated from this 3860 + * cache. 
3861 + */ 4313 3862 void kmem_cache_free(struct kmem_cache *s, void *x) 4314 3863 { 4315 3864 s = cache_from_obj(s, x); 4316 3865 if (!s) 4317 3866 return; 4318 3867 trace_kmem_cache_free(_RET_IP_, x, s); 4319 - slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_); 3868 + slab_free(s, virt_to_slab(x), x, _RET_IP_); 4320 3869 } 4321 3870 EXPORT_SYMBOL(kmem_cache_free); 3871 + 3872 + static void free_large_kmalloc(struct folio *folio, void *object) 3873 + { 3874 + unsigned int order = folio_order(folio); 3875 + 3876 + if (WARN_ON_ONCE(order == 0)) 3877 + pr_warn_once("object pointer: 0x%p\n", object); 3878 + 3879 + kmemleak_free(object); 3880 + kasan_kfree_large(object); 3881 + kmsan_kfree_large(object); 3882 + 3883 + mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, 3884 + -(PAGE_SIZE << order)); 3885 + __free_pages(folio_page(folio, 0), order); 3886 + } 3887 + 3888 + /** 3889 + * kfree - free previously allocated memory 3890 + * @object: pointer returned by kmalloc() or kmem_cache_alloc() 3891 + * 3892 + * If @object is NULL, no operation is performed. 
3893 + */ 3894 + void kfree(const void *object) 3895 + { 3896 + struct folio *folio; 3897 + struct slab *slab; 3898 + struct kmem_cache *s; 3899 + void *x = (void *)object; 3900 + 3901 + trace_kfree(_RET_IP_, object); 3902 + 3903 + if (unlikely(ZERO_OR_NULL_PTR(object))) 3904 + return; 3905 + 3906 + folio = virt_to_folio(object); 3907 + if (unlikely(!folio_test_slab(folio))) { 3908 + free_large_kmalloc(folio, (void *)object); 3909 + return; 3910 + } 3911 + 3912 + slab = folio_slab(folio); 3913 + s = slab->slab_cache; 3914 + slab_free(s, slab, x, _RET_IP_); 3915 + } 3916 + EXPORT_SYMBOL(kfree); 4322 3917 4323 3918 struct detached_freelist { 4324 3919 struct slab *slab; ··· 4482 3911 return same; 4483 3912 } 4484 3913 3914 + /* 3915 + * Internal bulk free of objects that were not initialised by the post alloc 3916 + * hooks and thus should not be processed by the free hooks 3917 + */ 3918 + static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) 3919 + { 3920 + if (!size) 3921 + return; 3922 + 3923 + do { 3924 + struct detached_freelist df; 3925 + 3926 + size = build_detached_freelist(s, size, p, &df); 3927 + if (!df.slab) 3928 + continue; 3929 + 3930 + do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, 3931 + _RET_IP_); 3932 + } while (likely(size)); 3933 + } 3934 + 4485 3935 /* Note that interrupts must be enabled when calling this function. 
*/ 4486 3936 void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) 4487 3937 { ··· 4516 3924 if (!df.slab) 4517 3925 continue; 4518 3926 4519 - slab_free(df.s, df.slab, df.freelist, df.tail, &p[size], df.cnt, 4520 - _RET_IP_); 3927 + slab_free_bulk(df.s, df.slab, df.freelist, df.tail, &p[size], 3928 + df.cnt, _RET_IP_); 4521 3929 } while (likely(size)); 4522 3930 } 4523 3931 EXPORT_SYMBOL(kmem_cache_free_bulk); 4524 3932 4525 3933 #ifndef CONFIG_SLUB_TINY 4526 - static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 4527 - size_t size, void **p, struct obj_cgroup *objcg) 3934 + static inline 3935 + int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 3936 + void **p) 4528 3937 { 4529 3938 struct kmem_cache_cpu *c; 4530 3939 unsigned long irqflags; ··· 4579 3986 c->freelist = get_freepointer(s, object); 4580 3987 p[i] = object; 4581 3988 maybe_wipe_obj_freeptr(s, p[i]); 3989 + stat(s, ALLOC_FASTPATH); 4582 3990 } 4583 3991 c->tid = next_tid(c->tid); 4584 3992 local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); ··· 4589 3995 4590 3996 error: 4591 3997 slub_put_cpu_ptr(s->cpu_slab); 4592 - slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); 4593 - kmem_cache_free_bulk(s, i, p); 3998 + __kmem_cache_free_bulk(s, i, p); 4594 3999 return 0; 4595 4000 4596 4001 } 4597 4002 #else /* CONFIG_SLUB_TINY */ 4598 4003 static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 4599 - size_t size, void **p, struct obj_cgroup *objcg) 4004 + size_t size, void **p) 4600 4005 { 4601 4006 int i; 4602 4007 ··· 4618 4025 return i; 4619 4026 4620 4027 error: 4621 - slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); 4622 - kmem_cache_free_bulk(s, i, p); 4028 + __kmem_cache_free_bulk(s, i, p); 4623 4029 return 0; 4624 4030 } 4625 4031 #endif /* CONFIG_SLUB_TINY */ ··· 4638 4046 if (unlikely(!s)) 4639 4047 return 0; 4640 4048 4641 - i = __kmem_cache_alloc_bulk(s, flags, size, p, 
objcg); 4049 + i = __kmem_cache_alloc_bulk(s, flags, size, p); 4642 4050 4643 4051 /* 4644 4052 * memcg and kmem_cache debug support and memory initialization. 4645 4053 * Done outside of the IRQ disabled fastpath loop. 4646 4054 */ 4647 - if (i != 0) 4055 + if (likely(i != 0)) { 4648 4056 slab_post_alloc_hook(s, objcg, flags, size, p, 4649 4057 slab_want_init_on_alloc(flags, s), s->object_size); 4058 + } else { 4059 + memcg_slab_alloc_error_hook(s, size, objcg); 4060 + } 4061 + 4650 4062 return i; 4651 4063 } 4652 4064 EXPORT_SYMBOL(kmem_cache_alloc_bulk); ··· 5427 4831 5428 4832 if (free == slab->objects) { 5429 4833 list_move(&slab->slab_list, &discard); 4834 + slab_clear_node_partial(slab); 5430 4835 n->nr_partial--; 5431 4836 dec_slabs_node(s, node, slab->objects); 5432 4837 } else if (free <= SHRINK_PROMOTE_MAX)