mm/slub.c at v6.19 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / mm / slub.c
at v6.19 10116 lines 260 kB view raw
    1// SPDX-License-Identifier: GPL-2.0
    2/*
    3 * SLUB: A slab allocator that limits cache line use instead of queuing
    4 * objects in per cpu and per node lists.
    5 *
    6 * The allocator synchronizes using per slab locks or atomic operations
    7 * and only uses a centralized lock to manage a pool of partial slabs.
    8 *
    9 * (C) 2007 SGI, Christoph Lameter
   10 * (C) 2011 Linux Foundation, Christoph Lameter
   11 */
   12
   13#include <linux/mm.h>
   14#include <linux/swap.h> /* mm_account_reclaimed_pages() */
   15#include <linux/module.h>
   16#include <linux/bit_spinlock.h>
   17#include <linux/interrupt.h>
   18#include <linux/swab.h>
   19#include <linux/bitops.h>
   20#include <linux/slab.h>
   21#include "slab.h"
   22#include <linux/vmalloc.h>
   23#include <linux/proc_fs.h>
   24#include <linux/seq_file.h>
   25#include <linux/kasan.h>
   26#include <linux/node.h>
   27#include <linux/kmsan.h>
   28#include <linux/cpu.h>
   29#include <linux/cpuset.h>
   30#include <linux/mempolicy.h>
   31#include <linux/ctype.h>
   32#include <linux/stackdepot.h>
   33#include <linux/debugobjects.h>
   34#include <linux/kallsyms.h>
   35#include <linux/kfence.h>
   36#include <linux/memory.h>
   37#include <linux/math64.h>
   38#include <linux/fault-inject.h>
   39#include <linux/kmemleak.h>
   40#include <linux/stacktrace.h>
   41#include <linux/prefetch.h>
   42#include <linux/memcontrol.h>
   43#include <linux/random.h>
   44#include <kunit/test.h>
   45#include <kunit/test-bug.h>
   46#include <linux/sort.h>
   47#include <linux/irq_work.h>
   48#include <linux/kprobes.h>
   49#include <linux/debugfs.h>
   50#include <trace/events/kmem.h>
   51
   52#include "internal.h"
   53
   54/*
   55 * Lock order:
   56 *   1. slab_mutex (Global Mutex)
   57 *   2. node->list_lock (Spinlock)
   58 *   3. kmem_cache->cpu_slab->lock (Local lock)
   59 *   4. slab_lock(slab) (Only on some arches)
   60 *   5. object_map_lock (Only for debugging)
   61 *
   62 *   slab_mutex
   63 *
   64 *   The role of the slab_mutex is to protect the list of all the slabs
   65 *   and to synchronize major metadata changes to slab cache structures.
   66 *   Also synchronizes memory hotplug callbacks.
   67 *
   68 *   slab_lock
   69 *
   70 *   The slab_lock is a wrapper around the page lock, thus it is a bit
   71 *   spinlock.
   72 *
   73 *   The slab_lock is only used on arches that do not have the ability
   74 *   to do a cmpxchg_double. It only protects:
   75 *
   76 *	A. slab->freelist	-> List of free objects in a slab
   77 *	B. slab->inuse		-> Number of objects in use
   78 *	C. slab->objects	-> Number of objects in slab
   79 *	D. slab->frozen		-> frozen state
   80 *
   81 *   Frozen slabs
   82 *
   83 *   If a slab is frozen then it is exempt from list management. It is
   84 *   the cpu slab which is actively allocated from by the processor that
   85 *   froze it and it is not on any list. The processor that froze the
   86 *   slab is the one who can perform list operations on the slab. Other
   87 *   processors may put objects onto the freelist but the processor that
   88 *   froze the slab is the only one that can retrieve the objects from the
   89 *   slab's freelist.
   90 *
   91 *   CPU partial slabs
   92 *
   93 *   The partially empty slabs cached on the CPU partial list are used
   94 *   for performance reasons, which speeds up the allocation process.
   95 *   These slabs are not frozen, but are also exempt from list management,
   96 *   by clearing the SL_partial flag when moving out of the node
   97 *   partial list. Please see __slab_free() for more details.
   98 *
   99 *   To sum up, the current scheme is:
  100 *   - node partial slab: SL_partial && !frozen
  101 *   - cpu partial slab: !SL_partial && !frozen
  102 *   - cpu slab: !SL_partial && frozen
  103 *   - full slab: !SL_partial && !frozen
  104 *
  105 *   list_lock
  106 *
  107 *   The list_lock protects the partial and full list on each node and
  108 *   the partial slab counter. If taken then no new slabs may be added or
 *   removed from the lists, nor may the number of partial slabs be modified.
  110 *   (Note that the total number of slabs is an atomic value that may be
  111 *   modified without taking the list lock).
  112 *
  113 *   The list_lock is a centralized lock and thus we avoid taking it as
  114 *   much as possible. As long as SLUB does not have to handle partial
  115 *   slabs, operations can continue without any centralized lock. F.e.
  116 *   allocating a long series of objects that fill up slabs does not require
  117 *   the list lock.
  118 *
  119 *   For debug caches, all allocations are forced to go through a list_lock
  120 *   protected region to serialize against concurrent validation.
  121 *
  122 *   cpu_slab->lock local lock
  123 *
 *   This lock protects slowpath manipulation of all kmem_cache_cpu fields
  125 *   except the stat counters. This is a percpu structure manipulated only by
  126 *   the local cpu, so the lock protects against being preempted or interrupted
  127 *   by an irq. Fast path operations rely on lockless operations instead.
  128 *
  129 *   On PREEMPT_RT, the local lock neither disables interrupts nor preemption
  130 *   which means the lockless fastpath cannot be used as it might interfere with
 *   an in-progress slow path operation. In this case the local lock is always
  132 *   taken but it still utilizes the freelist for the common operations.
  133 *
  134 *   lockless fastpaths
  135 *
  136 *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
  137 *   are fully lockless when satisfied from the percpu slab (and when
  138 *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
  139 *   They also don't disable preemption or migration or irqs. They rely on
  140 *   the transaction id (tid) field to detect being preempted or moved to
  141 *   another cpu.
  142 *
  143 *   irq, preemption, migration considerations
  144 *
  145 *   Interrupts are disabled as part of list_lock or local_lock operations, or
  146 *   around the slab_lock operation, in order to make the slab allocator safe
  147 *   to use in the context of an irq.
  148 *
  149 *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
  150 *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
  151 *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
  152 *   doesn't have to be revalidated in each section protected by the local lock.
  153 *
  154 * SLUB assigns one slab for allocation to each processor.
  155 * Allocations only occur from these slabs called cpu slabs.
  156 *
  157 * Slabs with free elements are kept on a partial list and during regular
  158 * operations no list for full slabs is used. If an object in a full slab is
  159 * freed then the slab will show up again on the partial lists.
  160 * We track full slabs for debugging purposes though because otherwise we
  161 * cannot scan all objects.
  162 *
  163 * Slabs are freed when they become empty. Teardown and setup is
  164 * minimal so we rely on the page allocators per cpu caches for
  165 * fast frees and allocs.
  166 *
  167 * slab->frozen		The slab is frozen and exempt from list processing.
  168 * 			This means that the slab is dedicated to a purpose
  169 * 			such as satisfying allocations for a specific
  170 * 			processor. Objects may be freed in the slab while
  171 * 			it is frozen but slab_free will then skip the usual
  172 * 			list operations. It is up to the processor holding
  173 * 			the slab to integrate the slab into the slab lists
  174 * 			when the slab is no longer needed.
  175 *
  176 * 			One use of this flag is to mark slabs that are
  177 * 			used for allocations. Then such a slab becomes a cpu
  178 * 			slab. The cpu slab may be equipped with an additional
  179 * 			freelist that allows lockless access to
  180 * 			free objects in addition to the regular freelist
  181 * 			that requires the slab lock.
  182 *
  183 * SLAB_DEBUG_FLAGS	Slab requires special handling due to debug
  184 * 			options set. This moves	slab handling out of
  185 * 			the fast path and disables lockless freelists.
  186 */
  187
  188/**
  189 * enum slab_flags - How the slab flags bits are used.
  190 * @SL_locked: Is locked with slab_lock()
  191 * @SL_partial: On the per-node partial list
  192 * @SL_pfmemalloc: Was allocated from PF_MEMALLOC reserves
  193 *
  194 * The slab flags share space with the page flags but some bits have
  195 * different interpretations.  The high bits are used for information
  196 * like zone/node/section.
  197 */
  198enum slab_flags {
  199	SL_locked = PG_locked,
  200	SL_partial = PG_workingset,	/* Historical reasons for this bit */
  201	SL_pfmemalloc = PG_active,	/* Historical reasons for this bit */
  202};
  203
  204/*
  205 * We could simply use migrate_disable()/enable() but as long as it's a
  206 * function call even on !PREEMPT_RT, use inline preempt_disable() there.
  207 */
  208#ifndef CONFIG_PREEMPT_RT
  209#define slub_get_cpu_ptr(var)		get_cpu_ptr(var)
  210#define slub_put_cpu_ptr(var)		put_cpu_ptr(var)
  211#define USE_LOCKLESS_FAST_PATH()	(true)
  212#else
  213#define slub_get_cpu_ptr(var)		\
  214({					\
  215	migrate_disable();		\
  216	this_cpu_ptr(var);		\
  217})
  218#define slub_put_cpu_ptr(var)		\
  219do {					\
  220	(void)(var);			\
  221	migrate_enable();		\
  222} while (0)
  223#define USE_LOCKLESS_FAST_PATH()	(false)
  224#endif
  225
  226#ifndef CONFIG_SLUB_TINY
  227#define __fastpath_inline __always_inline
  228#else
  229#define __fastpath_inline
  230#endif
  231
  232#ifdef CONFIG_SLUB_DEBUG
  233#ifdef CONFIG_SLUB_DEBUG_ON
  234DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
  235#else
  236DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
  237#endif
  238#endif		/* CONFIG_SLUB_DEBUG */
  239
  240#ifdef CONFIG_NUMA
  241static DEFINE_STATIC_KEY_FALSE(strict_numa);
  242#endif
  243
  244/* Structure holding parameters for get_partial() call chain */
  245struct partial_context {
  246	gfp_t flags;
  247	unsigned int orig_size;
  248	void *object;
  249};
  250
  251static inline bool kmem_cache_debug(struct kmem_cache *s)
  252{
  253	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
  254}
  255
  256void *fixup_red_left(struct kmem_cache *s, void *p)
  257{
  258	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
  259		p += s->red_left_pad;
  260
  261	return p;
  262}
  263
  264static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
  265{
  266#ifdef CONFIG_SLUB_CPU_PARTIAL
  267	return !kmem_cache_debug(s);
  268#else
  269	return false;
  270#endif
  271}
  272
  273/*
  274 * Issues still to be resolved:
  275 *
  276 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
  277 *
  278 * - Variable sizing of the per node arrays
  279 */
  280
  281/* Enable to log cmpxchg failures */
  282#undef SLUB_DEBUG_CMPXCHG
  283
  284#ifndef CONFIG_SLUB_TINY
  285/*
  286 * Minimum number of partial slabs. These will be left on the partial
  287 * lists even if they are empty. kmem_cache_shrink may reclaim them.
  288 */
  289#define MIN_PARTIAL 5
  290
  291/*
  292 * Maximum number of desirable partial slabs.
  293 * The existence of more partial slabs makes kmem_cache_shrink
  294 * sort the partial list by the number of objects in use.
  295 */
  296#define MAX_PARTIAL 10
  297#else
  298#define MIN_PARTIAL 0
  299#define MAX_PARTIAL 0
  300#endif
  301
  302#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
  303				SLAB_POISON | SLAB_STORE_USER)
  304
  305/*
  306 * These debug flags cannot use CMPXCHG because there might be consistency
  307 * issues when checking or reading debug information
  308 */
  309#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
  310				SLAB_TRACE)
  311
  312
  313/*
  314 * Debugging flags that require metadata to be stored in the slab.  These get
  315 * disabled when slab_debug=O is used and a cache's min order increases with
  316 * metadata.
  317 */
  318#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
  319
  320#define OO_SHIFT	16
  321#define OO_MASK		((1 << OO_SHIFT) - 1)
  322#define MAX_OBJS_PER_PAGE	32767 /* since slab.objects is u15 */
  323
  324/* Internal SLUB flags */
  325/* Poison object */
  326#define __OBJECT_POISON		__SLAB_FLAG_BIT(_SLAB_OBJECT_POISON)
  327/* Use cmpxchg_double */
  328
  329#ifdef system_has_freelist_aba
  330#define __CMPXCHG_DOUBLE	__SLAB_FLAG_BIT(_SLAB_CMPXCHG_DOUBLE)
  331#else
  332#define __CMPXCHG_DOUBLE	__SLAB_FLAG_UNUSED
  333#endif
  334
  335/*
  336 * Tracking user of a slab.
  337 */
  338#define TRACK_ADDRS_COUNT 16
  339struct track {
  340	unsigned long addr;	/* Called from address */
  341#ifdef CONFIG_STACKDEPOT
  342	depot_stack_handle_t handle;
  343#endif
  344	int cpu;		/* Was running on cpu */
  345	int pid;		/* Pid context */
  346	unsigned long when;	/* When did the operation occur */
  347};
  348
  349enum track_item { TRACK_ALLOC, TRACK_FREE };
  350
  351#ifdef SLAB_SUPPORTS_SYSFS
  352static int sysfs_slab_add(struct kmem_cache *);
  353static int sysfs_slab_alias(struct kmem_cache *, const char *);
  354#else
  355static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
  356static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
  357							{ return 0; }
  358#endif
  359
  360#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
  361static void debugfs_slab_add(struct kmem_cache *);
  362#else
  363static inline void debugfs_slab_add(struct kmem_cache *s) { }
  364#endif
  365
  366enum stat_item {
  367	ALLOC_PCS,		/* Allocation from percpu sheaf */
  368	ALLOC_FASTPATH,		/* Allocation from cpu slab */
  369	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
  370	FREE_PCS,		/* Free to percpu sheaf */
  371	FREE_RCU_SHEAF,		/* Free to rcu_free sheaf */
  372	FREE_RCU_SHEAF_FAIL,	/* Failed to free to a rcu_free sheaf */
  373	FREE_FASTPATH,		/* Free to cpu slab */
  374	FREE_SLOWPATH,		/* Freeing not to cpu slab */
  375	FREE_FROZEN,		/* Freeing to frozen slab */
  376	FREE_ADD_PARTIAL,	/* Freeing moves slab to partial list */
  377	FREE_REMOVE_PARTIAL,	/* Freeing removes last object */
  378	ALLOC_FROM_PARTIAL,	/* Cpu slab acquired from node partial list */
  379	ALLOC_SLAB,		/* Cpu slab acquired from page allocator */
  380	ALLOC_REFILL,		/* Refill cpu slab from slab freelist */
  381	ALLOC_NODE_MISMATCH,	/* Switching cpu slab */
  382	FREE_SLAB,		/* Slab freed to the page allocator */
  383	CPUSLAB_FLUSH,		/* Abandoning of the cpu slab */
  384	DEACTIVATE_FULL,	/* Cpu slab was full when deactivated */
  385	DEACTIVATE_EMPTY,	/* Cpu slab was empty when deactivated */
  386	DEACTIVATE_TO_HEAD,	/* Cpu slab was moved to the head of partials */
  387	DEACTIVATE_TO_TAIL,	/* Cpu slab was moved to the tail of partials */
  388	DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
  389	DEACTIVATE_BYPASS,	/* Implicit deactivation */
  390	ORDER_FALLBACK,		/* Number of times fallback was necessary */
  391	CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */
  392	CMPXCHG_DOUBLE_FAIL,	/* Failures of slab freelist update */
  393	CPU_PARTIAL_ALLOC,	/* Used cpu partial on alloc */
  394	CPU_PARTIAL_FREE,	/* Refill cpu partial on free */
  395	CPU_PARTIAL_NODE,	/* Refill cpu partial from node partial */
  396	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
  397	SHEAF_FLUSH,		/* Objects flushed from a sheaf */
  398	SHEAF_REFILL,		/* Objects refilled to a sheaf */
  399	SHEAF_ALLOC,		/* Allocation of an empty sheaf */
  400	SHEAF_FREE,		/* Freeing of an empty sheaf */
  401	BARN_GET,		/* Got full sheaf from barn */
  402	BARN_GET_FAIL,		/* Failed to get full sheaf from barn */
  403	BARN_PUT,		/* Put full sheaf to barn */
  404	BARN_PUT_FAIL,		/* Failed to put full sheaf to barn */
  405	SHEAF_PREFILL_FAST,	/* Sheaf prefill grabbed the spare sheaf */
  406	SHEAF_PREFILL_SLOW,	/* Sheaf prefill found no spare sheaf */
  407	SHEAF_PREFILL_OVERSIZE,	/* Allocation of oversize sheaf for prefill */
  408	SHEAF_RETURN_FAST,	/* Sheaf return reattached spare sheaf */
  409	SHEAF_RETURN_SLOW,	/* Sheaf return could not reattach spare */
  410	NR_SLUB_STAT_ITEMS
  411};
  412
  413struct freelist_tid {
  414	union {
  415		struct {
  416			void *freelist;		/* Pointer to next available object */
  417			unsigned long tid;	/* Globally unique transaction id */
  418		};
  419		freelist_full_t freelist_tid;
  420	};
  421};
  422
  423/*
  424 * When changing the layout, make sure freelist and tid are still compatible
  425 * with this_cpu_cmpxchg_double() alignment requirements.
  426 */
  427struct kmem_cache_cpu {
  428	struct freelist_tid;
  429	struct slab *slab;	/* The slab from which we are allocating */
  430#ifdef CONFIG_SLUB_CPU_PARTIAL
  431	struct slab *partial;	/* Partially allocated slabs */
  432#endif
  433	local_trylock_t lock;	/* Protects the fields above */
  434#ifdef CONFIG_SLUB_STATS
  435	unsigned int stat[NR_SLUB_STAT_ITEMS];
  436#endif
  437};
  438
  439static inline void stat(const struct kmem_cache *s, enum stat_item si)
  440{
  441#ifdef CONFIG_SLUB_STATS
  442	/*
  443	 * The rmw is racy on a preemptible kernel but this is acceptable, so
  444	 * avoid this_cpu_add()'s irq-disable overhead.
  445	 */
  446	raw_cpu_inc(s->cpu_slab->stat[si]);
  447#endif
  448}
  449
  450static inline
  451void stat_add(const struct kmem_cache *s, enum stat_item si, int v)
  452{
  453#ifdef CONFIG_SLUB_STATS
  454	raw_cpu_add(s->cpu_slab->stat[si], v);
  455#endif
  456}
  457
  458#define MAX_FULL_SHEAVES	10
  459#define MAX_EMPTY_SHEAVES	10
  460
  461struct node_barn {
  462	spinlock_t lock;
  463	struct list_head sheaves_full;
  464	struct list_head sheaves_empty;
  465	unsigned int nr_full;
  466	unsigned int nr_empty;
  467};
  468
  469struct slab_sheaf {
  470	union {
  471		struct rcu_head rcu_head;
  472		struct list_head barn_list;
  473		/* only used for prefilled sheafs */
  474		struct {
  475			unsigned int capacity;
  476			bool pfmemalloc;
  477		};
  478	};
  479	struct kmem_cache *cache;
  480	unsigned int size;
  481	int node; /* only used for rcu_sheaf */
  482	void *objects[];
  483};
  484
  485struct slub_percpu_sheaves {
  486	local_trylock_t lock;
  487	struct slab_sheaf *main; /* never NULL when unlocked */
  488	struct slab_sheaf *spare; /* empty or full, may be NULL */
  489	struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
  490};
  491
  492/*
  493 * The slab lists for all objects.
  494 */
  495struct kmem_cache_node {
  496	spinlock_t list_lock;
  497	unsigned long nr_partial;
  498	struct list_head partial;
  499#ifdef CONFIG_SLUB_DEBUG
  500	atomic_long_t nr_slabs;
  501	atomic_long_t total_objects;
  502	struct list_head full;
  503#endif
  504	struct node_barn *barn;
  505};
  506
  507static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
  508{
  509	return s->node[node];
  510}
  511
  512/*
  513 * Get the barn of the current cpu's closest memory node. It may not exist on
  514 * systems with memoryless nodes but without CONFIG_HAVE_MEMORYLESS_NODES
  515 */
  516static inline struct node_barn *get_barn(struct kmem_cache *s)
  517{
  518	struct kmem_cache_node *n = get_node(s, numa_mem_id());
  519
  520	if (!n)
  521		return NULL;
  522
  523	return n->barn;
  524}
  525
  526/*
  527 * Iterator over all nodes. The body will be executed for each node that has
  528 * a kmem_cache_node structure allocated (which is true for all online nodes)
  529 */
  530#define for_each_kmem_cache_node(__s, __node, __n) \
  531	for (__node = 0; __node < nr_node_ids; __node++) \
  532		 if ((__n = get_node(__s, __node)))
  533
  534/*
  535 * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
  536 * Corresponds to node_state[N_MEMORY], but can temporarily
  537 * differ during memory hotplug/hotremove operations.
  538 * Protected by slab_mutex.
  539 */
  540static nodemask_t slab_nodes;
  541
  542/*
  543 * Workqueue used for flush_cpu_slab().
  544 */
  545static struct workqueue_struct *flushwq;
  546
  547struct slub_flush_work {
  548	struct work_struct work;
  549	struct kmem_cache *s;
  550	bool skip;
  551};
  552
  553static DEFINE_MUTEX(flush_lock);
  554static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
  555
  556/********************************************************************
  557 * 			Core slab cache functions
  558 *******************************************************************/
  559
  560/*
  561 * Returns freelist pointer (ptr). With hardening, this is obfuscated
  562 * with an XOR of the address where the pointer is held and a per-cache
  563 * random number.
  564 */
  565static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s,
  566					    void *ptr, unsigned long ptr_addr)
  567{
  568	unsigned long encoded;
  569
  570#ifdef CONFIG_SLAB_FREELIST_HARDENED
  571	encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr);
  572#else
  573	encoded = (unsigned long)ptr;
  574#endif
  575	return (freeptr_t){.v = encoded};
  576}
  577
  578static inline void *freelist_ptr_decode(const struct kmem_cache *s,
  579					freeptr_t ptr, unsigned long ptr_addr)
  580{
  581	void *decoded;
  582
  583#ifdef CONFIG_SLAB_FREELIST_HARDENED
  584	decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr));
  585#else
  586	decoded = (void *)ptr.v;
  587#endif
  588	return decoded;
  589}
  590
  591static inline void *get_freepointer(struct kmem_cache *s, void *object)
  592{
  593	unsigned long ptr_addr;
  594	freeptr_t p;
  595
  596	object = kasan_reset_tag(object);
  597	ptr_addr = (unsigned long)object + s->offset;
  598	p = *(freeptr_t *)(ptr_addr);
  599	return freelist_ptr_decode(s, p, ptr_addr);
  600}
  601
  602static void prefetch_freepointer(const struct kmem_cache *s, void *object)
  603{
  604	prefetchw(object + s->offset);
  605}
  606
  607/*
  608 * When running under KMSAN, get_freepointer_safe() may return an uninitialized
  609 * pointer value in the case the current thread loses the race for the next
  610 * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
  611 * slab_alloc_node() will fail, so the uninitialized value won't be used, but
  612 * KMSAN will still check all arguments of cmpxchg because of imperfect
  613 * handling of inline assembly.
  614 * To work around this problem, we apply __no_kmsan_checks to ensure that
  615 * get_freepointer_safe() returns initialized memory.
  616 */
  617__no_kmsan_checks
  618static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
  619{
  620	unsigned long freepointer_addr;
  621	freeptr_t p;
  622
  623	if (!debug_pagealloc_enabled_static())
  624		return get_freepointer(s, object);
  625
  626	object = kasan_reset_tag(object);
  627	freepointer_addr = (unsigned long)object + s->offset;
  628	copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
  629	return freelist_ptr_decode(s, p, freepointer_addr);
  630}
  631
  632static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
  633{
  634	unsigned long freeptr_addr = (unsigned long)object + s->offset;
  635
  636#ifdef CONFIG_SLAB_FREELIST_HARDENED
  637	BUG_ON(object == fp); /* naive detection of double free or corruption */
  638#endif
  639
  640	freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
  641	*(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr);
  642}
  643
  644/*
  645 * See comment in calculate_sizes().
  646 */
  647static inline bool freeptr_outside_object(struct kmem_cache *s)
  648{
  649	return s->offset >= s->inuse;
  650}
  651
  652/*
  653 * Return offset of the end of info block which is inuse + free pointer if
  654 * not overlapping with object.
  655 */
  656static inline unsigned int get_info_end(struct kmem_cache *s)
  657{
  658	if (freeptr_outside_object(s))
  659		return s->inuse + sizeof(void *);
  660	else
  661		return s->inuse;
  662}
  663
  664/* Loop over all objects in a slab */
  665#define for_each_object(__p, __s, __addr, __objects) \
  666	for (__p = fixup_red_left(__s, __addr); \
  667		__p < (__addr) + (__objects) * (__s)->size; \
  668		__p += (__s)->size)
  669
  670static inline unsigned int order_objects(unsigned int order, unsigned int size)
  671{
  672	return ((unsigned int)PAGE_SIZE << order) / size;
  673}
  674
  675static inline struct kmem_cache_order_objects oo_make(unsigned int order,
  676		unsigned int size)
  677{
  678	struct kmem_cache_order_objects x = {
  679		(order << OO_SHIFT) + order_objects(order, size)
  680	};
  681
  682	return x;
  683}
  684
  685static inline unsigned int oo_order(struct kmem_cache_order_objects x)
  686{
  687	return x.x >> OO_SHIFT;
  688}
  689
  690static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
  691{
  692	return x.x & OO_MASK;
  693}
  694
  695#ifdef CONFIG_SLUB_CPU_PARTIAL
  696static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
  697{
  698	unsigned int nr_slabs;
  699
  700	s->cpu_partial = nr_objects;
  701
  702	/*
  703	 * We take the number of objects but actually limit the number of
  704	 * slabs on the per cpu partial list, in order to limit excessive
  705	 * growth of the list. For simplicity we assume that the slabs will
  706	 * be half-full.
  707	 */
  708	nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
  709	s->cpu_partial_slabs = nr_slabs;
  710}
  711
  712static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
  713{
  714	return s->cpu_partial_slabs;
  715}
  716#else
  717#ifdef SLAB_SUPPORTS_SYSFS
  718static inline void
  719slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
  720{
  721}
  722#endif
  723
  724static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
  725{
  726	return 0;
  727}
  728#endif /* CONFIG_SLUB_CPU_PARTIAL */
  729
  730/*
  731 * If network-based swap is enabled, slub must keep track of whether memory
  732 * were allocated from pfmemalloc reserves.
  733 */
  734static inline bool slab_test_pfmemalloc(const struct slab *slab)
  735{
  736	return test_bit(SL_pfmemalloc, &slab->flags.f);
  737}
  738
  739static inline void slab_set_pfmemalloc(struct slab *slab)
  740{
  741	set_bit(SL_pfmemalloc, &slab->flags.f);
  742}
  743
  744static inline void __slab_clear_pfmemalloc(struct slab *slab)
  745{
  746	__clear_bit(SL_pfmemalloc, &slab->flags.f);
  747}
  748
  749/*
  750 * Per slab locking using the pagelock
  751 */
  752static __always_inline void slab_lock(struct slab *slab)
  753{
  754	bit_spin_lock(SL_locked, &slab->flags.f);
  755}
  756
  757static __always_inline void slab_unlock(struct slab *slab)
  758{
  759	bit_spin_unlock(SL_locked, &slab->flags.f);
  760}
  761
  762static inline bool
  763__update_freelist_fast(struct slab *slab, struct freelist_counters *old,
  764		       struct freelist_counters *new)
  765{
  766#ifdef system_has_freelist_aba
  767	return try_cmpxchg_freelist(&slab->freelist_counters,
  768				    &old->freelist_counters,
  769				    new->freelist_counters);
  770#else
  771	return false;
  772#endif
  773}
  774
  775static inline bool
  776__update_freelist_slow(struct slab *slab, struct freelist_counters *old,
  777		       struct freelist_counters *new)
  778{
  779	bool ret = false;
  780
  781	slab_lock(slab);
  782	if (slab->freelist == old->freelist &&
  783	    slab->counters == old->counters) {
  784		slab->freelist = new->freelist;
  785		slab->counters = new->counters;
  786		ret = true;
  787	}
  788	slab_unlock(slab);
  789
  790	return ret;
  791}
  792
  793/*
  794 * Interrupts must be disabled (for the fallback code to work right), typically
  795 * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
  796 * part of bit_spin_lock(), is sufficient because the policy is not to allow any
  797 * allocation/ free operation in hardirq context. Therefore nothing can
  798 * interrupt the operation.
  799 */
  800static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
  801		struct freelist_counters *old, struct freelist_counters *new, const char *n)
  802{
  803	bool ret;
  804
  805	if (USE_LOCKLESS_FAST_PATH())
  806		lockdep_assert_irqs_disabled();
  807
  808	if (s->flags & __CMPXCHG_DOUBLE)
  809		ret = __update_freelist_fast(slab, old, new);
  810	else
  811		ret = __update_freelist_slow(slab, old, new);
  812
  813	if (likely(ret))
  814		return true;
  815
  816	cpu_relax();
  817	stat(s, CMPXCHG_DOUBLE_FAIL);
  818
  819#ifdef SLUB_DEBUG_CMPXCHG
  820	pr_info("%s %s: cmpxchg double redo ", n, s->name);
  821#endif
  822
  823	return false;
  824}
  825
  826static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
  827		struct freelist_counters *old, struct freelist_counters *new, const char *n)
  828{
  829	bool ret;
  830
  831	if (s->flags & __CMPXCHG_DOUBLE) {
  832		ret = __update_freelist_fast(slab, old, new);
  833	} else {
  834		unsigned long flags;
  835
  836		local_irq_save(flags);
  837		ret = __update_freelist_slow(slab, old, new);
  838		local_irq_restore(flags);
  839	}
  840	if (likely(ret))
  841		return true;
  842
  843	cpu_relax();
  844	stat(s, CMPXCHG_DOUBLE_FAIL);
  845
  846#ifdef SLUB_DEBUG_CMPXCHG
  847	pr_info("%s %s: cmpxchg double redo ", n, s->name);
  848#endif
  849
  850	return false;
  851}
  852
  853/*
  854 * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API
  855 * family will round up the real request size to these fixed ones, so
  856 * there could be an extra area than what is requested. Save the original
  857 * request size in the meta data area, for better debug and sanity check.
  858 */
  859static inline void set_orig_size(struct kmem_cache *s,
  860				void *object, unsigned int orig_size)
  861{
  862	void *p = kasan_reset_tag(object);
  863
  864	if (!slub_debug_orig_size(s))
  865		return;
  866
  867	p += get_info_end(s);
  868	p += sizeof(struct track) * 2;
  869
  870	*(unsigned int *)p = orig_size;
  871}
  872
  873static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
  874{
  875	void *p = kasan_reset_tag(object);
  876
  877	if (is_kfence_address(object))
  878		return kfence_ksize(object);
  879
  880	if (!slub_debug_orig_size(s))
  881		return s->object_size;
  882
  883	p += get_info_end(s);
  884	p += sizeof(struct track) * 2;
  885
  886	return *(unsigned int *)p;
  887}
  888
  889#ifdef CONFIG_SLUB_DEBUG
  890
  891/*
  892 * For debugging context when we want to check if the struct slab pointer
  893 * appears to be valid.
  894 */
  895static inline bool validate_slab_ptr(struct slab *slab)
  896{
  897	return PageSlab(slab_page(slab));
  898}
  899
  900static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
  901static DEFINE_SPINLOCK(object_map_lock);
  902
  903static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
  904		       struct slab *slab)
  905{
  906	void *addr = slab_address(slab);
  907	void *p;
  908
  909	bitmap_zero(obj_map, slab->objects);
  910
  911	for (p = slab->freelist; p; p = get_freepointer(s, p))
  912		set_bit(__obj_to_index(s, addr, p), obj_map);
  913}
  914
  915#if IS_ENABLED(CONFIG_KUNIT)
  916static bool slab_add_kunit_errors(void)
  917{
  918	struct kunit_resource *resource;
  919
  920	if (!kunit_get_current_test())
  921		return false;
  922
  923	resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
  924	if (!resource)
  925		return false;
  926
  927	(*(int *)resource->data)++;
  928	kunit_put_resource(resource);
  929	return true;
  930}
  931
  932bool slab_in_kunit_test(void)
  933{
  934	struct kunit_resource *resource;
  935
  936	if (!kunit_get_current_test())
  937		return false;
  938
  939	resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
  940	if (!resource)
  941		return false;
  942
  943	kunit_put_resource(resource);
  944	return true;
  945}
  946#else
  947static inline bool slab_add_kunit_errors(void) { return false; }
  948#endif
  949
  950static inline unsigned int size_from_object(struct kmem_cache *s)
  951{
  952	if (s->flags & SLAB_RED_ZONE)
  953		return s->size - s->red_left_pad;
  954
  955	return s->size;
  956}
  957
  958static inline void *restore_red_left(struct kmem_cache *s, void *p)
  959{
  960	if (s->flags & SLAB_RED_ZONE)
  961		p -= s->red_left_pad;
  962
  963	return p;
  964}
  965
/*
 * Debug settings:
 *
 * slub_debug holds the debug flags applied to caches by default; it starts
 * as DEBUG_DEFAULT_FLAGS when CONFIG_SLUB_DEBUG_ON is set and as 0
 * otherwise.
 */
#if defined(CONFIG_SLUB_DEBUG_ON)
static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
#else
static slab_flags_t slub_debug;
#endif

/* Saved option string, consulted per cache when slab lists were given. */
static const char *slub_debug_string __ro_after_init;
/* Set to 1 when the 'o' option is seen during initial option parsing. */
static int disable_higher_order_debug;
  977
/*
 * slub is about to manipulate internal object metadata.  This memory lies
 * outside the range of the allocated object, so accessing it would normally
 * be reported by kasan as a bounds error.  metadata_access_enable() is used
 * to tell kasan that these accesses are OK.
 *
 * KMSAN is disabled for the current task in the same way. Every call must
 * be paired with metadata_access_disable().
 */
static inline void metadata_access_enable(void)
{
	kasan_disable_current();
	kmsan_disable_current();
}
  989
/*
 * Counterpart of metadata_access_enable(): re-enable KMSAN and KASAN for
 * the current task, in the reverse order of the disable calls.
 */
static inline void metadata_access_disable(void)
{
	kmsan_enable_current();
	kasan_enable_current();
}
  995
  996/*
  997 * Object debugging
  998 */
  999
 1000/* Verify that a pointer has an address that is valid within a slab page */
 1001static inline int check_valid_pointer(struct kmem_cache *s,
 1002				struct slab *slab, void *object)
 1003{
 1004	void *base;
 1005
 1006	if (!object)
 1007		return 1;
 1008
 1009	base = slab_address(slab);
 1010	object = kasan_reset_tag(object);
 1011	object = restore_red_left(s, object);
 1012	if (object < base || object >= base + slab->objects * s->size ||
 1013		(object - base) % s->size) {
 1014		return 0;
 1015	}
 1016
 1017	return 1;
 1018}
 1019
 1020static void print_section(char *level, char *text, u8 *addr,
 1021			  unsigned int length)
 1022{
 1023	metadata_access_enable();
 1024	print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
 1025			16, 1, kasan_reset_tag((void *)addr), length, 1);
 1026	metadata_access_disable();
 1027}
 1028
 1029static struct track *get_track(struct kmem_cache *s, void *object,
 1030	enum track_item alloc)
 1031{
 1032	struct track *p;
 1033
 1034	p = object + get_info_end(s);
 1035
 1036	return kasan_reset_tag(p + alloc);
 1037}
 1038
 1039#ifdef CONFIG_STACKDEPOT
 1040static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
 1041{
 1042	depot_stack_handle_t handle;
 1043	unsigned long entries[TRACK_ADDRS_COUNT];
 1044	unsigned int nr_entries;
 1045
 1046	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
 1047	handle = stack_depot_save(entries, nr_entries, gfp_flags);
 1048
 1049	return handle;
 1050}
 1051#else
 1052static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
 1053{
 1054	return 0;
 1055}
 1056#endif
 1057
 1058static void set_track_update(struct kmem_cache *s, void *object,
 1059			     enum track_item alloc, unsigned long addr,
 1060			     depot_stack_handle_t handle)
 1061{
 1062	struct track *p = get_track(s, object, alloc);
 1063
 1064#ifdef CONFIG_STACKDEPOT
 1065	p->handle = handle;
 1066#endif
 1067	p->addr = addr;
 1068	p->cpu = smp_processor_id();
 1069	p->pid = current->pid;
 1070	p->when = jiffies;
 1071}
 1072
 1073static __always_inline void set_track(struct kmem_cache *s, void *object,
 1074				      enum track_item alloc, unsigned long addr, gfp_t gfp_flags)
 1075{
 1076	depot_stack_handle_t handle = set_track_prepare(gfp_flags);
 1077
 1078	set_track_update(s, object, alloc, addr, handle);
 1079}
 1080
 1081static void init_tracking(struct kmem_cache *s, void *object)
 1082{
 1083	struct track *p;
 1084
 1085	if (!(s->flags & SLAB_STORE_USER))
 1086		return;
 1087
 1088	p = get_track(s, object, TRACK_ALLOC);
 1089	memset(p, 0, 2*sizeof(struct track));
 1090}
 1091
 1092static void print_track(const char *s, struct track *t, unsigned long pr_time)
 1093{
 1094	depot_stack_handle_t handle __maybe_unused;
 1095
 1096	if (!t->addr)
 1097		return;
 1098
 1099	pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
 1100	       s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
 1101#ifdef CONFIG_STACKDEPOT
 1102	handle = READ_ONCE(t->handle);
 1103	if (handle)
 1104		stack_depot_print(handle);
 1105	else
 1106		pr_err("object allocation/free stack trace missing\n");
 1107#endif
 1108}
 1109
 1110void print_tracking(struct kmem_cache *s, void *object)
 1111{
 1112	unsigned long pr_time = jiffies;
 1113	if (!(s->flags & SLAB_STORE_USER))
 1114		return;
 1115
 1116	print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
 1117	print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
 1118}
 1119
/*
 * Print a one-line summary of @slab: its address, object count, in-use
 * count, freelist pointer and flags.
 */
static void print_slab_info(const struct slab *slab)
{
	pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
	       slab, slab->objects, slab->inuse, slab->freelist,
	       &slab->flags.f);
}
 1126
/*
 * Overwrite the recorded original request size of @object with the cache's
 * full object_size, so that check_object() finds no space between the
 * recorded size and object_size to verify for this object.
 */
void skip_orig_size_check(struct kmem_cache *s, const void *object)
{
	set_orig_size(s, (void *)object, s->object_size);
}
 1131
 1132static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp)
 1133{
 1134	struct va_format vaf;
 1135	va_list args;
 1136
 1137	va_copy(args, argsp);
 1138	vaf.fmt = fmt;
 1139	vaf.va = &args;
 1140	pr_err("=============================================================================\n");
 1141	pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf);
 1142	pr_err("-----------------------------------------------------------------------------\n\n");
 1143	va_end(args);
 1144}
 1145
 1146static void slab_bug(struct kmem_cache *s, const char *fmt, ...)
 1147{
 1148	va_list args;
 1149
 1150	va_start(args, fmt);
 1151	__slab_bug(s, fmt, args);
 1152	va_end(args);
 1153}
 1154
 1155__printf(2, 3)
 1156static void slab_fix(struct kmem_cache *s, const char *fmt, ...)
 1157{
 1158	struct va_format vaf;
 1159	va_list args;
 1160
 1161	if (slab_add_kunit_errors())
 1162		return;
 1163
 1164	va_start(args, fmt);
 1165	vaf.fmt = fmt;
 1166	vaf.va = &args;
 1167	pr_err("FIX %s: %pV\n", s->name, &vaf);
 1168	va_end(args);
 1169}
 1170
 1171static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
 1172{
 1173	unsigned int off;	/* Offset of last byte */
 1174	u8 *addr = slab_address(slab);
 1175
 1176	print_tracking(s, p);
 1177
 1178	print_slab_info(slab);
 1179
 1180	pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
 1181	       p, p - addr, get_freepointer(s, p));
 1182
 1183	if (s->flags & SLAB_RED_ZONE)
 1184		print_section(KERN_ERR, "Redzone  ", p - s->red_left_pad,
 1185			      s->red_left_pad);
 1186	else if (p > addr + 16)
 1187		print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
 1188
 1189	print_section(KERN_ERR,         "Object   ", p,
 1190		      min_t(unsigned int, s->object_size, PAGE_SIZE));
 1191	if (s->flags & SLAB_RED_ZONE)
 1192		print_section(KERN_ERR, "Redzone  ", p + s->object_size,
 1193			s->inuse - s->object_size);
 1194
 1195	off = get_info_end(s);
 1196
 1197	if (s->flags & SLAB_STORE_USER)
 1198		off += 2 * sizeof(struct track);
 1199
 1200	if (slub_debug_orig_size(s))
 1201		off += sizeof(unsigned int);
 1202
 1203	off += kasan_metadata_size(s, false);
 1204
 1205	if (off != size_from_object(s))
 1206		/* Beginning of the filler is the free pointer */
 1207		print_section(KERN_ERR, "Padding  ", p + off,
 1208			      size_from_object(s) - off);
 1209}
 1210
 1211static void object_err(struct kmem_cache *s, struct slab *slab,
 1212			u8 *object, const char *reason)
 1213{
 1214	if (slab_add_kunit_errors())
 1215		return;
 1216
 1217	slab_bug(s, reason);
 1218	if (!object || !check_valid_pointer(s, slab, object)) {
 1219		print_slab_info(slab);
 1220		pr_err("Invalid pointer 0x%p\n", object);
 1221	} else {
 1222		print_trailer(s, slab, object);
 1223	}
 1224	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 1225
 1226	WARN_ON(1);
 1227}
 1228
 1229static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
 1230			       void **freelist, void *nextfree)
 1231{
 1232	if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
 1233	    !check_valid_pointer(s, slab, nextfree) && freelist) {
 1234		object_err(s, slab, *freelist, "Freechain corrupt");
 1235		*freelist = NULL;
 1236		slab_fix(s, "Isolate corrupted freechain");
 1237		return true;
 1238	}
 1239
 1240	return false;
 1241}
 1242
 1243static void __slab_err(struct slab *slab)
 1244{
 1245	if (slab_in_kunit_test())
 1246		return;
 1247
 1248	print_slab_info(slab);
 1249	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 1250
 1251	WARN_ON(1);
 1252}
 1253
 1254static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
 1255			const char *fmt, ...)
 1256{
 1257	va_list args;
 1258
 1259	if (slab_add_kunit_errors())
 1260		return;
 1261
 1262	va_start(args, fmt);
 1263	__slab_bug(s, fmt, args);
 1264	va_end(args);
 1265
 1266	__slab_err(slab);
 1267}
 1268
 1269static void init_object(struct kmem_cache *s, void *object, u8 val)
 1270{
 1271	u8 *p = kasan_reset_tag(object);
 1272	unsigned int poison_size = s->object_size;
 1273
 1274	if (s->flags & SLAB_RED_ZONE) {
 1275		/*
 1276		 * Here and below, avoid overwriting the KMSAN shadow. Keeping
 1277		 * the shadow makes it possible to distinguish uninit-value
 1278		 * from use-after-free.
 1279		 */
 1280		memset_no_sanitize_memory(p - s->red_left_pad, val,
 1281					  s->red_left_pad);
 1282
 1283		if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
 1284			/*
 1285			 * Redzone the extra allocated space by kmalloc than
 1286			 * requested, and the poison size will be limited to
 1287			 * the original request size accordingly.
 1288			 */
 1289			poison_size = get_orig_size(s, object);
 1290		}
 1291	}
 1292
 1293	if (s->flags & __OBJECT_POISON) {
 1294		memset_no_sanitize_memory(p, POISON_FREE, poison_size - 1);
 1295		memset_no_sanitize_memory(p + poison_size - 1, POISON_END, 1);
 1296	}
 1297
 1298	if (s->flags & SLAB_RED_ZONE)
 1299		memset_no_sanitize_memory(p + poison_size, val,
 1300					  s->inuse - poison_size);
 1301}
 1302
 1303static void restore_bytes(struct kmem_cache *s, const char *message, u8 data,
 1304						void *from, void *to)
 1305{
 1306	slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
 1307	memset(from, data, to - from);
 1308}
 1309
/*
 * Attributes for the pad-checking functions: under CONFIG_KMSAN they are
 * kept out of line and excluded from KMSAN checks; otherwise the macro
 * expands to nothing.
 */
#ifdef CONFIG_KMSAN
#define pad_check_attributes noinline __no_kmsan_checks
#else
#define pad_check_attributes
#endif
 1315
 1316static pad_check_attributes int
 1317check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
 1318		       u8 *object, const char *what, u8 *start, unsigned int value,
 1319		       unsigned int bytes, bool slab_obj_print)
 1320{
 1321	u8 *fault;
 1322	u8 *end;
 1323	u8 *addr = slab_address(slab);
 1324
 1325	metadata_access_enable();
 1326	fault = memchr_inv(kasan_reset_tag(start), value, bytes);
 1327	metadata_access_disable();
 1328	if (!fault)
 1329		return 1;
 1330
 1331	end = start + bytes;
 1332	while (end > fault && end[-1] == value)
 1333		end--;
 1334
 1335	if (slab_add_kunit_errors())
 1336		goto skip_bug_print;
 1337
 1338	pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
 1339	       what, fault, end - 1, fault - addr, fault[0], value);
 1340
 1341	if (slab_obj_print)
 1342		object_err(s, slab, object, "Object corrupt");
 1343
 1344skip_bug_print:
 1345	restore_bytes(s, what, value, fault, end);
 1346	return 0;
 1347}
 1348
 1349/*
 1350 * Object layout:
 1351 *
 1352 * object address
 1353 * 	Bytes of the object to be managed.
 1354 * 	If the freepointer may overlay the object then the free
 1355 *	pointer is at the middle of the object.
 1356 *
 1357 * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
 1358 * 	0xa5 (POISON_END)
 1359 *
 1360 * object + s->object_size
 1361 * 	Padding to reach word boundary. This is also used for Redzoning.
 1362 * 	Padding is extended by another word if Redzoning is enabled and
 1363 * 	object_size == inuse.
 1364 *
 1365 * 	We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with
 1366 * 	0xcc (SLUB_RED_ACTIVE) for objects in use.
 1367 *
 1368 * object + s->inuse
 1369 * 	Meta data starts here.
 1370 *
 1371 * 	A. Free pointer (if we cannot overwrite object on free)
 1372 * 	B. Tracking data for SLAB_STORE_USER
 1373 *	C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
 1374 *	D. Padding to reach required alignment boundary or at minimum
 1375 * 		one word if debugging is on to be able to detect writes
 1376 * 		before the word boundary.
 1377 *
 1378 *	Padding is done using 0x5a (POISON_INUSE)
 1379 *
 1380 * object + s->size
 1381 * 	Nothing is used beyond s->size.
 1382 *
 1383 * If slabcaches are merged then the object_size and inuse boundaries are mostly
 1384 * ignored. And therefore no slab options that rely on these boundaries
 1385 * may be used with merged slabcaches.
 1386 */
 1387
 1388static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
 1389{
 1390	unsigned long off = get_info_end(s);	/* The end of info */
 1391
 1392	if (s->flags & SLAB_STORE_USER) {
 1393		/* We also have user information there */
 1394		off += 2 * sizeof(struct track);
 1395
 1396		if (s->flags & SLAB_KMALLOC)
 1397			off += sizeof(unsigned int);
 1398	}
 1399
 1400	off += kasan_metadata_size(s, false);
 1401
 1402	if (size_from_object(s) == off)
 1403		return 1;
 1404
 1405	return check_bytes_and_report(s, slab, p, "Object padding",
 1406			p + off, POISON_INUSE, size_from_object(s) - off, true);
 1407}
 1408
 1409/* Check the pad bytes at the end of a slab page */
 1410static pad_check_attributes void
 1411slab_pad_check(struct kmem_cache *s, struct slab *slab)
 1412{
 1413	u8 *start;
 1414	u8 *fault;
 1415	u8 *end;
 1416	u8 *pad;
 1417	int length;
 1418	int remainder;
 1419
 1420	if (!(s->flags & SLAB_POISON))
 1421		return;
 1422
 1423	start = slab_address(slab);
 1424	length = slab_size(slab);
 1425	end = start + length;
 1426	remainder = length % s->size;
 1427	if (!remainder)
 1428		return;
 1429
 1430	pad = end - remainder;
 1431	metadata_access_enable();
 1432	fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
 1433	metadata_access_disable();
 1434	if (!fault)
 1435		return;
 1436	while (end > fault && end[-1] == POISON_INUSE)
 1437		end--;
 1438
 1439	slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu",
 1440		 fault, end - 1, fault - start);
 1441	print_section(KERN_ERR, "Padding ", pad, remainder);
 1442	__slab_err(slab);
 1443
 1444	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
 1445}
 1446
/*
 * Run all per-object debug checks on @object: red zones, kmalloc red zone,
 * alignment padding, poison pattern, pad bytes and the free pointer.
 *
 * @val is the red zone value the object is expected to carry
 * (SLUB_RED_ACTIVE for an object in use, otherwise the inactive value).
 *
 * Returns 1 if everything checked out, 0 if any check failed. Damaged
 * byte ranges are restored by check_bytes_and_report(); a corrupt free
 * pointer is reset to NULL.
 */
static int check_object(struct kmem_cache *s, struct slab *slab,
					void *object, u8 val)
{
	u8 *p = object;
	u8 *endobject = object + s->object_size;
	unsigned int orig_size, kasan_meta_size;
	/*
	 * ret is also passed as the slab_obj_print argument below, so the
	 * object is dumped for the first failing byte check only.
	 */
	int ret = 1;

	if (s->flags & SLAB_RED_ZONE) {
		if (!check_bytes_and_report(s, slab, object, "Left Redzone",
			object - s->red_left_pad, val, s->red_left_pad, ret))
			ret = 0;

		if (!check_bytes_and_report(s, slab, object, "Right Redzone",
			endobject, val, s->inuse - s->object_size, ret))
			ret = 0;

		if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
			orig_size = get_orig_size(s, object);

			/*
			 * The space between the requested size and
			 * object_size was red-zoned by init_object().
			 */
			if (s->object_size > orig_size  &&
				!check_bytes_and_report(s, slab, object,
					"kmalloc Redzone", p + orig_size,
					val, s->object_size - orig_size, ret)) {
				ret = 0;
			}
		}
	} else {
		/* Without red zoning the gap up to inuse holds POISON_INUSE. */
		if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
			if (!check_bytes_and_report(s, slab, p, "Alignment padding",
				endobject, POISON_INUSE,
				s->inuse - s->object_size, ret))
				ret = 0;
		}
	}

	if (s->flags & SLAB_POISON) {
		/* The poison pattern is only checked on objects not in use. */
		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON)) {
			/*
			 * KASAN can save its free meta data inside of the
			 * object at offset 0. Thus, skip checking the part of
			 * the redzone that overlaps with the meta data.
			 */
			kasan_meta_size = kasan_metadata_size(s, true);
			if (kasan_meta_size < s->object_size - 1 &&
			    !check_bytes_and_report(s, slab, p, "Poison",
					p + kasan_meta_size, POISON_FREE,
					s->object_size - kasan_meta_size - 1, ret))
				ret = 0;
			if (kasan_meta_size < s->object_size &&
			    !check_bytes_and_report(s, slab, p, "End Poison",
					p + s->object_size - 1, POISON_END, 1, ret))
				ret = 0;
		}
		/*
		 * check_pad_bytes cleans up on its own.
		 */
		if (!check_pad_bytes(s, slab, p))
			ret = 0;
	}

	/*
	 * Cannot check freepointer while object is allocated if
	 * object and freepointer overlap.
	 */
	if ((freeptr_outside_object(s) || val != SLUB_RED_ACTIVE) &&
	    !check_valid_pointer(s, slab, get_freepointer(s, p))) {
		object_err(s, slab, p, "Freepointer corrupt");
		/*
		 * No choice but to zap it and thus lose the remainder
		 * of the free objects in this slab. May cause
		 * another error because the object count is now wrong.
		 */
		set_freepointer(s, p, NULL);
		ret = 0;
	}

	return ret;
}
 1526
 1527/*
 1528 * Checks if the slab state looks sane. Assumes the struct slab pointer
 1529 * was either obtained in a way that ensures it's valid, or validated
 1530 * by validate_slab_ptr()
 1531 */
 1532static int check_slab(struct kmem_cache *s, struct slab *slab)
 1533{
 1534	int maxobj;
 1535
 1536	maxobj = order_objects(slab_order(slab), s->size);
 1537	if (slab->objects > maxobj) {
 1538		slab_err(s, slab, "objects %u > max %u",
 1539			slab->objects, maxobj);
 1540		return 0;
 1541	}
 1542	if (slab->inuse > slab->objects) {
 1543		slab_err(s, slab, "inuse %u > max %u",
 1544			slab->inuse, slab->objects);
 1545		return 0;
 1546	}
 1547	if (slab->frozen) {
 1548		slab_err(s, slab, "Slab disabled since SLUB metadata consistency check failed");
 1549		return 0;
 1550	}
 1551
 1552	/* Slab_pad_check fixes things up after itself */
 1553	slab_pad_check(s, slab);
 1554	return 1;
 1555}
 1556
/*
 * Determine if a certain object in a slab is on the freelist. Must hold the
 * slab lock to guarantee that the chains are in a consistent state.
 *
 * The walk doubles as a freelist sanity check: invalid pointers, cycles
 * and wrong object counters are reported and repaired on the way.
 *
 * Returns true if @search was found on the freelist. If the list was
 * walked to its end without finding it, returns true only when @search is
 * NULL. Returns false if the freelist had to be cleared.
 */
static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
{
	int nr = 0;		/* number of freelist entries visited */
	void *fp;
	void *object = NULL;	/* last entry that passed validation */
	int max_objects;

	fp = slab->freelist;
	/* Allow one step more than slab->objects so a cycle is noticed. */
	while (fp && nr <= slab->objects) {
		if (fp == search)
			return true;
		if (!check_valid_pointer(s, slab, fp)) {
			if (object) {
				/* Bad link mid-chain: cut the list here. */
				object_err(s, slab, object,
					"Freechain corrupt");
				set_freepointer(s, object, NULL);
				break;
			} else {
				/* The list head itself is bad: drop the list. */
				slab_err(s, slab, "Freepointer corrupt");
				slab->freelist = NULL;
				slab->inuse = slab->objects;
				slab_fix(s, "Freelist cleared");
				return false;
			}
		}
		object = fp;
		fp = get_freepointer(s, object);
		nr++;
	}

	/* More entries than objects can only mean the list loops. */
	if (nr > slab->objects) {
		slab_err(s, slab, "Freelist cycle detected");
		slab->freelist = NULL;
		slab->inuse = slab->objects;
		slab_fix(s, "Freelist cleared");
		return false;
	}

	max_objects = order_objects(slab_order(slab), s->size);
	if (max_objects > MAX_OBJS_PER_PAGE)
		max_objects = MAX_OBJS_PER_PAGE;

	if (slab->objects != max_objects) {
		slab_err(s, slab, "Wrong number of objects. Found %d but should be %d",
			 slab->objects, max_objects);
		slab->objects = max_objects;
		slab_fix(s, "Number of objects adjusted");
	}
	/* In-use objects must equal the total minus the free ones counted. */
	if (slab->inuse != slab->objects - nr) {
		slab_err(s, slab, "Wrong object count. Counter is %d but counted were %d",
			 slab->inuse, slab->objects - nr);
		slab->inuse = slab->objects - nr;
		slab_fix(s, "Object count adjusted");
	}
	return search == NULL;
}
 1617
 1618static void trace(struct kmem_cache *s, struct slab *slab, void *object,
 1619								int alloc)
 1620{
 1621	if (s->flags & SLAB_TRACE) {
 1622		pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
 1623			s->name,
 1624			alloc ? "alloc" : "free",
 1625			object, slab->inuse,
 1626			slab->freelist);
 1627
 1628		if (!alloc)
 1629			print_section(KERN_INFO, "Object ", (void *)object,
 1630					s->object_size);
 1631
 1632		dump_stack();
 1633	}
 1634}
 1635
 1636/*
 1637 * Tracking of fully allocated slabs for debugging purposes.
 1638 */
 1639static void add_full(struct kmem_cache *s,
 1640	struct kmem_cache_node *n, struct slab *slab)
 1641{
 1642	if (!(s->flags & SLAB_STORE_USER))
 1643		return;
 1644
 1645	lockdep_assert_held(&n->list_lock);
 1646	list_add(&slab->slab_list, &n->full);
 1647}
 1648
 1649static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab)
 1650{
 1651	if (!(s->flags & SLAB_STORE_USER))
 1652		return;
 1653
 1654	lockdep_assert_held(&n->list_lock);
 1655	list_del(&slab->slab_list);
 1656}
 1657
/* Return the current value of the node's slab counter. */
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
{
	return atomic_long_read(&n->nr_slabs);
}
 1662
/*
 * Account one more slab holding @objects objects to node @node of
 * cache @s.
 */
static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
{
	struct kmem_cache_node *n = get_node(s, node);

	atomic_long_inc(&n->nr_slabs);
	atomic_long_add(objects, &n->total_objects);
}
/*
 * Undo inc_slabs_node(): remove one slab and its @objects objects from
 * the counters of node @node of cache @s.
 */
static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
{
	struct kmem_cache_node *n = get_node(s, node);

	atomic_long_dec(&n->nr_slabs);
	atomic_long_sub(objects, &n->total_objects);
}
 1677
 1678/* Object debug checks for alloc/free paths */
 1679static void setup_object_debug(struct kmem_cache *s, void *object)
 1680{
 1681	if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
 1682		return;
 1683
 1684	init_object(s, object, SLUB_RED_INACTIVE);
 1685	init_tracking(s, object);
 1686}
 1687
 1688static
 1689void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr)
 1690{
 1691	if (!kmem_cache_debug_flags(s, SLAB_POISON))
 1692		return;
 1693
 1694	metadata_access_enable();
 1695	memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab));
 1696	metadata_access_disable();
 1697}
 1698
 1699static inline int alloc_consistency_checks(struct kmem_cache *s,
 1700					struct slab *slab, void *object)
 1701{
 1702	if (!check_slab(s, slab))
 1703		return 0;
 1704
 1705	if (!check_valid_pointer(s, slab, object)) {
 1706		object_err(s, slab, object, "Freelist Pointer check fails");
 1707		return 0;
 1708	}
 1709
 1710	if (!check_object(s, slab, object, SLUB_RED_INACTIVE))
 1711		return 0;
 1712
 1713	return 1;
 1714}
 1715
 1716static noinline bool alloc_debug_processing(struct kmem_cache *s,
 1717			struct slab *slab, void *object, int orig_size)
 1718{
 1719	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
 1720		if (!alloc_consistency_checks(s, slab, object))
 1721			goto bad;
 1722	}
 1723
 1724	/* Success. Perform special debug activities for allocs */
 1725	trace(s, slab, object, 1);
 1726	set_orig_size(s, object, orig_size);
 1727	init_object(s, object, SLUB_RED_ACTIVE);
 1728	return true;
 1729
 1730bad:
 1731	/*
 1732	 * Let's do the best we can to avoid issues in the future. Marking all
 1733	 * objects as used avoids touching the remaining objects.
 1734	 */
 1735	slab_fix(s, "Marking all objects used");
 1736	slab->inuse = slab->objects;
 1737	slab->freelist = NULL;
 1738	slab->frozen = 1; /* mark consistency-failed slab as frozen */
 1739
 1740	return false;
 1741}
 1742
 1743static inline int free_consistency_checks(struct kmem_cache *s,
 1744		struct slab *slab, void *object, unsigned long addr)
 1745{
 1746	if (!check_valid_pointer(s, slab, object)) {
 1747		slab_err(s, slab, "Invalid object pointer 0x%p", object);
 1748		return 0;
 1749	}
 1750
 1751	if (on_freelist(s, slab, object)) {
 1752		object_err(s, slab, object, "Object already free");
 1753		return 0;
 1754	}
 1755
 1756	if (!check_object(s, slab, object, SLUB_RED_ACTIVE))
 1757		return 0;
 1758
 1759	if (unlikely(s != slab->slab_cache)) {
 1760		if (!slab->slab_cache) {
 1761			slab_err(NULL, slab, "No slab cache for object 0x%p",
 1762				 object);
 1763		} else {
 1764			object_err(s, slab, object,
 1765				   "page slab pointer corrupt.");
 1766		}
 1767		return 0;
 1768	}
 1769	return 1;
 1770}
 1771
 1772/*
 1773 * Parse a block of slab_debug options. Blocks are delimited by ';'
 1774 *
 1775 * @str:    start of block
 1776 * @flags:  returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
 1777 * @slabs:  return start of list of slabs, or NULL when there's no list
 1778 * @init:   assume this is initial parsing and not per-kmem-create parsing
 1779 *
 1780 * returns the start of next block if there's any, or NULL
 1781 */
 1782static const char *
 1783parse_slub_debug_flags(const char *str, slab_flags_t *flags, const char **slabs, bool init)
 1784{
 1785	bool higher_order_disable = false;
 1786
 1787	/* Skip any completely empty blocks */
 1788	while (*str && *str == ';')
 1789		str++;
 1790
 1791	if (*str == ',') {
 1792		/*
 1793		 * No options but restriction on slabs. This means full
 1794		 * debugging for slabs matching a pattern.
 1795		 */
 1796		*flags = DEBUG_DEFAULT_FLAGS;
 1797		goto check_slabs;
 1798	}
 1799	*flags = 0;
 1800
 1801	/* Determine which debug features should be switched on */
 1802	for (; *str && *str != ',' && *str != ';'; str++) {
 1803		switch (tolower(*str)) {
 1804		case '-':
 1805			*flags = 0;
 1806			break;
 1807		case 'f':
 1808			*flags |= SLAB_CONSISTENCY_CHECKS;
 1809			break;
 1810		case 'z':
 1811			*flags |= SLAB_RED_ZONE;
 1812			break;
 1813		case 'p':
 1814			*flags |= SLAB_POISON;
 1815			break;
 1816		case 'u':
 1817			*flags |= SLAB_STORE_USER;
 1818			break;
 1819		case 't':
 1820			*flags |= SLAB_TRACE;
 1821			break;
 1822		case 'a':
 1823			*flags |= SLAB_FAILSLAB;
 1824			break;
 1825		case 'o':
 1826			/*
 1827			 * Avoid enabling debugging on caches if its minimum
 1828			 * order would increase as a result.
 1829			 */
 1830			higher_order_disable = true;
 1831			break;
 1832		default:
 1833			if (init)
 1834				pr_err("slab_debug option '%c' unknown. skipped\n", *str);
 1835		}
 1836	}
 1837check_slabs:
 1838	if (*str == ',')
 1839		*slabs = ++str;
 1840	else
 1841		*slabs = NULL;
 1842
 1843	/* Skip over the slab list */
 1844	while (*str && *str != ';')
 1845		str++;
 1846
 1847	/* Skip any completely empty blocks */
 1848	while (*str && *str == ';')
 1849		str++;
 1850
 1851	if (init && higher_order_disable)
 1852		disable_higher_order_debug = 1;
 1853
 1854	if (*str)
 1855		return str;
 1856	else
 1857		return NULL;
 1858}
 1859
 1860static int __init setup_slub_debug(const char *str, const struct kernel_param *kp)
 1861{
 1862	slab_flags_t flags;
 1863	slab_flags_t global_flags;
 1864	const char *saved_str;
 1865	const char *slab_list;
 1866	bool global_slub_debug_changed = false;
 1867	bool slab_list_specified = false;
 1868
 1869	global_flags = DEBUG_DEFAULT_FLAGS;
 1870	if (!str || !*str)
 1871		/*
 1872		 * No options specified. Switch on full debugging.
 1873		 */
 1874		goto out;
 1875
 1876	saved_str = str;
 1877	while (str) {
 1878		str = parse_slub_debug_flags(str, &flags, &slab_list, true);
 1879
 1880		if (!slab_list) {
 1881			global_flags = flags;
 1882			global_slub_debug_changed = true;
 1883		} else {
 1884			slab_list_specified = true;
 1885			if (flags & SLAB_STORE_USER)
 1886				stack_depot_request_early_init();
 1887		}
 1888	}
 1889
 1890	/*
 1891	 * For backwards compatibility, a single list of flags with list of
 1892	 * slabs means debugging is only changed for those slabs, so the global
 1893	 * slab_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
 1894	 * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as
 1895	 * long as there is no option specifying flags without a slab list.
 1896	 */
 1897	if (slab_list_specified) {
 1898		if (!global_slub_debug_changed)
 1899			global_flags = slub_debug;
 1900		slub_debug_string = saved_str;
 1901	}
 1902out:
 1903	slub_debug = global_flags;
 1904	if (slub_debug & SLAB_STORE_USER)
 1905		stack_depot_request_early_init();
 1906	if (slub_debug != 0 || slub_debug_string)
 1907		static_branch_enable(&slub_debug_enabled);
 1908	else
 1909		static_branch_disable(&slub_debug_enabled);
 1910	if ((static_branch_unlikely(&init_on_alloc) ||
 1911	     static_branch_unlikely(&init_on_free)) &&
 1912	    (slub_debug & SLAB_POISON))
 1913		pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
 1914	return 0;
 1915}
 1916
/*
 * Both the "slab_debug" and the "slub_debug" core parameters are handled
 * by setup_slub_debug().
 * NOTE(review): KERNEL_PARAM_OPS_FL_NOARG presumably allows the parameter
 * to be given without a value; setup_slub_debug() does handle a NULL or
 * empty string.
 */
static const struct kernel_param_ops param_ops_slab_debug __initconst = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,
	.set = setup_slub_debug,
};
__core_param_cb(slab_debug, &param_ops_slab_debug, NULL, 0);
__core_param_cb(slub_debug, &param_ops_slab_debug, NULL, 0);
 1923
/*
 * kmem_cache_flags - apply debugging options to the cache
 * @flags:		flags to set
 * @name:		name of the cache
 *
 * Debug option(s) are applied to @flags. In addition to the debug
 * option(s), if a slab name (or multiple) is specified i.e.
 * slab_debug=<Debug-Options>,<slab name1>,<slab name2> ...
 * then only the select slabs will receive the debug option(s).
 *
 * A name in a slab list may contain a '*'; only the part before it is
 * compared with @name. The first matching entry decides the result.
 *
 * Return: @flags unchanged if SLAB_NO_USER_FLAGS is set; @flags plus the
 * flags of the first option block whose slab list matches @name; otherwise
 * @flags plus the global debug flags.
 */
slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
{
	const char *iter;
	size_t len;
	const char *next_block;
	slab_flags_t block_flags;
	slab_flags_t slub_debug_local = slub_debug;

	if (flags & SLAB_NO_USER_FLAGS)
		return flags;

	/*
	 * If the slab cache is for debugging (e.g. kmemleak) then
	 * don't store user (stack trace) information by default,
	 * but let the user enable it via the command line below.
	 */
	if (flags & SLAB_NOLEAKTRACE)
		slub_debug_local &= ~SLAB_STORE_USER;

	len = strlen(name);
	next_block = slub_debug_string;
	/* Go through all blocks of debug options, see if any matches our slab's name */
	while (next_block) {
		next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
		if (!iter)
			continue;
		/* Found a block that has a slab list, search it */
		while (*iter) {
			const char *end, *glob;
			size_t cmplen;

			/* The entry ends at the next ',' or at the block end. */
			end = strchrnul(iter, ',');
			if (next_block && next_block < end)
				end = next_block - 1;

			/*
			 * With a '*' compare just the prefix before it;
			 * otherwise compare over the longer of the two names
			 * so that a mere prefix does not match.
			 */
			glob = strnchr(iter, end - iter, '*');
			if (glob)
				cmplen = glob - iter;
			else
				cmplen = max_t(size_t, len, (end - iter));

			if (!strncmp(name, iter, cmplen)) {
				flags |= block_flags;
				return flags;
			}

			/* Last entry of this block: go on to the next block. */
			if (!*end || *end == ';')
				break;
			iter = end + 1;
		}
	}

	return flags | slub_debug_local;
}
 1988#else /* !CONFIG_SLUB_DEBUG */
/*
 * This is the !CONFIG_SLUB_DEBUG branch: the debugging hooks below are
 * no-op stubs. The void ones do nothing; the others return a fixed value.
 */
static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
static inline
void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}

/* Stub: unconditionally returns true. */
static inline bool alloc_debug_processing(struct kmem_cache *s,
	struct slab *slab, void *object, int orig_size) { return true; }

/* Stub: unconditionally returns true. */
static inline bool free_debug_processing(struct kmem_cache *s,
	struct slab *slab, void *head, void *tail, int *bulk_cnt,
	unsigned long addr, depot_stack_handle_t handle) { return true; }

static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
/* Stub: unconditionally returns 1. */
static inline int check_object(struct kmem_cache *s, struct slab *slab,
			void *object, u8 val) { return 1; }
/* Stub: unconditionally returns a 0 handle. */
static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; }
static inline void set_track(struct kmem_cache *s, void *object,
			     enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {}
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
					struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
					struct slab *slab) {}
/* !CONFIG_SLUB_DEBUG variant: no debug options exist, @flags is returned as is. */
slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
{
	return flags;
}
/* In this !CONFIG_SLUB_DEBUG branch the global debug flags are constant 0. */
#define slub_debug 0

#define disable_higher_order_debug 0

/* Stub: always reports 0 slabs for the node. */
static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
							{ return 0; }
/* Stubs: the per-node slab counters are not maintained in this branch. */
static inline void inc_slabs_node(struct kmem_cache *s, int node,
							int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
							int objects) {}
/* !CONFIG_SLUB_DEBUG variant: performs no check and always returns false. */
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
			       void **freelist, void *nextfree)
{
	return false;
}
 2029#endif /* CONFIG_SLUB_DEBUG */
 2030
/*
 * The allocated objcg pointers array is not accounted directly.
 * Moreover, it should not come from a DMA buffer and is not readily
 * reclaimable, so those GFP bits are masked off. __GFP_NOFAIL is part
 * of the mask as well.
 */
#define OBJCGS_CLEAR_MASK	(__GFP_DMA | __GFP_RECLAIMABLE | \
				__GFP_ACCOUNT | __GFP_NOFAIL)
 2038
 2039#ifdef CONFIG_SLAB_OBJ_EXT
 2040
 2041#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
 2042
 2043static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
 2044{
 2045	struct slabobj_ext *slab_exts;
 2046	struct slab *obj_exts_slab;
 2047
 2048	obj_exts_slab = virt_to_slab(obj_exts);
 2049	slab_exts = slab_obj_exts(obj_exts_slab);
 2050	if (slab_exts) {
 2051		unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
 2052						 obj_exts_slab, obj_exts);
 2053
 2054		if (unlikely(is_codetag_empty(&slab_exts[offs].ref)))
 2055			return;
 2056
 2057		/* codetag should be NULL here */
 2058		WARN_ON(slab_exts[offs].ref.ct);
 2059		set_codetag_empty(&slab_exts[offs].ref);
 2060	}
 2061}
 2062
 2063static inline bool mark_failed_objexts_alloc(struct slab *slab)
 2064{
 2065	return cmpxchg(&slab->obj_exts, 0, OBJEXTS_ALLOC_FAIL) == 0;
 2066}
 2067
 2068static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
 2069			struct slabobj_ext *vec, unsigned int objects)
 2070{
 2071	/*
 2072	 * If vector previously failed to allocate then we have live
 2073	 * objects with no tag reference. Mark all references in this
 2074	 * vector as empty to avoid warnings later on.
 2075	 */
 2076	if (obj_exts == OBJEXTS_ALLOC_FAIL) {
 2077		unsigned int i;
 2078
 2079		for (i = 0; i < objects; i++)
 2080			set_codetag_empty(&vec[i].ref);
 2081	}
 2082}
 2083
 2084#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
 2085
/*
 * !CONFIG_MEM_ALLOC_PROFILING_DEBUG stubs: nothing is marked, and
 * mark_failed_objexts_alloc() always reports false.
 */
static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {}
static inline bool mark_failed_objexts_alloc(struct slab *slab) { return false; }
static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
			struct slabobj_ext *vec, unsigned int objects) {}
 2090
 2091#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
 2092
/* Reset slab->obj_exts to 0, i.e. no extension vector and no flag bits. */
static inline void init_slab_obj_exts(struct slab *slab)
{
	slab->obj_exts = 0;
}
 2097
/*
 * alloc_slab_obj_exts - allocate a slabobj_ext vector and install it in a slab
 * @slab:	slab that should receive the vector
 * @s:		cache passed to objs_per_slab() to size the vector
 * @gfp:	allocation flags; gfpflags_allow_spinning(@gfp) selects the
 *		allocator. On the spinning path the OBJCGS_CLEAR_MASK bits are
 *		dropped and __GFP_NO_OBJ_EXT is added.
 * @new_slab:	true if nobody else can access @slab yet, so the vector may be
 *		stored with a plain assignment
 *
 * Return: 0 if @slab has a vector afterwards (the one allocated here or one
 * installed by a racing thread), -ENOMEM if the allocation failed and no
 * vector is present.
 */
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
		        gfp_t gfp, bool new_slab)
{
	bool allow_spin = gfpflags_allow_spinning(gfp);
	unsigned int objects = objs_per_slab(s, slab);
	unsigned long new_exts;
	unsigned long old_exts;
	struct slabobj_ext *vec;

	gfp &= ~OBJCGS_CLEAR_MASK;
	/* Prevent recursive extension vector allocation */
	gfp |= __GFP_NO_OBJ_EXT;

	/*
	 * Note that allow_spin may be false during early boot and its
	 * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting
	 * architectures with cmpxchg16b, early obj_exts will be missing for
	 * very early allocations on those.
	 */
	if (unlikely(!allow_spin)) {
		size_t sz = objects * sizeof(struct slabobj_ext);

		vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT,
				     slab_nid(slab));
	} else {
		vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
				   slab_nid(slab));
	}
	if (!vec) {
		/*
		 * Try to mark vectors which failed to allocate.
		 * If this operation fails, there may be a racing process
		 * that has already completed the allocation.
		 */
		if (!mark_failed_objexts_alloc(slab) &&
		    slab_obj_exts(slab))
			return 0;

		return -ENOMEM;
	}

	/* Encode the vector address together with its flag bits. */
	new_exts = (unsigned long)vec;
	if (unlikely(!allow_spin))
		new_exts |= OBJEXTS_NOSPIN_ALLOC;
#ifdef CONFIG_MEMCG
	new_exts |= MEMCG_DATA_OBJEXTS;
#endif
retry:
	old_exts = READ_ONCE(slab->obj_exts);
	/* Only acts when old_exts records an earlier failed allocation. */
	handle_failed_objexts_alloc(old_exts, vec, objects);
	if (new_slab) {
		/*
		 * If the slab is brand new and nobody can yet access its
		 * obj_exts, no synchronization is required and obj_exts can
		 * be simply assigned.
		 */
		slab->obj_exts = new_exts;
	} else if (old_exts & ~OBJEXTS_FLAGS_MASK) {
		/*
		 * If the slab is already in use, somebody can allocate and
		 * assign slabobj_exts in parallel. In this case the existing
		 * objcg vector should be reused.
		 */
		mark_objexts_empty(vec);
		/* Release with the counterpart of the allocator used above. */
		if (unlikely(!allow_spin))
			kfree_nolock(vec);
		else
			kfree(vec);
		return 0;
	} else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) {
		/* Retry if a racing thread changed slab->obj_exts from under us. */
		goto retry;
	}

	/* Only vectors from the kcalloc_node() path are reported to kmemleak. */
	if (allow_spin)
		kmemleak_not_leak(vec);
	return 0;
}
 2176
 2177static inline void free_slab_obj_exts(struct slab *slab)
 2178{
 2179	struct slabobj_ext *obj_exts;
 2180
 2181	obj_exts = slab_obj_exts(slab);
 2182	if (!obj_exts) {
 2183		/*
 2184		 * If obj_exts allocation failed, slab->obj_exts is set to
 2185		 * OBJEXTS_ALLOC_FAIL. In this case, we end up here and should
 2186		 * clear the flag.
 2187		 */
 2188		slab->obj_exts = 0;
 2189		return;
 2190	}
 2191
 2192	/*
 2193	 * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its
 2194	 * corresponding extension will be NULL. alloc_tag_sub() will throw a
 2195	 * warning if slab has extensions but the extension of an object is
 2196	 * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that
 2197	 * the extension for obj_exts is expected to be NULL.
 2198	 */
 2199	mark_objexts_empty(obj_exts);
 2200	if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC))
 2201		kfree_nolock(obj_exts);
 2202	else
 2203		kfree(obj_exts);
 2204	slab->obj_exts = 0;
 2205}
 2206
 2207#else /* CONFIG_SLAB_OBJ_EXT */
 2208
/* !CONFIG_SLAB_OBJ_EXT stub: nothing to initialize. */
static inline void init_slab_obj_exts(struct slab *slab)
{
}

/* !CONFIG_SLAB_OBJ_EXT stub: allocates nothing and always returns 0. */
static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
			       gfp_t gfp, bool new_slab)
{
	return 0;
}

/* !CONFIG_SLAB_OBJ_EXT stub: nothing to free. */
static inline void free_slab_obj_exts(struct slab *slab)
{
}
 2222
 2223#endif /* CONFIG_SLAB_OBJ_EXT */
 2224
 2225#ifdef CONFIG_MEM_ALLOC_PROFILING
 2226
 2227static inline struct slabobj_ext *
 2228prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
 2229{
 2230	struct slab *slab;
 2231
 2232	slab = virt_to_slab(p);
 2233	if (!slab_obj_exts(slab) &&
 2234	    alloc_slab_obj_exts(slab, s, flags, false)) {
 2235		pr_warn_once("%s, %s: Failed to create slab extension vector!\n",
 2236			     __func__, s->name);
 2237		return NULL;
 2238	}
 2239
 2240	return slab_obj_exts(slab) + obj_to_index(s, slab, p);
 2241}
 2242
 2243/* Should be called only if mem_alloc_profiling_enabled() */
 2244static noinline void
 2245__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
 2246{
 2247	struct slabobj_ext *obj_exts;
 2248
 2249	if (!object)
 2250		return;
 2251
 2252	if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
 2253		return;
 2254
 2255	if (flags & __GFP_NO_OBJ_EXT)
 2256		return;
 2257
 2258	obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
 2259	/*
 2260	 * Currently obj_exts is used only for allocation profiling.
 2261	 * If other users appear then mem_alloc_profiling_enabled()
 2262	 * check should be added before alloc_tag_add().
 2263	 */
 2264	if (likely(obj_exts))
 2265		alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
 2266	else
 2267		alloc_tag_set_inaccurate(current->alloc_tag);
 2268}
 2269
 2270static inline void
 2271alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
 2272{
 2273	if (mem_alloc_profiling_enabled())
 2274		__alloc_tagging_slab_alloc_hook(s, object, flags);
 2275}
 2276
 2277/* Should be called only if mem_alloc_profiling_enabled() */
 2278static noinline void
 2279__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
 2280			       int objects)
 2281{
 2282	struct slabobj_ext *obj_exts;
 2283	int i;
 2284
 2285	/* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
 2286	if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
 2287		return;
 2288
 2289	obj_exts = slab_obj_exts(slab);
 2290	if (!obj_exts)
 2291		return;
 2292
 2293	for (i = 0; i < objects; i++) {
 2294		unsigned int off = obj_to_index(s, slab, p[i]);
 2295
 2296		alloc_tag_sub(&obj_exts[off].ref, s->size);
 2297	}
 2298}
 2299
 2300static inline void
 2301alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
 2302			     int objects)
 2303{
 2304	if (mem_alloc_profiling_enabled())
 2305		__alloc_tagging_slab_free_hook(s, slab, p, objects);
 2306}
 2307
 2308#else /* CONFIG_MEM_ALLOC_PROFILING */
 2309
/* !CONFIG_MEM_ALLOC_PROFILING stub: no allocation tagging is done. */
static inline void
alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
}

/* !CONFIG_MEM_ALLOC_PROFILING stub: no allocation tagging is done. */
static inline void
alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
			     int objects)
{
}
 2320
 2321#endif /* CONFIG_MEM_ALLOC_PROFILING */
 2322
 2323
 2324#ifdef CONFIG_MEMCG
 2325
 2326static void memcg_alloc_abort_single(struct kmem_cache *s, void *object);
 2327
 2328static __fastpath_inline
 2329bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
 2330				gfp_t flags, size_t size, void **p)
 2331{
 2332	if (likely(!memcg_kmem_online()))
 2333		return true;
 2334
 2335	if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)))
 2336		return true;
 2337
 2338	if (likely(__memcg_slab_post_alloc_hook(s, lru, flags, size, p)))
 2339		return true;
 2340
 2341	if (likely(size == 1)) {
 2342		memcg_alloc_abort_single(s, *p);
 2343		*p = NULL;
 2344	} else {
 2345		kmem_cache_free_bulk(s, size, p);
 2346	}
 2347
 2348	return false;
 2349}
 2350
 2351static __fastpath_inline
 2352void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
 2353			  int objects)
 2354{
 2355	struct slabobj_ext *obj_exts;
 2356
 2357	if (!memcg_kmem_online())
 2358		return;
 2359
 2360	obj_exts = slab_obj_exts(slab);
 2361	if (likely(!obj_exts))
 2362		return;
 2363
 2364	__memcg_slab_free_hook(s, slab, p, objects, obj_exts);
 2365}
 2366
 2367static __fastpath_inline
 2368bool memcg_slab_post_charge(void *p, gfp_t flags)
 2369{
 2370	struct slabobj_ext *slab_exts;
 2371	struct kmem_cache *s;
 2372	struct page *page;
 2373	struct slab *slab;
 2374	unsigned long off;
 2375
 2376	page = virt_to_page(p);
 2377	if (PageLargeKmalloc(page)) {
 2378		unsigned int order;
 2379		int size;
 2380
 2381		if (PageMemcgKmem(page))
 2382			return true;
 2383
 2384		order = large_kmalloc_order(page);
 2385		if (__memcg_kmem_charge_page(page, flags, order))
 2386			return false;
 2387
 2388		/*
 2389		 * This page has already been accounted in the global stats but
 2390		 * not in the memcg stats. So, subtract from the global and use
 2391		 * the interface which adds to both global and memcg stats.
 2392		 */
 2393		size = PAGE_SIZE << order;
 2394		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, -size);
 2395		mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, size);
 2396		return true;
 2397	}
 2398
 2399	slab = page_slab(page);
 2400	s = slab->slab_cache;
 2401
 2402	/*
 2403	 * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency
 2404	 * of slab_obj_exts being allocated from the same slab and thus the slab
 2405	 * becoming effectively unfreeable.
 2406	 */
 2407	if (is_kmalloc_normal(s))
 2408		return true;
 2409
 2410	/* Ignore already charged objects. */
 2411	slab_exts = slab_obj_exts(slab);
 2412	if (slab_exts) {
 2413		off = obj_to_index(s, slab, p);
 2414		if (unlikely(slab_exts[off].objcg))
 2415			return true;
 2416	}
 2417
 2418	return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p);
 2419}
 2420
 2421#else /* CONFIG_MEMCG */
/* !CONFIG_MEMCG stub: nothing to charge, always returns true. */
static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s,
					      struct list_lru *lru,
					      gfp_t flags, size_t size,
					      void **p)
{
	return true;
}

/* !CONFIG_MEMCG stub: nothing to uncharge. */
static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
					void **p, int objects)
{
}

/* !CONFIG_MEMCG stub: nothing to charge, always returns true. */
static inline bool memcg_slab_post_charge(void *p, gfp_t flags)
{
	return true;
}
 2439#endif /* CONFIG_MEMCG */
 2440
 2441#ifdef CONFIG_SLUB_RCU_DEBUG
 2442static void slab_free_after_rcu_debug(struct rcu_head *rcu_head);
 2443
/*
 * Bookkeeping for an object whose freeing is deferred through call_rcu().
 * The rcu_head lives here rather than inside the object.
 */
struct rcu_delayed_free {
	struct rcu_head head;	/* handed to call_rcu() */
	void *object;		/* the object whose freeing is deferred */
};
 2448#endif
 2449
/*
 * Hooks for other subsystems that check memory allocations. In a typical
 * production configuration these hooks all should produce no code at all.
 *
 * @s:			cache of the object being freed
 * @x:			the object being freed
 * @init:		if true, the object and its metadata are zeroed here
 * @after_rcu_delay:	if false and @s is SLAB_TYPESAFE_BY_RCU, the object
 *			contents are treated as still accessible
 *
 * Returns true if freeing of the object can proceed, false if its reuse
 * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned
 * to KFENCE.
 */
static __always_inline
bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
		    bool after_rcu_delay)
{
	/* Are the object contents still accessible? */
	bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay;

	kmemleak_free_recursive(x, s->flags);
	kmsan_slab_free(s, x);

	debug_check_no_locks_freed(x, s->object_size);

	if (!(s->flags & SLAB_DEBUG_OBJECTS))
		debug_check_no_obj_freed(x, s->object_size);

	/* Use KCSAN to help debug racy use-after-free. */
	if (!still_accessible)
		__kcsan_check_access(x, s->object_size,
				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);

	/* A true return from kfence_free() ends the free here. */
	if (kfence_free(x))
		return false;

	/*
	 * Give KASAN a chance to notice an invalid free operation before we
	 * modify the object.
	 */
	if (kasan_slab_pre_free(s, x))
		return false;

#ifdef CONFIG_SLUB_RCU_DEBUG
	if (still_accessible) {
		struct rcu_delayed_free *delayed_free;

		/*
		 * If this allocation fails, the deferral is skipped and the
		 * object falls through to the normal path below.
		 */
		delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT);
		if (delayed_free) {
			/*
			 * Let KASAN track our call stack as a "related work
			 * creation", just like if the object had been freed
			 * normally via kfree_rcu().
			 * We have to do this manually because the rcu_head is
			 * not located inside the object.
			 */
			kasan_record_aux_stack(x);

			delayed_free->object = x;
			call_rcu(&delayed_free->head, slab_free_after_rcu_debug);
			return false;
		}
	}
#endif /* CONFIG_SLUB_RCU_DEBUG */

	/*
	 * As memory initialization might be integrated into KASAN,
	 * kasan_slab_free and initialization memset's must be
	 * kept together to avoid discrepancies in behavior.
	 *
	 * The initialization memset's clear the object and the metadata,
	 * but don't touch the SLAB redzone.
	 *
	 * The object's freepointer is also avoided if stored outside the
	 * object.
	 */
	if (unlikely(init)) {
		int rsize;
		unsigned int inuse, orig_size;

		inuse = get_info_end(s);
		orig_size = get_orig_size(s, x);
		/* Skipped when KASAN's integrated init is in use. */
		if (!kasan_has_integrated_init())
			memset(kasan_reset_tag(x), 0, orig_size);
		/* Leave out red_left_pad bytes when red zoning is enabled. */
		rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
		memset((char *)kasan_reset_tag(x) + inuse, 0,
		       s->size - inuse - rsize);
		/*
		 * Restore orig_size, otherwise kmalloc redzone overwritten
		 * would be reported
		 */
		set_orig_size(s, x, orig_size);

	}
	/* KASAN might put x into memory quarantine, delaying its reuse. */
	return !kasan_slab_free(s, x, init, still_accessible, false);
}
 2542
/*
 * Run slab_free_hook() on every object of a detached freelist and rebuild
 * the list from the objects whose freeing may proceed.
 *
 * @s:		cache the objects belong to
 * @head:	in: first object of the list; out: head of the rebuilt list,
 *		NULL if no object is left
 * @tail:	in: last object of the list; out: tail of the rebuilt list,
 *		NULL if no object is left
 * @cnt:	decremented once for every object whose reuse is delayed
 *
 * Kept objects are pushed at the head, so the rebuilt list is in reverse
 * order of the original one. If the first object is a KFENCE address,
 * only the hook is run on it and false is returned, leaving @head and
 * @tail untouched.
 *
 * Returns true if the rebuilt list contains at least one object.
 */
static __fastpath_inline
bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
			     int *cnt)
{

	void *object;
	void *next = *head;
	void *old_tail = *tail;
	bool init;

	if (is_kfence_address(next)) {
		slab_free_hook(s, next, false, false);
		return false;
	}

	/* Head and tail of the reconstructed freelist */
	*head = NULL;
	*tail = NULL;

	init = slab_want_init_on_free(s);

	do {
		object = next;
		/* Fetch the successor before the hook gets to touch the object. */
		next = get_freepointer(s, object);

		/* If object's reuse doesn't have to be delayed */
		if (likely(slab_free_hook(s, object, init, false))) {
			/* Move object to the new freelist */
			set_freepointer(s, object, *head);
			*head = object;
			/* The first object kept ends up as the tail. */
			if (!*tail)
				*tail = object;
		} else {
			/*
			 * Adjust the reconstructed freelist depth
			 * accordingly if object's reuse is delayed.
			 */
			--(*cnt);
		}
	} while (object != old_tail);

	return *head != NULL;
}
 2586
 2587static void *setup_object(struct kmem_cache *s, void *object)
 2588{
 2589	setup_object_debug(s, object);
 2590	object = kasan_init_slab_obj(s, object);
 2591	if (unlikely(s->ctor)) {
 2592		kasan_unpoison_new_object(s, object);
 2593		s->ctor(object);
 2594		kasan_poison_new_object(s, object);
 2595	}
 2596	return object;
 2597}
 2598
 2599static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
 2600{
 2601	struct slab_sheaf *sheaf;
 2602	size_t sheaf_size;
 2603
 2604	if (gfp & __GFP_NO_OBJ_EXT)
 2605		return NULL;
 2606
 2607	gfp &= ~OBJCGS_CLEAR_MASK;
 2608
 2609	/*
 2610	 * Prevent recursion to the same cache, or a deep stack of kmallocs of
 2611	 * varying sizes (sheaf capacity might differ for each kmalloc size
 2612	 * bucket)
 2613	 */
 2614	if (s->flags & SLAB_KMALLOC)
 2615		gfp |= __GFP_NO_OBJ_EXT;
 2616
 2617	sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity);
 2618	sheaf = kzalloc(sheaf_size, gfp);
 2619
 2620	if (unlikely(!sheaf))
 2621		return NULL;
 2622
 2623	sheaf->cache = s;
 2624
 2625	stat(s, SHEAF_ALLOC);
 2626
 2627	return sheaf;
 2628}
 2629
/*
 * Free the sheaf structure itself with kfree() and bump the SHEAF_FREE
 * statistic of @s. Objects the sheaf may still reference are not touched.
 */
static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
{
	kfree(sheaf);

	stat(s, SHEAF_FREE);
}
 2636
 2637static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
 2638				   size_t size, void **p);
 2639
 2640
 2641static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
 2642			 gfp_t gfp)
 2643{
 2644	int to_fill = s->sheaf_capacity - sheaf->size;
 2645	int filled;
 2646
 2647	if (!to_fill)
 2648		return 0;
 2649
 2650	filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
 2651					 &sheaf->objects[sheaf->size]);
 2652
 2653	sheaf->size += filled;
 2654
 2655	stat_add(s, SHEAF_REFILL, filled);
 2656
 2657	if (filled < to_fill)
 2658		return -ENOMEM;
 2659
 2660	return 0;
 2661}
 2662
 2663
 2664static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
 2665{
 2666	struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp);
 2667
 2668	if (!sheaf)
 2669		return NULL;
 2670
 2671	if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC)) {
 2672		free_empty_sheaf(s, sheaf);
 2673		return NULL;
 2674	}
 2675
 2676	return sheaf;
 2677}
 2678
 2679/*
 2680 * Maximum number of objects freed during a single flush of main pcs sheaf.
 2681 * Translates directly to an on-stack array size.
 2682 */
 2683#define PCS_BATCH_MAX	32U
 2684
 2685static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
 2686
 2687/*
 2688 * Free all objects from the main sheaf. In order to perform
 2689 * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where
 2690 * object pointers are moved to a on-stack array under the lock. To bound the
 2691 * stack usage, limit each batch to PCS_BATCH_MAX.
 2692 *
 2693 * returns true if at least partially flushed
 2694 */
 2695static bool sheaf_flush_main(struct kmem_cache *s)
 2696{
 2697	struct slub_percpu_sheaves *pcs;
 2698	unsigned int batch, remaining;
 2699	void *objects[PCS_BATCH_MAX];
 2700	struct slab_sheaf *sheaf;
 2701	bool ret = false;
 2702
 2703next_batch:
 2704	if (!local_trylock(&s->cpu_sheaves->lock))
 2705		return ret;
 2706
 2707	pcs = this_cpu_ptr(s->cpu_sheaves);
 2708	sheaf = pcs->main;
 2709
 2710	batch = min(PCS_BATCH_MAX, sheaf->size);
 2711
 2712	sheaf->size -= batch;
 2713	memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *));
 2714
 2715	remaining = sheaf->size;
 2716
 2717	local_unlock(&s->cpu_sheaves->lock);
 2718
 2719	__kmem_cache_free_bulk(s, batch, &objects[0]);
 2720
 2721	stat_add(s, SHEAF_FLUSH, batch);
 2722
 2723	ret = true;
 2724
 2725	if (remaining)
 2726		goto next_batch;
 2727
 2728	return ret;
 2729}
 2730
 2731/*
 2732 * Free all objects from a sheaf that's unused, i.e. not linked to any
 2733 * cpu_sheaves, so we need no locking and batching. The locking is also not
 2734 * necessary when flushing cpu's sheaves (both spare and main) during cpu
 2735 * hotremove as the cpu is not executing anymore.
 2736 */
 2737static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
 2738{
 2739	if (!sheaf->size)
 2740		return;
 2741
 2742	stat_add(s, SHEAF_FLUSH, sheaf->size);
 2743
 2744	__kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]);
 2745
 2746	sheaf->size = 0;
 2747}
 2748
/*
 * Run the free hooks on every object of @sheaf, calling slab_free_hook()
 * with after_rcu_delay == true. An object for which slab_free_hook()
 * returns false is dropped from the sheaf.
 *
 * Returns true if at least one object that stays in the sheaf sits on a
 * slab for which slab_test_pfmemalloc() is true.
 */
static bool __rcu_free_sheaf_prepare(struct kmem_cache *s,
				     struct slab_sheaf *sheaf)
{
	bool init = slab_want_init_on_free(s);
	void **p = &sheaf->objects[0];
	unsigned int i = 0;
	bool pfmemalloc = false;

	while (i < sheaf->size) {
		struct slab *slab = virt_to_slab(p[i]);

		memcg_slab_free_hook(s, slab, p + i, 1);
		alloc_tagging_slab_free_hook(s, slab, p + i, 1);

		if (unlikely(!slab_free_hook(s, p[i], init, true))) {
			/*
			 * Drop the object: move the last entry into this slot
			 * and shrink the sheaf. i is not advanced, so the
			 * moved entry is processed next.
			 */
			p[i] = p[--sheaf->size];
			continue;
		}

		if (slab_test_pfmemalloc(slab))
			pfmemalloc = true;

		i++;
	}

	return pfmemalloc;
}
 2776
 2777static void rcu_free_sheaf_nobarn(struct rcu_head *head)
 2778{
 2779	struct slab_sheaf *sheaf;
 2780	struct kmem_cache *s;
 2781
 2782	sheaf = container_of(head, struct slab_sheaf, rcu_head);
 2783	s = sheaf->cache;
 2784
 2785	__rcu_free_sheaf_prepare(s, sheaf);
 2786
 2787	sheaf_flush_unused(s, sheaf);
 2788
 2789	free_empty_sheaf(s, sheaf);
 2790}
 2791
/*
 * Flush the spare, rcu_free and main sheaves of the current cpu for @s.
 *
 * Caller needs to make sure migration is disabled in order to fully flush
 * single cpu's sheaves
 *
 * must not be called from an irq
 *
 * flushing operations are rare so let's keep it simple and flush to slabs
 * directly, skipping the barn
 */
static void pcs_flush_all(struct kmem_cache *s)
{
	struct slub_percpu_sheaves *pcs;
	struct slab_sheaf *spare, *rcu_free;

	/* Detach the spare and rcu_free sheaves while holding the lock. */
	local_lock(&s->cpu_sheaves->lock);
	pcs = this_cpu_ptr(s->cpu_sheaves);

	spare = pcs->spare;
	pcs->spare = NULL;

	rcu_free = pcs->rcu_free;
	pcs->rcu_free = NULL;

	local_unlock(&s->cpu_sheaves->lock);

	/* The detached sheaves are processed with the lock dropped. */
	if (spare) {
		sheaf_flush_unused(s, spare);
		free_empty_sheaf(s, spare);
	}

	/* The rcu_free sheaf is not flushed here but handed to call_rcu(). */
	if (rcu_free)
		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);

	/* The main sheaf stays attached; its objects are flushed in place. */
	sheaf_flush_main(s);
}
 2827
/*
 * Flush the sheaves that @cpu holds for @s without taking any lock: the
 * main sheaf is emptied but stays attached, the spare sheaf is emptied
 * and freed, and the rcu_free sheaf is handed to call_rcu().
 */
static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
{
	struct slub_percpu_sheaves *pcs;

	pcs = per_cpu_ptr(s->cpu_sheaves, cpu);

	/* The cpu is not executing anymore so we don't need pcs->lock */
	sheaf_flush_unused(s, pcs->main);
	if (pcs->spare) {
		sheaf_flush_unused(s, pcs->spare);
		free_empty_sheaf(s, pcs->spare);
		pcs->spare = NULL;
	}

	if (pcs->rcu_free) {
		call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn);
		pcs->rcu_free = NULL;
	}
}
 2847
 2848static void pcs_destroy(struct kmem_cache *s)
 2849{
 2850	int cpu;
 2851
 2852	for_each_possible_cpu(cpu) {
 2853		struct slub_percpu_sheaves *pcs;
 2854
 2855		pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 2856
 2857		/* can happen when unwinding failed create */
 2858		if (!pcs->main)
 2859			continue;
 2860
 2861		/*
 2862		 * We have already passed __kmem_cache_shutdown() so everything
 2863		 * was flushed and there should be no objects allocated from
 2864		 * slabs, otherwise kmem_cache_destroy() would have aborted.
 2865		 * Therefore something would have to be really wrong if the
 2866		 * warnings here trigger, and we should rather leave objects and
 2867		 * sheaves to leak in that case.
 2868		 */
 2869
 2870		WARN_ON(pcs->spare);
 2871		WARN_ON(pcs->rcu_free);
 2872
 2873		if (!WARN_ON(pcs->main->size)) {
 2874			free_empty_sheaf(s, pcs->main);
 2875			pcs->main = NULL;
 2876		}
 2877	}
 2878
 2879	free_percpu(s->cpu_sheaves);
 2880	s->cpu_sheaves = NULL;
 2881}
 2882
 2883static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
 2884{
 2885	struct slab_sheaf *empty = NULL;
 2886	unsigned long flags;
 2887
 2888	if (!data_race(barn->nr_empty))
 2889		return NULL;
 2890
 2891	spin_lock_irqsave(&barn->lock, flags);
 2892
 2893	if (likely(barn->nr_empty)) {
 2894		empty = list_first_entry(&barn->sheaves_empty,
 2895					 struct slab_sheaf, barn_list);
 2896		list_del(&empty->barn_list);
 2897		barn->nr_empty--;
 2898	}
 2899
 2900	spin_unlock_irqrestore(&barn->lock, flags);
 2901
 2902	return empty;
 2903}
 2904
 2905/*
 2906 * The following two functions are used mainly in cases where we have to undo an
 2907 * intended action due to a race or cpu migration. Thus they do not check the
 2908 * empty or full sheaf limits for simplicity.
 2909 */
 2910
/* Add @sheaf to the barn's list of empty sheaves, under barn->lock. */
static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
{
	unsigned long flags;

	spin_lock_irqsave(&barn->lock, flags);

	list_add(&sheaf->barn_list, &barn->sheaves_empty);
	barn->nr_empty++;

	spin_unlock_irqrestore(&barn->lock, flags);
}
 2922
/* Add @sheaf to the barn's list of full sheaves, under barn->lock. */
static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
{
	unsigned long flags;

	spin_lock_irqsave(&barn->lock, flags);

	list_add(&sheaf->barn_list, &barn->sheaves_full);
	barn->nr_full++;

	spin_unlock_irqrestore(&barn->lock, flags);
}
 2934
/*
 * Take one sheaf out of @barn, preferring a full sheaf over an empty one.
 * Returns NULL if the barn holds no sheaf of either kind.
 */
static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn)
{
	struct slab_sheaf *sheaf = NULL;
	unsigned long flags;

	/* Lockless peek to skip the lock when both lists look empty. */
	if (!data_race(barn->nr_full) && !data_race(barn->nr_empty))
		return NULL;

	spin_lock_irqsave(&barn->lock, flags);

	/* The counters are checked again now that the lock is held. */
	if (barn->nr_full) {
		sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
					barn_list);
		list_del(&sheaf->barn_list);
		barn->nr_full--;
	} else if (barn->nr_empty) {
		sheaf = list_first_entry(&barn->sheaves_empty,
					 struct slab_sheaf, barn_list);
		list_del(&sheaf->barn_list);
		barn->nr_empty--;
	}

	spin_unlock_irqrestore(&barn->lock, flags);

	return sheaf;
}
 2961
 2962/*
 2963 * If a full sheaf is available, return it and put the supplied empty one to
 2964 * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't
 2965 * change.
 2966 */
 2967static struct slab_sheaf *
 2968barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
 2969{
 2970	struct slab_sheaf *full = NULL;
 2971	unsigned long flags;
 2972
 2973	if (!data_race(barn->nr_full))
 2974		return NULL;
 2975
 2976	spin_lock_irqsave(&barn->lock, flags);
 2977
 2978	if (likely(barn->nr_full)) {
 2979		full = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
 2980					barn_list);
 2981		list_del(&full->barn_list);
 2982		list_add(&empty->barn_list, &barn->sheaves_empty);
 2983		barn->nr_full--;
 2984		barn->nr_empty++;
 2985	}
 2986
 2987	spin_unlock_irqrestore(&barn->lock, flags);
 2988
 2989	return full;
 2990}
 2991
 2992/*
 2993 * If an empty sheaf is available, return it and put the supplied full one to
 2994 * barn. But if there are too many full sheaves, reject this with -E2BIG.
 2995 */
 2996static struct slab_sheaf *
 2997barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
 2998{
 2999	struct slab_sheaf *empty;
 3000	unsigned long flags;
 3001
 3002	/* we don't repeat this check under barn->lock as it's not critical */
 3003	if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES)
 3004		return ERR_PTR(-E2BIG);
 3005	if (!data_race(barn->nr_empty))
 3006		return ERR_PTR(-ENOMEM);
 3007
 3008	spin_lock_irqsave(&barn->lock, flags);
 3009
 3010	if (likely(barn->nr_empty)) {
 3011		empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf,
 3012					 barn_list);
 3013		list_del(&empty->barn_list);
 3014		list_add(&full->barn_list, &barn->sheaves_full);
 3015		barn->nr_empty--;
 3016		barn->nr_full++;
 3017	} else {
 3018		empty = ERR_PTR(-ENOMEM);
 3019	}
 3020
 3021	spin_unlock_irqrestore(&barn->lock, flags);
 3022
 3023	return empty;
 3024}
 3025
/* Initialize @barn: lock, both sheaf lists (empty) and both counters (0). */
static void barn_init(struct node_barn *barn)
{
	spin_lock_init(&barn->lock);
	INIT_LIST_HEAD(&barn->sheaves_full);
	INIT_LIST_HEAD(&barn->sheaves_empty);
	barn->nr_full = 0;
	barn->nr_empty = 0;
}
 3034
/*
 * Remove every sheaf from @barn and free it; full sheaves have their
 * objects flushed first.
 */
static void barn_shrink(struct kmem_cache *s, struct node_barn *barn)
{
	LIST_HEAD(empty_list);
	LIST_HEAD(full_list);
	struct slab_sheaf *sheaf, *sheaf2;
	unsigned long flags;

	/* Move both lists to local heads under the lock and reset the counts. */
	spin_lock_irqsave(&barn->lock, flags);

	list_splice_init(&barn->sheaves_full, &full_list);
	barn->nr_full = 0;
	list_splice_init(&barn->sheaves_empty, &empty_list);
	barn->nr_empty = 0;

	spin_unlock_irqrestore(&barn->lock, flags);

	/* The sheaves are freed after the lock has been dropped. */
	list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) {
		sheaf_flush_unused(s, sheaf);
		free_empty_sheaf(s, sheaf);
	}

	list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list)
		free_empty_sheaf(s, sheaf);
}
 3059
 3060/*
 3061 * Slab allocation and freeing
 3062 */
 3063static inline struct slab *alloc_slab_page(gfp_t flags, int node,
 3064					   struct kmem_cache_order_objects oo,
 3065					   bool allow_spin)
 3066{
 3067	struct page *page;
 3068	struct slab *slab;
 3069	unsigned int order = oo_order(oo);
 3070
 3071	if (unlikely(!allow_spin))
 3072		page = alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
 3073								  node, order);
 3074	else if (node == NUMA_NO_NODE)
 3075		page = alloc_frozen_pages(flags, order);
 3076	else
 3077		page = __alloc_frozen_pages(flags, order, node, NULL);
 3078
 3079	if (!page)
 3080		return NULL;
 3081
 3082	__SetPageSlab(page);
 3083	slab = page_slab(page);
 3084	if (page_is_pfmemalloc(page))
 3085		slab_set_pfmemalloc(slab);
 3086
 3087	return slab;
 3088}
 3089
 3090#ifdef CONFIG_SLAB_FREELIST_RANDOM
 3091/* Pre-initialize the random sequence cache */
 3092static int init_cache_random_seq(struct kmem_cache *s)
 3093{
 3094	unsigned int count = oo_objects(s->oo);
 3095	int err;
 3096
 3097	/* Bailout if already initialised */
 3098	if (s->random_seq)
 3099		return 0;
 3100
 3101	err = cache_random_seq_create(s, count, GFP_KERNEL);
 3102	if (err) {
 3103		pr_err("SLUB: Unable to initialize free list for %s\n",
 3104			s->name);
 3105		return err;
 3106	}
 3107
 3108	/* Transform to an offset on the set of pages */
 3109	if (s->random_seq) {
 3110		unsigned int i;
 3111
 3112		for (i = 0; i < count; i++)
 3113			s->random_seq[i] *= s->size;
 3114	}
 3115	return 0;
 3116}
 3117
 3118/* Initialize each random sequence freelist per cache */
 3119static void __init init_freelist_randomization(void)
 3120{
 3121	struct kmem_cache *s;
 3122
 3123	mutex_lock(&slab_mutex);
 3124
 3125	list_for_each_entry(s, &slab_caches, list)
 3126		init_cache_random_seq(s);
 3127
 3128	mutex_unlock(&slab_mutex);
 3129}
 3130
 3131/* Get the next entry on the pre-computed freelist randomized */
 3132static void *next_freelist_entry(struct kmem_cache *s,
 3133				unsigned long *pos, void *start,
 3134				unsigned long page_limit,
 3135				unsigned long freelist_count)
 3136{
 3137	unsigned int idx;
 3138
 3139	/*
 3140	 * If the target page allocation failed, the number of objects on the
 3141	 * page might be smaller than the usual size defined by the cache.
 3142	 */
 3143	do {
 3144		idx = s->random_seq[*pos];
 3145		*pos += 1;
 3146		if (*pos >= freelist_count)
 3147			*pos = 0;
 3148	} while (unlikely(idx >= page_limit));
 3149
 3150	return (char *)start + idx;
 3151}
 3152
 3153/* Shuffle the single linked freelist based on a random pre-computed sequence */
 3154static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
 3155{
 3156	void *start;
 3157	void *cur;
 3158	void *next;
 3159	unsigned long idx, pos, page_limit, freelist_count;
 3160
 3161	if (slab->objects < 2 || !s->random_seq)
 3162		return false;
 3163
 3164	freelist_count = oo_objects(s->oo);
 3165	pos = get_random_u32_below(freelist_count);
 3166
 3167	page_limit = slab->objects * s->size;
 3168	start = fixup_red_left(s, slab_address(slab));
 3169
 3170	/* First entry is used as the base of the freelist */
 3171	cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count);
 3172	cur = setup_object(s, cur);
 3173	slab->freelist = cur;
 3174
 3175	for (idx = 1; idx < slab->objects; idx++) {
 3176		next = next_freelist_entry(s, &pos, start, page_limit,
 3177			freelist_count);
 3178		next = setup_object(s, next);
 3179		set_freepointer(s, cur, next);
 3180		cur = next;
 3181	}
 3182	set_freepointer(s, cur, NULL);
 3183
 3184	return true;
 3185}
 3186#else
 3187static inline int init_cache_random_seq(struct kmem_cache *s)
 3188{
 3189	return 0;
 3190}
 3191static inline void init_freelist_randomization(void) { }
 3192static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
 3193{
 3194	return false;
 3195}
 3196#endif /* CONFIG_SLAB_FREELIST_RANDOM */
 3197
 3198static __always_inline void account_slab(struct slab *slab, int order,
 3199					 struct kmem_cache *s, gfp_t gfp)
 3200{
 3201	if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
 3202		alloc_slab_obj_exts(slab, s, gfp, true);
 3203
 3204	mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
 3205			    PAGE_SIZE << order);
 3206}
 3207
/*
 * Undo account_slab() for a slab of the given @order: release the slab's
 * object extensions and subtract the slab's size in bytes from the node's
 * vmstat counter for this cache.
 */
static __always_inline void unaccount_slab(struct slab *slab, int order,
					   struct kmem_cache *s)
{
	/*
	 * The slab object extensions should now be freed regardless of
	 * whether mem_alloc_profiling_enabled() or not because profiling
	 * might have been disabled after slab->obj_exts got allocated.
	 */
	free_slab_obj_exts(slab);

	/* Negative delta: the mirror image of the addition in account_slab(). */
	mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
			    -(PAGE_SIZE << order));
}
 3221
/*
 * Allocate and initialise a new slab for cache @s on @node.
 *
 * The pages are first requested at the cache's preferred order (s->oo);
 * if that fails, a second attempt is made at the minimum order (s->min)
 * and ORDER_FALLBACK is counted. On success the slab's counters are
 * reset, it is accounted, debug state is set up and the freelist is built,
 * either shuffled by shuffle_freelist() or linked in address order.
 *
 * Returns the new slab, or NULL if both page allocation attempts fail.
 */
static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
	/* Sampled from the caller's flags, before they are modified below. */
	bool allow_spin = gfpflags_allow_spinning(flags);
	struct slab *slab;
	struct kmem_cache_order_objects oo = s->oo;
	gfp_t alloc_gfp;
	void *start, *p, *next;
	int idx;
	bool shuffle;

	flags &= gfp_allowed_mask;

	flags |= s->allocflags;

	/*
	 * Let the initial higher-order allocation fail under memory pressure
	 * so we fall-back to the minimum order allocation.
	 */
	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
	if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
		alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;

	/*
	 * __GFP_RECLAIM could be cleared on the first allocation attempt,
	 * so pass allow_spin flag directly.
	 */
	slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
	if (unlikely(!slab)) {
		/* Retry at the minimum order with the unrelaxed flags. */
		oo = s->min;
		alloc_gfp = flags;
		/*
		 * Allocation may have failed due to fragmentation.
		 * Try a lower order alloc if possible
		 */
		slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
		if (unlikely(!slab))
			return NULL;
		stat(s, ORDER_FALLBACK);
	}

	/* @oo now describes the order that was actually allocated. */
	slab->objects = oo_objects(oo);
	slab->inuse = 0;
	slab->frozen = 0;
	init_slab_obj_exts(slab);

	account_slab(slab, oo_order(oo), s, flags);

	slab->slab_cache = s;

	kasan_poison_slab(slab);

	start = slab_address(slab);

	setup_slab_debug(s, slab, start);

	shuffle = shuffle_freelist(s, slab);

	if (!shuffle) {
		/* No randomization: link the objects in address order. */
		start = fixup_red_left(s, start);
		start = setup_object(s, start);
		slab->freelist = start;
		for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
			next = p + s->size;
			next = setup_object(s, next);
			set_freepointer(s, p, next);
			p = next;
		}
		/* The last object terminates the list. */
		set_freepointer(s, p, NULL);
	}

	return slab;
}
 3294
 3295static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 3296{
 3297	if (unlikely(flags & GFP_SLAB_BUG_MASK))
 3298		flags = kmalloc_fix_flags(flags);
 3299
 3300	WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
 3301
 3302	return allocate_slab(s,
 3303		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
 3304}
 3305
 3306static void __free_slab(struct kmem_cache *s, struct slab *slab)
 3307{
 3308	struct page *page = slab_page(slab);
 3309	int order = compound_order(page);
 3310	int pages = 1 << order;
 3311
 3312	__slab_clear_pfmemalloc(slab);
 3313	page->mapping = NULL;
 3314	__ClearPageSlab(page);
 3315	mm_account_reclaimed_pages(pages);
 3316	unaccount_slab(slab, order, s);
 3317	free_frozen_pages(page, order);
 3318}
 3319
 3320static void rcu_free_slab(struct rcu_head *h)
 3321{
 3322	struct slab *slab = container_of(h, struct slab, rcu_head);
 3323
 3324	__free_slab(slab->slab_cache, slab);
 3325}
 3326
 3327static void free_slab(struct kmem_cache *s, struct slab *slab)
 3328{
 3329	if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
 3330		void *p;
 3331
 3332		slab_pad_check(s, slab);
 3333		for_each_object(p, s, slab_address(slab), slab->objects)
 3334			check_object(s, slab, p, SLUB_RED_INACTIVE);
 3335	}
 3336
 3337	if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU))
 3338		call_rcu(&slab->rcu_head, rcu_free_slab);
 3339	else
 3340		__free_slab(s, slab);
 3341}
 3342
 3343static void discard_slab(struct kmem_cache *s, struct slab *slab)
 3344{
 3345	dec_slabs_node(s, slab_nid(slab), slab->objects);
 3346	free_slab(s, slab);
 3347}
 3348
/*
 * Helpers for the SL_partial bit in slab->flags. In this file the bit is
 * set by __add_partial() and cleared by remove_partial().
 */

/* Return whether SL_partial is currently set on @slab. */
static inline bool slab_test_node_partial(const struct slab *slab)
{
	return test_bit(SL_partial, &slab->flags.f);
}

/* Set SL_partial on @slab. */
static inline void slab_set_node_partial(struct slab *slab)
{
	set_bit(SL_partial, &slab->flags.f);
}

/* Clear SL_partial on @slab. */
static inline void slab_clear_node_partial(struct slab *slab)
{
	clear_bit(SL_partial, &slab->flags.f);
}
 3363
 3364/*
 3365 * Management of partially allocated slabs.
 3366 */
 3367static inline void
 3368__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
 3369{
 3370	n->nr_partial++;
 3371	if (tail == DEACTIVATE_TO_TAIL)
 3372		list_add_tail(&slab->slab_list, &n->partial);
 3373	else
 3374		list_add(&slab->slab_list, &n->partial);
 3375	slab_set_node_partial(slab);
 3376}
 3377
/*
 * Add @slab to @n's partial list (head or tail according to @tail).
 * The caller must hold n->list_lock; this is asserted via lockdep.
 */
static inline void add_partial(struct kmem_cache_node *n,
				struct slab *slab, int tail)
{
	lockdep_assert_held(&n->list_lock);
	__add_partial(n, slab, tail);
}

/*
 * Take @slab off @n's partial list, clear its SL_partial bit and drop the
 * node's partial count. The caller must hold n->list_lock.
 */
static inline void remove_partial(struct kmem_cache_node *n,
					struct slab *slab)
{
	lockdep_assert_held(&n->list_lock);
	list_del(&slab->slab_list);
	slab_clear_node_partial(slab);
	n->nr_partial--;
}
 3393
 3394/*
 3395 * Called only for kmem_cache_debug() caches instead of remove_partial(), with a
 3396 * slab from the n->partial list. Remove only a single object from the slab, do
 3397 * the alloc_debug_processing() checks and leave the slab on the list, or move
 3398 * it to full list if it was the last free object.
 3399 */
 3400static void *alloc_single_from_partial(struct kmem_cache *s,
 3401		struct kmem_cache_node *n, struct slab *slab, int orig_size)
 3402{
 3403	void *object;
 3404
 3405	lockdep_assert_held(&n->list_lock);
 3406
 3407#ifdef CONFIG_SLUB_DEBUG
 3408	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
 3409		if (!validate_slab_ptr(slab)) {
 3410			slab_err(s, slab, "Not a valid slab page");
 3411			return NULL;
 3412		}
 3413	}
 3414#endif
 3415
 3416	object = slab->freelist;
 3417	slab->freelist = get_freepointer(s, object);
 3418	slab->inuse++;
 3419
 3420	if (!alloc_debug_processing(s, slab, object, orig_size)) {
 3421		remove_partial(n, slab);
 3422		return NULL;
 3423	}
 3424
 3425	if (slab->inuse == slab->objects) {
 3426		remove_partial(n, slab);
 3427		add_full(s, n, slab);
 3428	}
 3429
 3430	return object;
 3431}
 3432
static void defer_deactivate_slab(struct slab *slab, void *flush_freelist);

/*
 * Called only for kmem_cache_debug() caches to allocate from a freshly
 * allocated slab. Allocate a single object instead of whole freelist
 * and put the slab to the partial (or full) list.
 *
 * Locking depends on whether @gfpflags allow spinning:
 *  - if not, n->list_lock is trylocked before anything else; on failure
 *    the new slab is passed to defer_deactivate_slab() and NULL is returned;
 *  - if so, n->list_lock is taken unconditionally, but only after
 *    alloc_debug_processing() has accepted the object.
 *
 * Returns the allocated object, or NULL on trylock failure or when debug
 * processing rejects the object.
 */
static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab,
					int orig_size, gfp_t gfpflags)
{
	bool allow_spin = gfpflags_allow_spinning(gfpflags);
	int nid = slab_nid(slab);
	struct kmem_cache_node *n = get_node(s, nid);
	unsigned long flags;
	void *object;

	if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) {
		/* Unlucky, discard newly allocated slab */
		defer_deactivate_slab(slab, NULL);
		return NULL;
	}

	/* Take the first object; the slab is new, so it is the only one in use. */
	object = slab->freelist;
	slab->freelist = get_freepointer(s, object);
	slab->inuse = 1;

	if (!alloc_debug_processing(s, slab, object, orig_size)) {
		/*
		 * It's not really expected that this would fail on a
		 * freshly allocated slab, but a concurrent memory
		 * corruption in theory could cause that.
		 * Leak memory of allocated slab.
		 */
		/* Only the !allow_spin path holds the lock at this point. */
		if (!allow_spin)
			spin_unlock_irqrestore(&n->list_lock, flags);
		return NULL;
	}

	/* In the !allow_spin case the lock is already held from the trylock. */
	if (allow_spin)
		spin_lock_irqsave(&n->list_lock, flags);

	if (slab->inuse == slab->objects)
		add_full(s, n, slab);
	else
		add_partial(n, slab, DEACTIVATE_TO_HEAD);

	inc_slabs_node(s, nid, slab->objects);
	spin_unlock_irqrestore(&n->list_lock, flags);

	return object;
}
 3484
#ifdef CONFIG_SLUB_CPU_PARTIAL
static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
#else
/* No per-cpu partial lists configured: stashing a slab there is a no-op. */
static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
				   int drain) { }
#endif
static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);

/*
 * Try to allocate a partial slab from a specific node.
 *
 * Walks @n's partial list under n->list_lock (only trylocked when
 * pc->flags do not allow spinning) and skips slabs that fail
 * pfmemalloc_match().
 *
 * For CONFIG_SLUB_TINY or debug caches a single object is taken via
 * alloc_single_from_partial() and stored in pc->object. Otherwise the
 * first matching slab is removed from the list and returned, and further
 * slabs are moved to the per-cpu partial list via put_cpu_partial() until
 * more than half of slub_get_cpu_partial(s) have been moved.
 *
 * Returns the chosen slab, or NULL if none was obtained.
 */
static struct slab *get_partial_node(struct kmem_cache *s,
				     struct kmem_cache_node *n,
				     struct partial_context *pc)
{
	struct slab *slab, *slab2, *partial = NULL;
	unsigned long flags;
	unsigned int partial_slabs = 0;

	/*
	 * Racy check. If we mistakenly see no partial slabs then we
	 * just allocate an empty slab. If we mistakenly try to get a
	 * partial slab and there is none available then get_partial()
	 * will return NULL.
	 */
	if (!n || !n->nr_partial)
		return NULL;

	/* Callers that must not spin give up instead of waiting for the lock. */
	if (gfpflags_allow_spinning(pc->flags))
		spin_lock_irqsave(&n->list_lock, flags);
	else if (!spin_trylock_irqsave(&n->list_lock, flags))
		return NULL;
	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
		if (!pfmemalloc_match(slab, pc->flags))
			continue;

		if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
			/*
			 * Single-object path: the helper does its own list
			 * handling, so no remove_partial() here. On failure
			 * move on to the next slab.
			 */
			void *object = alloc_single_from_partial(s, n, slab,
							pc->orig_size);
			if (object) {
				partial = slab;
				pc->object = object;
				break;
			}
			continue;
		}

		remove_partial(n, slab);

		if (!partial) {
			/* First usable slab: this is the one returned. */
			partial = slab;
			stat(s, ALLOC_FROM_PARTIAL);

			if ((slub_get_cpu_partial(s) == 0)) {
				break;
			}
		} else {
			/* Additional slabs go to the per-cpu partial list. */
			put_cpu_partial(s, slab, 0);
			stat(s, CPU_PARTIAL_NODE);

			if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
				break;
			}
		}
	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return partial;
}
 3553
/*
 * Get a slab from somewhere. Search in increasing NUMA distances.
 *
 * Walks the zonelist chosen by mempolicy_slab_node() and asks
 * get_partial_node() on each node that has a kmem_cache_node, is allowed
 * by the cpuset and holds more than s->min_partial partial slabs.
 * Without CONFIG_NUMA this always returns NULL.
 */
static struct slab *get_any_partial(struct kmem_cache *s,
				    struct partial_context *pc)
{
#ifdef CONFIG_NUMA
	struct zonelist *zonelist;
	struct zoneref *z;
	struct zone *zone;
	enum zone_type highest_zoneidx = gfp_zone(pc->flags);
	struct slab *slab;
	unsigned int cpuset_mems_cookie;

	/*
	 * The defrag ratio allows a configuration of the tradeoffs between
	 * inter node defragmentation and node local allocations. A lower
	 * defrag_ratio increases the tendency to do local allocations
	 * instead of attempting to obtain partial slabs from other nodes.
	 *
	 * If the defrag_ratio is set to 0 then kmalloc() always
	 * returns node local objects. If the ratio is higher then kmalloc()
	 * may return off node objects because partial slabs are obtained
	 * from other nodes and filled up.
	 *
	 * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
	 * (which makes defrag_ratio = 1000) then every (well almost)
	 * allocation will first attempt to defrag slab caches on other nodes.
	 * This means scanning over all nodes to look for partial slabs which
	 * may be expensive if we do it every time we are trying to find a slab
	 * with available objects.
	 */
	/* The cycle counter serves as a cheap pseudo-random 0..1023 draw. */
	if (!s->remote_node_defrag_ratio ||
			get_cycles() % 1024 > s->remote_node_defrag_ratio)
		return NULL;

	/* Repeat the scan if mems_allowed changed while nothing was found. */
	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
			struct kmem_cache_node *n;

			n = get_node(s, zone_to_nid(zone));

			/* Only take from nodes holding more than min_partial. */
			if (n && cpuset_zone_allowed(zone, pc->flags) &&
					n->nr_partial > s->min_partial) {
				slab = get_partial_node(s, n, pc);
				if (slab) {
					/*
					 * Don't check read_mems_allowed_retry()
					 * here - if mems_allowed was updated in
					 * parallel, that was a harmless race
					 * between allocation and the cpuset
					 * update
					 */
					return slab;
				}
			}
		}
	} while (read_mems_allowed_retry(cpuset_mems_cookie));
#endif	/* CONFIG_NUMA */
	return NULL;
}
 3617
 3618/*
 3619 * Get a partial slab, lock it and return it.
 3620 */
 3621static struct slab *get_partial(struct kmem_cache *s, int node,
 3622				struct partial_context *pc)
 3623{
 3624	struct slab *slab;
 3625	int searchnode = node;
 3626
 3627	if (node == NUMA_NO_NODE)
 3628		searchnode = numa_mem_id();
 3629
 3630	slab = get_partial_node(s, get_node(s, searchnode), pc);
 3631	if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
 3632		return slab;
 3633
 3634	return get_any_partial(s, pc);
 3635}
 3636
 3637#ifdef CONFIG_PREEMPTION
 3638/*
 3639 * Calculate the next globally unique transaction for disambiguation
 3640 * during cmpxchg. The transactions start with the cpu number and are then
 3641 * incremented by CONFIG_NR_CPUS.
 3642 */
 3643#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
 3644#else
 3645/*
 3646 * No preemption supported therefore also no need to check for
 3647 * different cpus.
 3648 */
 3649#define TID_STEP 1
 3650#endif /* CONFIG_PREEMPTION */
 3651
 3652static inline unsigned long next_tid(unsigned long tid)
 3653{
 3654	return tid + TID_STEP;
 3655}
 3656
#ifdef SLUB_DEBUG_CMPXCHG
/*
 * Decode helpers used only by the cmpxchg failure diagnostics in
 * note_cmpxchg_failure().
 */

/* The remainder modulo TID_STEP is the cpu the transaction id started from. */
static inline unsigned int tid_to_cpu(unsigned long tid)
{
	return tid % TID_STEP;
}

/* The quotient by TID_STEP is the event (step) number of the id. */
static inline unsigned long tid_to_event(unsigned long tid)
{
	return tid / TID_STEP;
}
#endif
 3668
 3669static inline unsigned int init_tid(int cpu)
 3670{
 3671	return cpu;
 3672}
 3673
 3674static inline void note_cmpxchg_failure(const char *n,
 3675		const struct kmem_cache *s, unsigned long tid)
 3676{
 3677#ifdef SLUB_DEBUG_CMPXCHG
 3678	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
 3679
 3680	pr_info("%s %s: cmpxchg redo ", n, s->name);
 3681
 3682	if (IS_ENABLED(CONFIG_PREEMPTION) &&
 3683	    tid_to_cpu(tid) != tid_to_cpu(actual_tid)) {
 3684		pr_warn("due to cpu change %d -> %d\n",
 3685			tid_to_cpu(tid), tid_to_cpu(actual_tid));
 3686	} else if (tid_to_event(tid) != tid_to_event(actual_tid)) {
 3687		pr_warn("due to cpu running other code. Event %ld->%ld\n",
 3688			tid_to_event(tid), tid_to_event(actual_tid));
 3689	} else {
 3690		pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
 3691			actual_tid, tid, next_tid(tid));
 3692	}
 3693#endif
 3694	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
 3695}
 3696
 3697static void init_kmem_cache_cpus(struct kmem_cache *s)
 3698{
 3699#ifdef CONFIG_PREEMPT_RT
 3700	/*
 3701	 * Register lockdep key for non-boot kmem caches to avoid
 3702	 * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key()
 3703	 */
 3704	bool finegrain_lockdep = !init_section_contains(s, 1);
 3705#else
 3706	/*
 3707	 * Don't bother with different lockdep classes for each
 3708	 * kmem_cache, since we only use local_trylock_irqsave().
 3709	 */
 3710	bool finegrain_lockdep = false;
 3711#endif
 3712	int cpu;
 3713	struct kmem_cache_cpu *c;
 3714
 3715	if (finegrain_lockdep)
 3716		lockdep_register_key(&s->lock_key);
 3717	for_each_possible_cpu(cpu) {
 3718		c = per_cpu_ptr(s->cpu_slab, cpu);
 3719		local_trylock_init(&c->lock);
 3720		if (finegrain_lockdep)
 3721			lockdep_set_class(&c->lock, &s->lock_key);
 3722		c->tid = init_tid(cpu);
 3723	}
 3724}
 3725
/*
 * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist,
 * unfreezes the slabs and puts it on the proper list.
 * Assumes the slab has been already safely taken away from kmem_cache_cpu
 * by the caller.
 *
 * @s:        cache the slab belongs to
 * @slab:     the (frozen) slab being deactivated
 * @freelist: the per-cpu freelist to splice back, may be NULL
 */
static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
			    void *freelist)
{
	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
	int free_delta = 0;
	void *nextfree, *freelist_iter, *freelist_tail;
	int tail = DEACTIVATE_TO_HEAD;
	unsigned long flags = 0;
	struct freelist_counters old, new;

	/*
	 * A non-empty slab freelist at this point is counted as remote
	 * frees and sends the slab to the tail of the partial list.
	 */
	if (READ_ONCE(slab->freelist)) {
		stat(s, DEACTIVATE_REMOTE_FREES);
		tail = DEACTIVATE_TO_TAIL;
	}

	/*
	 * Stage one: Count the objects on cpu's freelist as free_delta and
	 * remember the last object in freelist_tail for later splicing.
	 */
	freelist_tail = NULL;
	freelist_iter = freelist;
	while (freelist_iter) {
		nextfree = get_freepointer(s, freelist_iter);

		/*
		 * If 'nextfree' is invalid, it is possible that the object at
		 * 'freelist_iter' is already corrupted.  So isolate all objects
		 * starting at 'freelist_iter' by skipping them.
		 */
		if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
			break;

		freelist_tail = freelist_iter;
		free_delta++;

		freelist_iter = nextfree;
	}

	/*
	 * Stage two: Unfreeze the slab while splicing the per-cpu
	 * freelist to the head of slab's freelist.
	 */
	/* Retried until slab_update_freelist() succeeds. */
	do {
		old.freelist = READ_ONCE(slab->freelist);
		old.counters = READ_ONCE(slab->counters);
		VM_BUG_ON(!old.frozen);

		/* Determine target state of the slab */
		new.counters = old.counters;
		new.frozen = 0;
		if (freelist_tail) {
			/* Chain the slab's old freelist behind the cpu freelist. */
			new.inuse -= free_delta;
			set_freepointer(s, freelist_tail, old.freelist);
			new.freelist = freelist;
		} else {
			new.freelist = old.freelist;
		}
	} while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab"));

	/*
	 * Stage three: Manipulate the slab list based on the updated state.
	 */
	if (!new.inuse && n->nr_partial >= s->min_partial) {
		/* Empty slab and the node has enough partial slabs: free it. */
		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, slab);
		stat(s, FREE_SLAB);
	} else if (new.freelist) {
		/* Free objects remain: put the slab on the node partial list. */
		spin_lock_irqsave(&n->list_lock, flags);
		add_partial(n, slab, tail);
		spin_unlock_irqrestore(&n->list_lock, flags);
		/* 'tail' is also used as the statistics item to bump. */
		stat(s, tail);
	} else {
		/* No free objects: the slab is not added to any list here. */
		stat(s, DEACTIVATE_FULL);
	}
}
 3807
 3808/*
 3809 * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
 3810 * can be acquired without a deadlock before invoking the function.
 3811 *
 3812 * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
 3813 * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
 3814 * and kmalloc() is not used in an unsupported context.
 3815 *
 3816 * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
 3817 * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
 3818 * lockdep_assert() will catch a bug in case:
 3819 * #1
 3820 * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
 3821 * or
 3822 * #2
 3823 * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
 3824 *
 3825 * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
 * disabled context. The lock will always be acquired and, if needed, it
 * will block and sleep until the lock is available.
 3828 * #1 is possible in !PREEMPT_RT only.
 3829 * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
 3830 * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
 3831 *    tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
 3832 *
 3833 * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
 3834 */
/*
 * local_lock_cpu_slab(s, flags): take s->cpu_slab->lock.
 * On PREEMPT_RT, or when LOCKDEP is disabled, this is a plain
 * local_lock_irqsave().
 */
#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
#define local_lock_cpu_slab(s, flags)	\
	local_lock_irqsave(&(s)->cpu_slab->lock, flags)
#else
/*
 * !PREEMPT_RT with LOCKDEP: use the trylock variant and assert that it
 * succeeded.
 */
#define local_lock_cpu_slab(s, flags)					       \
	do {								       \
		bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
		lockdep_assert(__l);					       \
	} while (0)
#endif

/* Counterpart of local_lock_cpu_slab(); identical in every configuration. */
#define local_unlock_cpu_slab(s, flags)	\
	local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)
 3848
 3849#ifdef CONFIG_SLUB_CPU_PARTIAL
/*
 * Move every slab on the singly linked list starting at @partial_slab
 * (chained through slab->next) to the partial list of the node it belongs
 * to. Empty slabs are discarded instead when their node already has at
 * least s->min_partial partial slabs.
 */
static void __put_partials(struct kmem_cache *s, struct slab *partial_slab)
{
	struct kmem_cache_node *n = NULL, *n2 = NULL;
	struct slab *slab, *slab_to_discard = NULL;
	unsigned long flags = 0;

	while (partial_slab) {
		slab = partial_slab;
		partial_slab = slab->next;

		/* Switch list_lock only when the node actually changes. */
		n2 = get_node(s, slab_nid(slab));
		if (n != n2) {
			if (n)
				spin_unlock_irqrestore(&n->list_lock, flags);

			n = n2;
			spin_lock_irqsave(&n->list_lock, flags);
		}

		if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) {
			/* Collect it; it is discarded after the lock is dropped. */
			slab->next = slab_to_discard;
			slab_to_discard = slab;
		} else {
			add_partial(n, slab, DEACTIVATE_TO_TAIL);
			stat(s, FREE_ADD_PARTIAL);
		}
	}

	if (n)
		spin_unlock_irqrestore(&n->list_lock, flags);

	/* No list_lock is held any more: discard the collected empty slabs. */
	while (slab_to_discard) {
		slab = slab_to_discard;
		slab_to_discard = slab_to_discard->next;

		stat(s, DEACTIVATE_EMPTY);
		discard_slab(s, slab);
		stat(s, FREE_SLAB);
	}
}
 3890
 3891/*
 3892 * Put all the cpu partial slabs to the node partial list.
 3893 */
 3894static void put_partials(struct kmem_cache *s)
 3895{
 3896	struct slab *partial_slab;
 3897	unsigned long flags;
 3898
 3899	local_lock_irqsave(&s->cpu_slab->lock, flags);
 3900	partial_slab = this_cpu_read(s->cpu_slab->partial);
 3901	this_cpu_write(s->cpu_slab->partial, NULL);
 3902	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 3903
 3904	if (partial_slab)
 3905		__put_partials(s, partial_slab);
 3906}
 3907
 3908static void put_partials_cpu(struct kmem_cache *s,
 3909			     struct kmem_cache_cpu *c)
 3910{
 3911	struct slab *partial_slab;
 3912
 3913	partial_slab = slub_percpu_partial(c);
 3914	c->partial = NULL;
 3915
 3916	if (partial_slab)
 3917		__put_partials(s, partial_slab);
 3918}
 3919
/*
 * Put a slab into a partial slab slot if available.
 *
 * If we did not find a slot then simply move all the partials to the
 * per node partial list.
 *
 * @slab becomes the new head of this cpu's partial list. slab->slabs of
 * the head holds the number of slabs on that list. With @drain set, a
 * list that has reached s->cpu_partial_slabs is detached and handed to
 * __put_partials() once the lock is released, and @slab starts a new list.
 */
static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
{
	struct slab *oldslab;
	struct slab *slab_to_put = NULL;
	unsigned long flags;
	int slabs = 0;

	local_lock_cpu_slab(s, flags);

	oldslab = this_cpu_read(s->cpu_slab->partial);

	if (oldslab) {
		if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
			/*
			 * Partial array is full. Move the existing set to the
			 * per node partial list. Postpone the actual unfreezing
			 * outside of the critical section.
			 */
			slab_to_put = oldslab;
			oldslab = NULL;
		} else {
			/* Continue counting from the current list length. */
			slabs = oldslab->slabs;
		}
	}

	slabs++;

	/* Push @slab at the head; it carries the updated list length. */
	slab->slabs = slabs;
	slab->next = oldslab;

	this_cpu_write(s->cpu_slab->partial, slab);

	local_unlock_cpu_slab(s, flags);

	if (slab_to_put) {
		__put_partials(s, slab_to_put);
		stat(s, CPU_PARTIAL_DRAIN);
	}
}
 3965
 3966#else	/* CONFIG_SLUB_CPU_PARTIAL */
 3967
/* No per-cpu partial lists in this configuration: nothing to move. */
static inline void put_partials(struct kmem_cache *s) { }
static inline void put_partials_cpu(struct kmem_cache *s,
				    struct kmem_cache_cpu *c) { }
 3971
 3972#endif	/* CONFIG_SLUB_CPU_PARTIAL */
 3973
 3974static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 3975{
 3976	unsigned long flags;
 3977	struct slab *slab;
 3978	void *freelist;
 3979
 3980	local_lock_irqsave(&s->cpu_slab->lock, flags);
 3981
 3982	slab = c->slab;
 3983	freelist = c->freelist;
 3984
 3985	c->slab = NULL;
 3986	c->freelist = NULL;
 3987	c->tid = next_tid(c->tid);
 3988
 3989	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 3990
 3991	if (slab) {
 3992		deactivate_slab(s, slab, freelist);
 3993		stat(s, CPUSLAB_FLUSH);
 3994	}
 3995}
 3996
 3997static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
 3998{
 3999	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
 4000	void *freelist = c->freelist;
 4001	struct slab *slab = c->slab;
 4002
 4003	c->slab = NULL;
 4004	c->freelist = NULL;
 4005	c->tid = next_tid(c->tid);
 4006
 4007	if (slab) {
 4008		deactivate_slab(s, slab, freelist);
 4009		stat(s, CPUSLAB_FLUSH);
 4010	}
 4011
 4012	put_partials_cpu(s, c);
 4013}
 4014
/*
 * Flush the current cpu's state for cache @s: the active slab, if any,
 * followed by the per-cpu partial list.
 */
static inline void flush_this_cpu_slab(struct kmem_cache *s)
{
	struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);

	/* Unlocked peek; flush_slab() does the actual detach under the lock. */
	if (c->slab)
		flush_slab(s, c);

	put_partials(s);
}
 4024
 4025static bool has_cpu_slab(int cpu, struct kmem_cache *s)
 4026{
 4027	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
 4028
 4029	return c->slab || slub_percpu_partial(c);
 4030}
 4031
 4032static bool has_pcs_used(int cpu, struct kmem_cache *s)
 4033{
 4034	struct slub_percpu_sheaves *pcs;
 4035
 4036	if (!s->cpu_sheaves)
 4037		return false;
 4038
 4039	pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 4040
 4041	return (pcs->spare || pcs->rcu_free || pcs->main->size);
 4042}
 4043
 4044/*
 4045 * Flush cpu slab.
 4046 *
 4047 * Called from CPU work handler with migration disabled.
 4048 */
 4049static void flush_cpu_slab(struct work_struct *w)
 4050{
 4051	struct kmem_cache *s;
 4052	struct slub_flush_work *sfw;
 4053
 4054	sfw = container_of(w, struct slub_flush_work, work);
 4055
 4056	s = sfw->s;
 4057
 4058	if (s->cpu_sheaves)
 4059		pcs_flush_all(s);
 4060
 4061	flush_this_cpu_slab(s);
 4062}
 4063
 4064static void flush_all_cpus_locked(struct kmem_cache *s)
 4065{
 4066	struct slub_flush_work *sfw;
 4067	unsigned int cpu;
 4068
 4069	lockdep_assert_cpus_held();
 4070	mutex_lock(&flush_lock);
 4071
 4072	for_each_online_cpu(cpu) {
 4073		sfw = &per_cpu(slub_flush, cpu);
 4074		if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) {
 4075			sfw->skip = true;
 4076			continue;
 4077		}
 4078		INIT_WORK(&sfw->work, flush_cpu_slab);
 4079		sfw->skip = false;
 4080		sfw->s = s;
 4081		queue_work_on(cpu, flushwq, &sfw->work);
 4082	}
 4083
 4084	for_each_online_cpu(cpu) {
 4085		sfw = &per_cpu(slub_flush, cpu);
 4086		if (sfw->skip)
 4087			continue;
 4088		flush_work(&sfw->work);
 4089	}
 4090
 4091	mutex_unlock(&flush_lock);
 4092}
 4093
/*
 * Flush the per-cpu state of @s on all online cpus, taking the cpu hotplug
 * read lock around flush_all_cpus_locked().
 */
static void flush_all(struct kmem_cache *s)
{
	cpus_read_lock();
	flush_all_cpus_locked(s);
	cpus_read_unlock();
}
 4100
 4101static void flush_rcu_sheaf(struct work_struct *w)
 4102{
 4103	struct slub_percpu_sheaves *pcs;
 4104	struct slab_sheaf *rcu_free;
 4105	struct slub_flush_work *sfw;
 4106	struct kmem_cache *s;
 4107
 4108	sfw = container_of(w, struct slub_flush_work, work);
 4109	s = sfw->s;
 4110
 4111	local_lock(&s->cpu_sheaves->lock);
 4112	pcs = this_cpu_ptr(s->cpu_sheaves);
 4113
 4114	rcu_free = pcs->rcu_free;
 4115	pcs->rcu_free = NULL;
 4116
 4117	local_unlock(&s->cpu_sheaves->lock);
 4118
 4119	if (rcu_free)
 4120		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
 4121}
 4122
 4123
 4124/* needed for kvfree_rcu_barrier() */
 4125void flush_rcu_sheaves_on_cache(struct kmem_cache *s)
 4126{
 4127	struct slub_flush_work *sfw;
 4128	unsigned int cpu;
 4129
 4130	mutex_lock(&flush_lock);
 4131
 4132	for_each_online_cpu(cpu) {
 4133		sfw = &per_cpu(slub_flush, cpu);
 4134
 4135		/*
 4136		 * we don't check if rcu_free sheaf exists - racing
 4137		 * __kfree_rcu_sheaf() might have just removed it.
 4138		 * by executing flush_rcu_sheaf() on the cpu we make
 4139		 * sure the __kfree_rcu_sheaf() finished its call_rcu()
 4140		 */
 4141
 4142		INIT_WORK(&sfw->work, flush_rcu_sheaf);
 4143		sfw->s = s;
 4144		queue_work_on(cpu, flushwq, &sfw->work);
 4145	}
 4146
 4147	for_each_online_cpu(cpu) {
 4148		sfw = &per_cpu(slub_flush, cpu);
 4149		flush_work(&sfw->work);
 4150	}
 4151
 4152	mutex_unlock(&flush_lock);
 4153}
 4154
 4155void flush_all_rcu_sheaves(void)
 4156{
 4157	struct kmem_cache *s;
 4158
 4159	cpus_read_lock();
 4160	mutex_lock(&slab_mutex);
 4161
 4162	list_for_each_entry(s, &slab_caches, list) {
 4163		if (!s->cpu_sheaves)
 4164			continue;
 4165		flush_rcu_sheaves_on_cache(s);
 4166	}
 4167
 4168	mutex_unlock(&slab_mutex);
 4169	cpus_read_unlock();
 4170
 4171	rcu_barrier();
 4172}
 4173
 4174/*
 4175 * Use the cpu notifier to insure that the cpu slabs are flushed when
 4176 * necessary.
 4177 */
 4178static int slub_cpu_dead(unsigned int cpu)
 4179{
 4180	struct kmem_cache *s;
 4181
 4182	mutex_lock(&slab_mutex);
 4183	list_for_each_entry(s, &slab_caches, list) {
 4184		__flush_cpu_slab(s, cpu);
 4185		if (s->cpu_sheaves)
 4186			__pcs_flush_all_cpu(s, cpu);
 4187	}
 4188	mutex_unlock(&slab_mutex);
 4189	return 0;
 4190}
 4191
 4192/*
 4193 * Check if the objects in a per cpu structure fit numa
 4194 * locality expectations.
 4195 */
 4196static inline int node_match(struct slab *slab, int node)
 4197{
 4198#ifdef CONFIG_NUMA
 4199	if (node != NUMA_NO_NODE && slab_nid(slab) != node)
 4200		return 0;
 4201#endif
 4202	return 1;
 4203}
 4204
 4205#ifdef CONFIG_SLUB_DEBUG
/* Number of objects in @slab that are not counted as in use. */
static int count_free(struct slab *slab)
{
	return slab->objects - slab->inuse;
}
 4210
/* Read the total_objects counter of node structure @n. */
static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
{
	return atomic_long_read(&n->total_objects);
}
 4215
/*
 * Supports checking bulk free of a constructed freelist
 *
 * Walks the freelist from @head to @tail, running the per-object debug
 * processing (consistency checks, free tracking, tracing and
 * re-initialization) on each object.
 *
 * @bulk_cnt is in/out: on entry it holds the number of objects the caller
 * expects on the list. If the walk finds a different number, an error is
 * reported and *@bulk_cnt is overwritten with the count reached.
 *
 * Returns true only if @tail was reached without a failed check.
 */
static inline bool free_debug_processing(struct kmem_cache *s,
	struct slab *slab, void *head, void *tail, int *bulk_cnt,
	unsigned long addr, depot_stack_handle_t handle)
{
	bool checks_ok = false;
	void *object = head;
	int cnt = 0;

	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!check_slab(s, slab))
			goto out;
	}

	/* Cannot free more objects than the slab has allocated. */
	if (slab->inuse < *bulk_cnt) {
		slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
			 slab->inuse, *bulk_cnt);
		goto out;
	}

next_object:

	/* The list is longer than announced: stop with checks_ok == false. */
	if (++cnt > *bulk_cnt)
		goto out_cnt;

	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
		if (!free_consistency_checks(s, slab, object, addr))
			goto out;
	}

	if (s->flags & SLAB_STORE_USER)
		set_track_update(s, object, TRACK_FREE, addr, handle);
	trace(s, slab, object, 0);
	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
	init_object(s, object, SLUB_RED_INACTIVE);

	/* Reached end of constructed freelist yet? */
	if (object != tail) {
		object = get_freepointer(s, object);
		goto next_object;
	}
	checks_ok = true;

out_cnt:
	/* Report a length mismatch and hand the count reached to the caller. */
	if (cnt != *bulk_cnt) {
		slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
			 *bulk_cnt, cnt);
		*bulk_cnt = cnt;
	}

out:

	if (!checks_ok)
		slab_fix(s, "Object at 0x%p not freed", object);

	return checks_ok;
}
 4273#endif /* CONFIG_SLUB_DEBUG */
 4274
 4275#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS)
 4276static unsigned long count_partial(struct kmem_cache_node *n,
 4277					int (*get_count)(struct slab *))
 4278{
 4279	unsigned long flags;
 4280	unsigned long x = 0;
 4281	struct slab *slab;
 4282
 4283	spin_lock_irqsave(&n->list_lock, flags);
 4284	list_for_each_entry(slab, &n->partial, slab_list)
 4285		x += get_count(slab);
 4286	spin_unlock_irqrestore(&n->list_lock, flags);
 4287	return x;
 4288}
 4289#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
 4290
 4291#ifdef CONFIG_SLUB_DEBUG
 4292#define MAX_PARTIAL_TO_SCAN 10000
 4293
 4294static unsigned long count_partial_free_approx(struct kmem_cache_node *n)
 4295{
 4296	unsigned long flags;
 4297	unsigned long x = 0;
 4298	struct slab *slab;
 4299
 4300	spin_lock_irqsave(&n->list_lock, flags);
 4301	if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) {
 4302		list_for_each_entry(slab, &n->partial, slab_list)
 4303			x += slab->objects - slab->inuse;
 4304	} else {
 4305		/*
 4306		 * For a long list, approximate the total count of objects in
 4307		 * it to meet the limit on the number of slabs to scan.
 4308		 * Scan from both the list's head and tail for better accuracy.
 4309		 */
 4310		unsigned long scanned = 0;
 4311
 4312		list_for_each_entry(slab, &n->partial, slab_list) {
 4313			x += slab->objects - slab->inuse;
 4314			if (++scanned == MAX_PARTIAL_TO_SCAN / 2)
 4315				break;
 4316		}
 4317		list_for_each_entry_reverse(slab, &n->partial, slab_list) {
 4318			x += slab->objects - slab->inuse;
 4319			if (++scanned == MAX_PARTIAL_TO_SCAN)
 4320				break;
 4321		}
 4322		x = mult_frac(x, n->nr_partial, scanned);
 4323		x = min(x, node_nr_objs(n));
 4324	}
 4325	spin_unlock_irqrestore(&n->list_lock, flags);
 4326	return x;
 4327}
 4328
 4329static noinline void
 4330slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
 4331{
 4332	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 4333				      DEFAULT_RATELIMIT_BURST);
 4334	int cpu = raw_smp_processor_id();
 4335	int node;
 4336	struct kmem_cache_node *n;
 4337
 4338	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
 4339		return;
 4340
 4341	pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n",
 4342		cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags);
 4343	pr_warn("  cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
 4344		s->name, s->object_size, s->size, oo_order(s->oo),
 4345		oo_order(s->min));
 4346
 4347	if (oo_order(s->min) > get_order(s->object_size))
 4348		pr_warn("  %s debugging increased min order, use slab_debug=O to disable.\n",
 4349			s->name);
 4350
 4351	for_each_kmem_cache_node(s, node, n) {
 4352		unsigned long nr_slabs;
 4353		unsigned long nr_objs;
 4354		unsigned long nr_free;
 4355
 4356		nr_free  = count_partial_free_approx(n);
 4357		nr_slabs = node_nr_slabs(n);
 4358		nr_objs  = node_nr_objs(n);
 4359
 4360		pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
 4361			node, nr_slabs, nr_objs, nr_free);
 4362	}
 4363}
 4364#else /* CONFIG_SLUB_DEBUG */
/* Without CONFIG_SLUB_DEBUG no out-of-memory report is printed. */
static inline void
slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { }
 4367#endif
 4368
 4369static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
 4370{
 4371	if (unlikely(slab_test_pfmemalloc(slab)))
 4372		return gfp_pfmemalloc_allowed(gfpflags);
 4373
 4374	return true;
 4375}
 4376
/*
 * Try to replace the (freelist, tid) pair of this cpu's kmem_cache_cpu with
 * a single this_cpu_try_cmpxchg_freelist().
 *
 * The expected pair is (@freelist_old, @tid); the new pair is
 * (@freelist_new, next_tid(@tid)). The result of the cmpxchg is returned
 * unchanged; callers in this file treat false as "retry".
 */
static inline bool
__update_cpu_freelist_fast(struct kmem_cache *s,
			   void *freelist_old, void *freelist_new,
			   unsigned long tid)
{
	struct freelist_tid old = { .freelist = freelist_old, .tid = tid };
	struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) };

	return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid,
					     &old.freelist_tid, new.freelist_tid);
}
 4388
/*
 * Check the slab->freelist and either transfer the freelist to the
 * per cpu freelist or deactivate the slab.
 *
 * The slab is still frozen if the return value is not NULL.
 *
 * If this function returns NULL then the slab has been unfrozen.
 *
 * Must be called with the cpu_slab local lock held (asserted below).
 */
static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
{
	struct freelist_counters old, new;

	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

	/* Retry until freelist and counters are swapped as one unit. */
	do {
		old.freelist = slab->freelist;
		old.counters = slab->counters;

		/* Take the whole freelist; the slab keeps none of it. */
		new.freelist = NULL;
		new.counters = old.counters;

		/* Every object now counts as in use. */
		new.inuse = old.objects;
		/* Stay frozen only if there was a freelist to take. */
		new.frozen = old.freelist != NULL;


	} while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist"));

	return old.freelist;
}
 4418
/*
 * Freeze the partial slab and return the pointer to the freelist.
 *
 * The slab's freelist is emptied, all of its objects are marked in use and
 * the frozen bit is set. The slab must not be frozen already.
 */
static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
{
	struct freelist_counters old, new;

	/* Retry until freelist and counters are swapped as one unit. */
	do {
		old.freelist = slab->freelist;
		old.counters = slab->counters;

		new.freelist = NULL;
		new.counters = old.counters;
		/* new.counters was just copied, so this checks the old state. */
		VM_BUG_ON(new.frozen);

		new.inuse = old.objects;
		new.frozen = 1;

	} while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab"));

	return old.freelist;
}
 4441
/*
 * Slow path. The lockless freelist is empty or we need to perform
 * debugging duties.
 *
 * Processing is still very fast if new objects have been freed to the
 * regular freelist. In that case we simply take over the regular freelist
 * as the lockless freelist and zap the regular freelist.
 *
 * If that is not working then we fall back to the partial lists. We take the
 * first element of the freelist as the object to allocate now and move the
 * rest of the freelist to the lockless freelist.
 *
 * And if we were unable to get a new slab from the partial slab lists then
 * we need to allocate a new slab. This is the slowest path since it involves
 * a call to the page allocator and the setup of a new slab.
 *
 * Version of __slab_alloc to use when we know that preemption is
 * already disabled (which is the case for bulk allocation).
 *
 * @s:         cache to allocate from
 * @gfpflags:  gfp flags of the request
 * @node:      requested node, or NUMA_NO_NODE
 * @addr:      recorded by set_track() for caches with SLAB_STORE_USER
 * @c:         per cpu structure of @s to operate on
 * @orig_size: handed to get_partial() and alloc_single_from_new_slab()
 *
 * Returns the allocated object, or NULL on failure.
 */
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
	bool allow_spin = gfpflags_allow_spinning(gfpflags);
	void *freelist;
	struct slab *slab;
	unsigned long flags;
	struct partial_context pc;
	bool try_thisnode = true;

	stat(s, ALLOC_SLOWPATH);

reread_slab:

	slab = READ_ONCE(c->slab);
	if (!slab) {
		/*
		 * if the node is not online or has no normal memory, just
		 * ignore the node constraint
		 */
		if (unlikely(node != NUMA_NO_NODE &&
			     !node_isset(node, slab_nodes)))
			node = NUMA_NO_NODE;
		goto new_slab;
	}

	if (unlikely(!node_match(slab, node))) {
		/*
		 * same as above but node_match() being false already
		 * implies node != NUMA_NO_NODE.
		 *
		 * We don't strictly honor pfmemalloc and NUMA preferences
		 * when !allow_spin because:
		 *
		 * 1. Most kmalloc() users allocate objects on the local node,
		 *    so kmalloc_nolock() tries not to interfere with them by
		 *    deactivating the cpu slab.
		 *
		 * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause
		 *    unnecessary slab allocations even when n->partial list
		 *    is not empty.
		 */
		if (!node_isset(node, slab_nodes) ||
		    !allow_spin) {
			node = NUMA_NO_NODE;
		} else {
			stat(s, ALLOC_NODE_MISMATCH);
			goto deactivate_slab;
		}
	}

	/*
	 * By rights, we should be searching for a slab page that was
	 * PFMEMALLOC but right now, we are losing the pfmemalloc
	 * information when the page leaves the per-cpu allocator
	 */
	if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin))
		goto deactivate_slab;

	/* must check again c->slab in case we got preempted and it changed */
	local_lock_cpu_slab(s, flags);

	if (unlikely(slab != c->slab)) {
		local_unlock_cpu_slab(s, flags);
		goto reread_slab;
	}
	freelist = c->freelist;
	if (freelist)
		goto load_freelist;

	freelist = get_freelist(s, slab);

	if (!freelist) {
		/* Nothing left in the cpu slab: drop it and look elsewhere. */
		c->slab = NULL;
		c->tid = next_tid(c->tid);
		local_unlock_cpu_slab(s, flags);
		stat(s, DEACTIVATE_BYPASS);
		goto new_slab;
	}

	stat(s, ALLOC_REFILL);

load_freelist:

	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

	/*
	 * freelist is pointing to the list of objects to be used.
	 * slab is pointing to the slab from which the objects are obtained.
	 * That slab must be frozen for per cpu allocations to work.
	 */
	VM_BUG_ON(!c->slab->frozen);
	/* The first object is returned; the rest becomes the cpu freelist. */
	c->freelist = get_freepointer(s, freelist);
	c->tid = next_tid(c->tid);
	local_unlock_cpu_slab(s, flags);
	return freelist;

deactivate_slab:

	local_lock_cpu_slab(s, flags);
	if (slab != c->slab) {
		local_unlock_cpu_slab(s, flags);
		goto reread_slab;
	}
	/* Detach slab and freelist under the lock, deactivate after unlock. */
	freelist = c->freelist;
	c->slab = NULL;
	c->freelist = NULL;
	c->tid = next_tid(c->tid);
	local_unlock_cpu_slab(s, flags);
	deactivate_slab(s, slab, freelist);

new_slab:

#ifdef CONFIG_SLUB_CPU_PARTIAL
	while (slub_percpu_partial(c)) {
		local_lock_cpu_slab(s, flags);
		if (unlikely(c->slab)) {
			local_unlock_cpu_slab(s, flags);
			goto reread_slab;
		}
		if (unlikely(!slub_percpu_partial(c))) {
			local_unlock_cpu_slab(s, flags);
			/* we were preempted and partial list got empty */
			goto new_objects;
		}

		/*
		 * Take the first slab off the per cpu partial list.
		 * NOTE(review): slub_set_percpu_partial() presumably advances
		 * the list head to slab->next - confirm against its definition.
		 */
		slab = slub_percpu_partial(c);
		slub_set_percpu_partial(c, slab);

		if (likely(node_match(slab, node) &&
			   pfmemalloc_match(slab, gfpflags)) ||
		    !allow_spin) {
			c->slab = slab;
			freelist = get_freelist(s, slab);
			VM_BUG_ON(!freelist);
			stat(s, CPU_PARTIAL_ALLOC);
			goto load_freelist;
		}

		local_unlock_cpu_slab(s, flags);

		/* Mismatching slab: hand it to __put_partials() on its own. */
		slab->next = NULL;
		__put_partials(s, slab);
	}
#endif

new_objects:

	pc.flags = gfpflags;
	/*
	 * When a preferred node is indicated but no __GFP_THISNODE
	 *
	 * 1) try to get a partial slab from target node only by having
	 *    __GFP_THISNODE in pc.flags for get_partial()
	 * 2) if 1) failed, try to allocate a new slab from target node with
	 *    GFP_NOWAIT | __GFP_THISNODE opportunistically
	 * 3) if 2) failed, retry with original gfpflags which will allow
	 *    get_partial() try partial lists of other nodes before potentially
	 *    allocating new page from other nodes
	 */
	if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
		     && try_thisnode)) {
		if (unlikely(!allow_spin))
			/* Do not upgrade gfp to NOWAIT from more restrictive mode */
			pc.flags = gfpflags | __GFP_THISNODE;
		else
			pc.flags = GFP_NOWAIT | __GFP_THISNODE;
	}

	pc.orig_size = orig_size;
	slab = get_partial(s, node, &pc);
	if (slab) {
		if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
			freelist = pc.object;
			/*
			 * For debug caches here we had to go through
			 * alloc_single_from_partial() so just store the
			 * tracking info and return the object.
			 *
			 * Due to disabled preemption we need to disallow
			 * blocking. The flags are further adjusted by
			 * gfp_nested_mask() in stack_depot itself.
			 */
			if (s->flags & SLAB_STORE_USER)
				set_track(s, freelist, TRACK_ALLOC, addr,
					  gfpflags & ~(__GFP_DIRECT_RECLAIM));

			return freelist;
		}

		freelist = freeze_slab(s, slab);
		goto retry_load_slab;
	}

	/* The cpu ptr is released around new_slab() and taken again after. */
	slub_put_cpu_ptr(s->cpu_slab);
	slab = new_slab(s, pc.flags, node);
	c = slub_get_cpu_ptr(s->cpu_slab);

	if (unlikely(!slab)) {
		/* Step 3 above: retry once without the this-node restriction. */
		if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
		    && try_thisnode) {
			try_thisnode = false;
			goto new_objects;
		}
		slab_out_of_memory(s, gfpflags, node);
		return NULL;
	}

	stat(s, ALLOC_SLAB);

	if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
		freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);

		if (unlikely(!freelist)) {
			/* This could cause an endless loop. Fail instead. */
			if (!allow_spin)
				return NULL;
			goto new_objects;
		}

		if (s->flags & SLAB_STORE_USER)
			set_track(s, freelist, TRACK_ALLOC, addr,
				  gfpflags & ~(__GFP_DIRECT_RECLAIM));

		return freelist;
	}

	/*
	 * No other reference to the slab yet so we can
	 * muck around with it freely without cmpxchg
	 */
	freelist = slab->freelist;
	slab->freelist = NULL;
	slab->inuse = slab->objects;
	slab->frozen = 1;

	inc_slabs_node(s, slab_nid(slab), slab->objects);

	if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
		/*
		 * For !pfmemalloc_match() case we don't load freelist so that
		 * we don't make further mismatched allocations easier.
		 */
		deactivate_slab(s, slab, get_freepointer(s, freelist));
		return freelist;
	}

retry_load_slab:

	local_lock_cpu_slab(s, flags);
	if (unlikely(c->slab)) {
		/*
		 * A cpu slab is installed again: flush it and retry, so the
		 * slab obtained above can take its place.
		 */
		void *flush_freelist = c->freelist;
		struct slab *flush_slab = c->slab;

		c->slab = NULL;
		c->freelist = NULL;
		c->tid = next_tid(c->tid);

		local_unlock_cpu_slab(s, flags);

		if (unlikely(!allow_spin)) {
			/* Reentrant slub cannot take locks, defer */
			defer_deactivate_slab(flush_slab, flush_freelist);
		} else {
			deactivate_slab(s, flush_slab, flush_freelist);
		}

		stat(s, CPUSLAB_FLUSH);

		goto retry_load_slab;
	}
	c->slab = slab;

	goto load_freelist;
}
/*
 * We disallow kprobes in ___slab_alloc() to prevent reentrance
 *
 * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
 * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
 * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
 * manipulating c->freelist without lock.
 *
 * This does not prevent kprobe in functions called from ___slab_alloc() such as
 * local_lock_irqsave() itself, and that is fine, we only need to protect the
 * c->freelist manipulation in ___slab_alloc() itself.
 */
NOKPROBE_SYMBOL(___slab_alloc);
 4749
 4750/*
 4751 * A wrapper for ___slab_alloc() for contexts where preemption is not yet
 4752 * disabled. Compensates for possible cpu changes by refetching the per cpu area
 4753 * pointer.
 4754 */
 4755static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 4756			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
 4757{
 4758	void *p;
 4759
 4760#ifdef CONFIG_PREEMPT_COUNT
 4761	/*
 4762	 * We may have been preempted and rescheduled on a different
 4763	 * cpu before disabling preemption. Need to reload cpu area
 4764	 * pointer.
 4765	 */
 4766	c = slub_get_cpu_ptr(s->cpu_slab);
 4767#endif
 4768	if (unlikely(!gfpflags_allow_spinning(gfpflags))) {
 4769		if (local_lock_is_locked(&s->cpu_slab->lock)) {
 4770			/*
 4771			 * EBUSY is an internal signal to kmalloc_nolock() to
 4772			 * retry a different bucket. It's not propagated
 4773			 * to the caller.
 4774			 */
 4775			p = ERR_PTR(-EBUSY);
 4776			goto out;
 4777		}
 4778	}
 4779	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
 4780out:
 4781#ifdef CONFIG_PREEMPT_COUNT
 4782	slub_put_cpu_ptr(s->cpu_slab);
 4783#endif
 4784	return p;
 4785}
 4786
/*
 * Allocate one object from @s, preferring the lockless per cpu freelist.
 *
 * The fast path pops the first object of the cpu freelist with a cmpxchg of
 * the (freelist, tid) pair and starts over if that fails. When the lockless
 * fast path may not be used, the cpu freelist or cpu slab is empty, or the
 * cpu slab does not match @node, the request goes to __slab_alloc().
 *
 * Returns the object, or whatever __slab_alloc() returned on the slow path.
 */
static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
	struct kmem_cache_cpu *c;
	struct slab *slab;
	unsigned long tid;
	void *object;

redo:
	/*
	 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
	 * enabled. We may switch back and forth between cpus while
	 * reading from one cpu area. That does not matter as long
	 * as we end up on the original cpu again when doing the cmpxchg.
	 *
	 * We must guarantee that tid and kmem_cache_cpu are retrieved on the
	 * same cpu. We read first the kmem_cache_cpu pointer and use it to read
	 * the tid. If we are preempted and switched to another cpu between the
	 * two reads, it's OK as the two are still associated with the same cpu
	 * and cmpxchg later will validate the cpu.
	 */
	c = raw_cpu_ptr(s->cpu_slab);
	tid = READ_ONCE(c->tid);

	/*
	 * Irqless object alloc/free algorithm used here depends on sequence
	 * of fetching cpu_slab's data. tid should be fetched before anything
	 * on c to guarantee that object and slab associated with previous tid
	 * won't be used with current tid. If we fetch tid first, object and
	 * slab could be one associated with next tid and our alloc/free
	 * request will be failed. In this case, we will retry. So, no problem.
	 */
	barrier();

	/*
	 * The transaction ids are globally unique per cpu and per operation on
	 * a per cpu queue. Thus they guarantee that the cmpxchg_double
	 * occurs on the right processor and that there was no operation on the
	 * linked list in between.
	 */

	object = c->freelist;
	slab = c->slab;

#ifdef CONFIG_NUMA
	if (static_branch_unlikely(&strict_numa) &&
			node == NUMA_NO_NODE) {

		struct mempolicy *mpol = current->mempolicy;

		if (mpol) {
			/*
			 * Special BIND rule support. If existing slab
			 * is in permitted set then do not redirect
			 * to a particular node.
			 * Otherwise we apply the memory policy to get
			 * the node we need to allocate on.
			 */
			if (mpol->mode != MPOL_BIND || !slab ||
					!node_isset(slab_nid(slab), mpol->nodes))

				node = mempolicy_slab_node();
		}
	}
#endif

	if (!USE_LOCKLESS_FAST_PATH() ||
	    unlikely(!object || !slab || !node_match(slab, node))) {
		/* Slow path; it is handed the cpu pointer read above. */
		object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
	} else {
		void *next_object = get_freepointer_safe(s, object);

		/*
		 * The cmpxchg will only match if there was no additional
		 * operation and if we are on the right processor.
		 *
		 * The cmpxchg does the following atomically (without lock
		 * semantics!)
		 * 1. Relocate first pointer to the current per cpu area.
		 * 2. Verify that tid and freelist have not been changed
		 * 3. If they were not changed replace tid and freelist
		 *
		 * Since this is without lock semantics the protection is only
		 * against code executing on this cpu *not* from access by
		 * other cpus.
		 */
		if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
			note_cmpxchg_failure("slab_alloc", s, tid);
			goto redo;
		}
		prefetch_freepointer(s, next_object);
		stat(s, ALLOC_FASTPATH);
	}

	return object;
}
 4883
 4884/*
 4885 * If the object has been wiped upon free, make sure it's fully initialized by
 4886 * zeroing out freelist pointer.
 4887 *
 4888 * Note that we also wipe custom freelist pointers.
 4889 */
 4890static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
 4891						   void *obj)
 4892{
 4893	if (unlikely(slab_want_init_on_free(s)) && obj &&
 4894	    !freeptr_outside_object(s))
 4895		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
 4896			0, sizeof(void *));
 4897}
 4898
 4899static __fastpath_inline
 4900struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 4901{
 4902	flags &= gfp_allowed_mask;
 4903
 4904	might_alloc(flags);
 4905
 4906	if (unlikely(should_failslab(s, flags)))
 4907		return NULL;
 4908
 4909	return s;
 4910}
 4911
 4912static __fastpath_inline
 4913bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
 4914			  gfp_t flags, size_t size, void **p, bool init,
 4915			  unsigned int orig_size)
 4916{
 4917	unsigned int zero_size = s->object_size;
 4918	bool kasan_init = init;
 4919	size_t i;
 4920	gfp_t init_flags = flags & gfp_allowed_mask;
 4921
 4922	/*
 4923	 * For kmalloc object, the allocated memory size(object_size) is likely
 4924	 * larger than the requested size(orig_size). If redzone check is
 4925	 * enabled for the extra space, don't zero it, as it will be redzoned
 4926	 * soon. The redzone operation for this extra space could be seen as a
 4927	 * replacement of current poisoning under certain debug option, and
 4928	 * won't break other sanity checks.
 4929	 */
 4930	if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) &&
 4931	    (s->flags & SLAB_KMALLOC))
 4932		zero_size = orig_size;
 4933
 4934	/*
 4935	 * When slab_debug is enabled, avoid memory initialization integrated
 4936	 * into KASAN and instead zero out the memory via the memset below with
 4937	 * the proper size. Otherwise, KASAN might overwrite SLUB redzones and
 4938	 * cause false-positive reports. This does not lead to a performance
 4939	 * penalty on production builds, as slab_debug is not intended to be
 4940	 * enabled there.
 4941	 */
 4942	if (__slub_debug_enabled())
 4943		kasan_init = false;
 4944
 4945	/*
 4946	 * As memory initialization might be integrated into KASAN,
 4947	 * kasan_slab_alloc and initialization memset must be
 4948	 * kept together to avoid discrepancies in behavior.
 4949	 *
 4950	 * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
 4951	 */
 4952	for (i = 0; i < size; i++) {
 4953		p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init);
 4954		if (p[i] && init && (!kasan_init ||
 4955				     !kasan_has_integrated_init()))
 4956			memset(p[i], 0, zero_size);
 4957		if (gfpflags_allow_spinning(flags))
 4958			kmemleak_alloc_recursive(p[i], s->object_size, 1,
 4959						 s->flags, init_flags);
 4960		kmsan_slab_alloc(s, p[i], init_flags);
 4961		alloc_tagging_slab_alloc_hook(s, p[i], flags);
 4962	}
 4963
 4964	return memcg_slab_post_alloc_hook(s, lru, flags, size, p);
 4965}
 4966
/*
 * Replace the empty main sheaf with a (at least partially) full sheaf.
 *
 * Must be called with the cpu_sheaves local lock locked. If successful, returns
 * the pcs pointer and the local lock locked (possibly on a different cpu than
 * initially called). If not successful, returns NULL and the local lock
 * unlocked.
 *
 * Sources are tried in this order: a non-empty spare sheaf, a full sheaf
 * from the barn and, only if @gfp allows blocking, refilling an empty sheaf
 * or allocating a full one with the local lock dropped.
 */
static struct slub_percpu_sheaves *
__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp)
{
	struct slab_sheaf *empty = NULL;
	struct slab_sheaf *full;
	struct node_barn *barn;
	bool can_alloc;

	lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));

	/* Cheapest option: the spare has objects, so swap it with main. */
	if (pcs->spare && pcs->spare->size > 0) {
		swap(pcs->main, pcs->spare);
		return pcs;
	}

	barn = get_barn(s);
	if (!barn) {
		local_unlock(&s->cpu_sheaves->lock);
		return NULL;
	}

	/* Trade the empty main sheaf for a full one from the barn. */
	full = barn_replace_empty_sheaf(barn, pcs->main);

	if (full) {
		stat(s, BARN_GET);
		pcs->main = full;
		return pcs;
	}

	stat(s, BARN_GET_FAIL);

	can_alloc = gfpflags_allow_blocking(gfp);

	/* Pick an empty sheaf to refill while the lock is still held. */
	if (can_alloc) {
		if (pcs->spare) {
			empty = pcs->spare;
			pcs->spare = NULL;
		} else {
			empty = barn_get_empty_sheaf(barn);
		}
	}

	local_unlock(&s->cpu_sheaves->lock);

	if (!can_alloc)
		return NULL;

	if (empty) {
		/* refill_sheaf() returning zero is treated as success here. */
		if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC)) {
			full = empty;
		} else {
			/*
			 * we must be very low on memory so don't bother
			 * with the barn
			 */
			free_empty_sheaf(s, empty);
		}
	} else {
		full = alloc_full_sheaf(s, gfp);
	}

	if (!full)
		return NULL;

	/*
	 * we can reach here only when gfpflags_allow_blocking
	 * so this must not be an irq
	 */
	local_lock(&s->cpu_sheaves->lock);
	/* The lock was dropped, so look up this cpu's sheaves again. */
	pcs = this_cpu_ptr(s->cpu_sheaves);

	/*
	 * If we are returning empty sheaf, we either got it from the
	 * barn or had to allocate one. If we are returning a full
	 * sheaf, it's due to racing or being migrated to a different
	 * cpu. Breaching the barn's sheaf limits should be thus rare
	 * enough so just ignore them to simplify the recovery.
	 */

	if (pcs->main->size == 0) {
		barn_put_empty_sheaf(barn, pcs->main);
		pcs->main = full;
		return pcs;
	}

	/* Main has objects by now: keep the new sheaf as the spare. */
	if (!pcs->spare) {
		pcs->spare = full;
		return pcs;
	}

	if (pcs->spare->size == 0) {
		barn_put_empty_sheaf(barn, pcs->spare);
		pcs->spare = full;
		return pcs;
	}

	/* No room in the percpu slots: hand the full sheaf to the barn. */
	barn_put_full_sheaf(barn, full);
	stat(s, BARN_PUT);

	return pcs;
}
 5076
/*
 * Try to allocate one object from this cpu's percpu sheaves of @s.
 *
 * Returns the object, or NULL when the sheaves cannot serve the request:
 * a node other than the local one is wanted, the local lock cannot be
 * taken with a trylock, the empty main sheaf cannot be replaced, or the
 * candidate object is not on the requested node.
 */
static __fastpath_inline
void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)
{
	struct slub_percpu_sheaves *pcs;
	bool node_requested;
	void *object;

#ifdef CONFIG_NUMA
	if (static_branch_unlikely(&strict_numa) &&
			 node == NUMA_NO_NODE) {

		struct mempolicy *mpol = current->mempolicy;

		if (mpol) {
			/*
			 * Special BIND rule support. If the local node
			 * is in permitted set then do not redirect
			 * to a particular node.
			 * Otherwise we apply the memory policy to get
			 * the node we need to allocate on.
			 */
			if (mpol->mode != MPOL_BIND ||
					!node_isset(numa_mem_id(), mpol->nodes))

				node = mempolicy_slab_node();
		}
	}
#endif

	node_requested = IS_ENABLED(CONFIG_NUMA) && node != NUMA_NO_NODE;

	/*
	 * We assume the percpu sheaves contain only local objects although it's
	 * not completely guaranteed, so we verify later.
	 */
	if (unlikely(node_requested && node != numa_mem_id()))
		return NULL;

	if (!local_trylock(&s->cpu_sheaves->lock))
		return NULL;

	pcs = this_cpu_ptr(s->cpu_sheaves);

	if (unlikely(pcs->main->size == 0)) {
		/* On failure the helper has already dropped the local lock. */
		pcs = __pcs_replace_empty_main(s, pcs, gfp);
		if (unlikely(!pcs))
			return NULL;
	}

	/* Look at the last object; it is only removed further below. */
	object = pcs->main->objects[pcs->main->size - 1];

	if (unlikely(node_requested)) {
		/*
		 * Verify that the object was from the node we want. This could
		 * be false because of cpu migration during an unlocked part of
		 * the current allocation or previous freeing process.
		 */
		if (page_to_nid(virt_to_page(object)) != node) {
			local_unlock(&s->cpu_sheaves->lock);
			return NULL;
		}
	}

	pcs->main->size--;

	local_unlock(&s->cpu_sheaves->lock);

	stat(s, ALLOC_PCS);

	return object;
}
 5148
 5149static __fastpath_inline
 5150unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 5151{
 5152	struct slub_percpu_sheaves *pcs;
 5153	struct slab_sheaf *main;
 5154	unsigned int allocated = 0;
 5155	unsigned int batch;
 5156
 5157next_batch:
 5158	if (!local_trylock(&s->cpu_sheaves->lock))
 5159		return allocated;
 5160
 5161	pcs = this_cpu_ptr(s->cpu_sheaves);
 5162
 5163	if (unlikely(pcs->main->size == 0)) {
 5164
 5165		struct slab_sheaf *full;
 5166		struct node_barn *barn;
 5167
 5168		if (pcs->spare && pcs->spare->size > 0) {
 5169			swap(pcs->main, pcs->spare);
 5170			goto do_alloc;
 5171		}
 5172
 5173		barn = get_barn(s);
 5174		if (!barn) {
 5175			local_unlock(&s->cpu_sheaves->lock);
 5176			return allocated;
 5177		}
 5178
 5179		full = barn_replace_empty_sheaf(barn, pcs->main);
 5180
 5181		if (full) {
 5182			stat(s, BARN_GET);
 5183			pcs->main = full;
 5184			goto do_alloc;
 5185		}
 5186
 5187		stat(s, BARN_GET_FAIL);
 5188
 5189		local_unlock(&s->cpu_sheaves->lock);
 5190
 5191		/*
 5192		 * Once full sheaves in barn are depleted, let the bulk
 5193		 * allocation continue from slab pages, otherwise we would just
 5194		 * be copying arrays of pointers twice.
 5195		 */
 5196		return allocated;
 5197	}
 5198
 5199do_alloc:
 5200
 5201	main = pcs->main;
 5202	batch = min(size, main->size);
 5203
 5204	main->size -= batch;
 5205	memcpy(p, main->objects + main->size, batch * sizeof(void *));
 5206
 5207	local_unlock(&s->cpu_sheaves->lock);
 5208
 5209	stat_add(s, ALLOC_PCS, batch);
 5210
 5211	allocated += batch;
 5212
 5213	if (batch < size) {
 5214		p += batch;
 5215		size -= batch;
 5216		goto next_batch;
 5217	}
 5218
 5219	return allocated;
 5220}
 5221
 5222
 5223/*
 5224 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 5225 * have the fastpath folded into their functions. So no function call
 5226 * overhead for requests that can be satisfied on the fastpath.
 5227 *
 5228 * The fastpath works by first checking if the lockless freelist can be used.
 5229 * If not then __slab_alloc is called for slow processing.
 5230 *
 5231 * Otherwise we can simply pick the next object from the lockless free list.
 5232 */
 5233static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
 5234		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
 5235{
 5236	void *object;
 5237	bool init = false;
 5238
 5239	s = slab_pre_alloc_hook(s, gfpflags);
 5240	if (unlikely(!s))
 5241		return NULL;
 5242
 5243	object = kfence_alloc(s, orig_size, gfpflags);
 5244	if (unlikely(object))
 5245		goto out;
 5246
 5247	if (s->cpu_sheaves)
 5248		object = alloc_from_pcs(s, gfpflags, node);
 5249
 5250	if (!object)
 5251		object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
 5252
 5253	maybe_wipe_obj_freeptr(s, object);
 5254	init = slab_want_init_on_alloc(gfpflags, s);
 5255
 5256out:
 5257	/*
 5258	 * When init equals 'true', like for kzalloc() family, only
 5259	 * @orig_size bytes might be zeroed instead of s->object_size
 5260	 * In case this fails due to memcg_slab_post_alloc_hook(),
 5261	 * object is set to NULL
 5262	 */
 5263	slab_post_alloc_hook(s, lru, gfpflags, 1, &object, init, orig_size);
 5264
 5265	return object;
 5266}
 5267
 5268void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags)
 5269{
 5270	void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_,
 5271				    s->object_size);
 5272
 5273	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
 5274
 5275	return ret;
 5276}
 5277EXPORT_SYMBOL(kmem_cache_alloc_noprof);
 5278
 5279void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
 5280			   gfp_t gfpflags)
 5281{
 5282	void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_,
 5283				    s->object_size);
 5284
 5285	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
 5286
 5287	return ret;
 5288}
 5289EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof);
 5290
 5291bool kmem_cache_charge(void *objp, gfp_t gfpflags)
 5292{
 5293	if (!memcg_kmem_online())
 5294		return true;
 5295
 5296	return memcg_slab_post_charge(objp, gfpflags);
 5297}
 5298EXPORT_SYMBOL(kmem_cache_charge);
 5299
 5300/**
 5301 * kmem_cache_alloc_node - Allocate an object on the specified node
 5302 * @s: The cache to allocate from.
 5303 * @gfpflags: See kmalloc().
 5304 * @node: node number of the target node.
 5305 *
 5306 * Identical to kmem_cache_alloc but it will allocate memory on the given
 5307 * node, which can improve the performance for cpu bound structures.
 5308 *
 5309 * Fallback to other node is possible if __GFP_THISNODE is not set.
 5310 *
 5311 * Return: pointer to the new object or %NULL in case of error
 5312 */
 5313void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node)
 5314{
 5315	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
 5316
 5317	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node);
 5318
 5319	return ret;
 5320}
 5321EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
 5322
 5323static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
 5324				      struct slab_sheaf *sheaf, gfp_t gfp)
 5325{
 5326	int ret = 0;
 5327
 5328	ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC);
 5329
 5330	if (likely(!ret || !gfp_pfmemalloc_allowed(gfp)))
 5331		return ret;
 5332
 5333	/*
 5334	 * if we are allowed to, refill sheaf with pfmemalloc but then remember
 5335	 * it for when it's returned
 5336	 */
 5337	ret = refill_sheaf(s, sheaf, gfp);
 5338	sheaf->pfmemalloc = true;
 5339
 5340	return ret;
 5341}
 5342
 5343/*
 5344 * returns a sheaf that has at least the requested size
 5345 * when prefilling is needed, do so with given gfp flags
 5346 *
 5347 * return NULL if sheaf allocation or prefilling failed
 5348 */
 5349struct slab_sheaf *
 5350kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
 5351{
 5352	struct slub_percpu_sheaves *pcs;
 5353	struct slab_sheaf *sheaf = NULL;
 5354	struct node_barn *barn;
 5355
 5356	if (unlikely(size > s->sheaf_capacity)) {
 5357
 5358		/*
 5359		 * slab_debug disables cpu sheaves intentionally so all
 5360		 * prefilled sheaves become "oversize" and we give up on
 5361		 * performance for the debugging. Same with SLUB_TINY.
 5362		 * Creating a cache without sheaves and then requesting a
 5363		 * prefilled sheaf is however not expected, so warn.
 5364		 */
 5365		WARN_ON_ONCE(s->sheaf_capacity == 0 &&
 5366			     !IS_ENABLED(CONFIG_SLUB_TINY) &&
 5367			     !(s->flags & SLAB_DEBUG_FLAGS));
 5368
 5369		sheaf = kzalloc(struct_size(sheaf, objects, size), gfp);
 5370		if (!sheaf)
 5371			return NULL;
 5372
 5373		stat(s, SHEAF_PREFILL_OVERSIZE);
 5374		sheaf->cache = s;
 5375		sheaf->capacity = size;
 5376
 5377		/*
 5378		 * we do not need to care about pfmemalloc here because oversize
 5379		 * sheaves area always flushed and freed when returned
 5380		 */
 5381		if (!__kmem_cache_alloc_bulk(s, gfp, size,
 5382					     &sheaf->objects[0])) {
 5383			kfree(sheaf);
 5384			return NULL;
 5385		}
 5386
 5387		sheaf->size = size;
 5388
 5389		return sheaf;
 5390	}
 5391
 5392	local_lock(&s->cpu_sheaves->lock);
 5393	pcs = this_cpu_ptr(s->cpu_sheaves);
 5394
 5395	if (pcs->spare) {
 5396		sheaf = pcs->spare;
 5397		pcs->spare = NULL;
 5398		stat(s, SHEAF_PREFILL_FAST);
 5399	} else {
 5400		barn = get_barn(s);
 5401
 5402		stat(s, SHEAF_PREFILL_SLOW);
 5403		if (barn)
 5404			sheaf = barn_get_full_or_empty_sheaf(barn);
 5405		if (sheaf && sheaf->size)
 5406			stat(s, BARN_GET);
 5407		else
 5408			stat(s, BARN_GET_FAIL);
 5409	}
 5410
 5411	local_unlock(&s->cpu_sheaves->lock);
 5412
 5413
 5414	if (!sheaf)
 5415		sheaf = alloc_empty_sheaf(s, gfp);
 5416
 5417	if (sheaf) {
 5418		sheaf->capacity = s->sheaf_capacity;
 5419		sheaf->pfmemalloc = false;
 5420
 5421		if (sheaf->size < size &&
 5422		    __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) {
 5423			sheaf_flush_unused(s, sheaf);
 5424			free_empty_sheaf(s, sheaf);
 5425			sheaf = NULL;
 5426		}
 5427	}
 5428
 5429	return sheaf;
 5430}
 5431
 5432/*
 5433 * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf()
 5434 *
 5435 * If the sheaf cannot simply become the percpu spare sheaf, but there's space
 5436 * for a full sheaf in the barn, we try to refill the sheaf back to the cache's
 5437 * sheaf_capacity to avoid handling partially full sheaves.
 5438 *
 5439 * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the
 5440 * sheaf is instead flushed and freed.
 5441 */
 5442void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
 5443			     struct slab_sheaf *sheaf)
 5444{
 5445	struct slub_percpu_sheaves *pcs;
 5446	struct node_barn *barn;
 5447
 5448	if (unlikely((sheaf->capacity != s->sheaf_capacity)
 5449		     || sheaf->pfmemalloc)) {
 5450		sheaf_flush_unused(s, sheaf);
 5451		kfree(sheaf);
 5452		return;
 5453	}
 5454
 5455	local_lock(&s->cpu_sheaves->lock);
 5456	pcs = this_cpu_ptr(s->cpu_sheaves);
 5457	barn = get_barn(s);
 5458
 5459	if (!pcs->spare) {
 5460		pcs->spare = sheaf;
 5461		sheaf = NULL;
 5462		stat(s, SHEAF_RETURN_FAST);
 5463	}
 5464
 5465	local_unlock(&s->cpu_sheaves->lock);
 5466
 5467	if (!sheaf)
 5468		return;
 5469
 5470	stat(s, SHEAF_RETURN_SLOW);
 5471
 5472	/*
 5473	 * If the barn has too many full sheaves or we fail to refill the sheaf,
 5474	 * simply flush and free it.
 5475	 */
 5476	if (!barn || data_race(barn->nr_full) >= MAX_FULL_SHEAVES ||
 5477	    refill_sheaf(s, sheaf, gfp)) {
 5478		sheaf_flush_unused(s, sheaf);
 5479		free_empty_sheaf(s, sheaf);
 5480		return;
 5481	}
 5482
 5483	barn_put_full_sheaf(barn, sheaf);
 5484	stat(s, BARN_PUT);
 5485}
 5486
 5487/*
 5488 * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least
 5489 * the given size
 5490 *
 5491 * the sheaf might be replaced by a new one when requesting more than
 5492 * s->sheaf_capacity objects if such replacement is necessary, but the refill
 5493 * fails (returning -ENOMEM), the existing sheaf is left intact
 5494 *
 5495 * In practice we always refill to full sheaf's capacity.
 5496 */
 5497int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
 5498			    struct slab_sheaf **sheafp, unsigned int size)
 5499{
 5500	struct slab_sheaf *sheaf;
 5501
 5502	/*
 5503	 * TODO: do we want to support *sheaf == NULL to be equivalent of
 5504	 * kmem_cache_prefill_sheaf() ?
 5505	 */
 5506	if (!sheafp || !(*sheafp))
 5507		return -EINVAL;
 5508
 5509	sheaf = *sheafp;
 5510	if (sheaf->size >= size)
 5511		return 0;
 5512
 5513	if (likely(sheaf->capacity >= size)) {
 5514		if (likely(sheaf->capacity == s->sheaf_capacity))
 5515			return __prefill_sheaf_pfmemalloc(s, sheaf, gfp);
 5516
 5517		if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
 5518					     &sheaf->objects[sheaf->size])) {
 5519			return -ENOMEM;
 5520		}
 5521		sheaf->size = sheaf->capacity;
 5522
 5523		return 0;
 5524	}
 5525
 5526	/*
 5527	 * We had a regular sized sheaf and need an oversize one, or we had an
 5528	 * oversize one already but need a larger one now.
 5529	 * This should be a very rare path so let's not complicate it.
 5530	 */
 5531	sheaf = kmem_cache_prefill_sheaf(s, gfp, size);
 5532	if (!sheaf)
 5533		return -ENOMEM;
 5534
 5535	kmem_cache_return_sheaf(s, gfp, *sheafp);
 5536	*sheafp = sheaf;
 5537	return 0;
 5538}
 5539
 5540/*
 5541 * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf()
 5542 *
 5543 * Guaranteed not to fail as many allocations as was the requested size.
 5544 * After the sheaf is emptied, it fails - no fallback to the slab cache itself.
 5545 *
 5546 * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT
 5547 * memcg charging is forced over limit if necessary, to avoid failure.
 5548 *
 5549 * It is possible that the allocation comes from kfence and then the sheaf
 5550 * size is not decreased.
 5551 */
 5552void *
 5553kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp,
 5554				   struct slab_sheaf *sheaf)
 5555{
 5556	void *ret = NULL;
 5557	bool init;
 5558
 5559	if (sheaf->size == 0)
 5560		goto out;
 5561
 5562	ret = kfence_alloc(s, s->object_size, gfp);
 5563
 5564	if (likely(!ret))
 5565		ret = sheaf->objects[--sheaf->size];
 5566
 5567	init = slab_want_init_on_alloc(gfp, s);
 5568
 5569	/* add __GFP_NOFAIL to force successful memcg charging */
 5570	slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size);
 5571out:
 5572	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE);
 5573
 5574	return ret;
 5575}
 5576
 5577unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf)
 5578{
 5579	return sheaf->size;
 5580}
 5581/*
 5582 * To avoid unnecessary overhead, we pass through large allocation requests
 5583 * directly to the page allocator. We use __GFP_COMP, because we will need to
 5584 * know the allocation order to free the pages properly in kfree.
 5585 */
 5586static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
 5587{
 5588	struct page *page;
 5589	void *ptr = NULL;
 5590	unsigned int order = get_order(size);
 5591
 5592	if (unlikely(flags & GFP_SLAB_BUG_MASK))
 5593		flags = kmalloc_fix_flags(flags);
 5594
 5595	flags |= __GFP_COMP;
 5596
 5597	if (node == NUMA_NO_NODE)
 5598		page = alloc_frozen_pages_noprof(flags, order);
 5599	else
 5600		page = __alloc_frozen_pages_noprof(flags, order, node, NULL);
 5601
 5602	if (page) {
 5603		ptr = page_address(page);
 5604		mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
 5605				      PAGE_SIZE << order);
 5606		__SetPageLargeKmalloc(page);
 5607	}
 5608
 5609	ptr = kasan_kmalloc_large(ptr, size, flags);
 5610	/* As ptr might get tagged, call kmemleak hook after KASAN. */
 5611	kmemleak_alloc(ptr, size, 1, flags);
 5612	kmsan_kmalloc_large(ptr, size, flags);
 5613
 5614	return ptr;
 5615}
 5616
 5617void *__kmalloc_large_noprof(size_t size, gfp_t flags)
 5618{
 5619	void *ret = ___kmalloc_large_node(size, flags, NUMA_NO_NODE);
 5620
 5621	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
 5622		      flags, NUMA_NO_NODE);
 5623	return ret;
 5624}
 5625EXPORT_SYMBOL(__kmalloc_large_noprof);
 5626
 5627void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
 5628{
 5629	void *ret = ___kmalloc_large_node(size, flags, node);
 5630
 5631	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
 5632		      flags, node);
 5633	return ret;
 5634}
 5635EXPORT_SYMBOL(__kmalloc_large_node_noprof);
 5636
 5637static __always_inline
 5638void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
 5639			unsigned long caller)
 5640{
 5641	struct kmem_cache *s;
 5642	void *ret;
 5643
 5644	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
 5645		ret = __kmalloc_large_node_noprof(size, flags, node);
 5646		trace_kmalloc(caller, ret, size,
 5647			      PAGE_SIZE << get_order(size), flags, node);
 5648		return ret;
 5649	}
 5650
 5651	if (unlikely(!size))
 5652		return ZERO_SIZE_PTR;
 5653
 5654	s = kmalloc_slab(size, b, flags, caller);
 5655
 5656	ret = slab_alloc_node(s, NULL, flags, node, caller, size);
 5657	ret = kasan_kmalloc(s, ret, size, flags);
 5658	trace_kmalloc(caller, ret, size, s->size, flags, node);
 5659	return ret;
 5660}
 5661void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
 5662{
 5663	return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_);
 5664}
 5665EXPORT_SYMBOL(__kmalloc_node_noprof);
 5666
 5667void *__kmalloc_noprof(size_t size, gfp_t flags)
 5668{
 5669	return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_);
 5670}
 5671EXPORT_SYMBOL(__kmalloc_noprof);
 5672
/**
 * kmalloc_nolock - Allocate an object of given size from any context.
 * @size: size to allocate
 * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT
 * allowed.
 * @node: node number of the target node.
 *
 * A zero @size yields ZERO_SIZE_PTR. Sizes above KMALLOC_MAX_CACHE_SIZE are
 * not served and fail with NULL.
 *
 * Return: pointer to the new object or NULL in case of error.
 * NULL does not mean EBUSY or EAGAIN. It means ENOMEM.
 * There is no reason to call it again and expect !NULL.
 */
void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
{
	/* Never warn and never dip into reserves on behalf of the caller. */
	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
	struct kmem_cache *s;
	/* Permits exactly one further attempt, from the next kmalloc bucket. */
	bool can_retry = true;
	/* Stays at -EBUSY when no allocation attempt is made below. */
	void *ret = ERR_PTR(-EBUSY);

	VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
				      __GFP_NO_OBJ_EXT));

	if (unlikely(!size))
		return ZERO_SIZE_PTR;

	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
		/*
		 * kmalloc_nolock() in PREEMPT_RT is not supported from
		 * non-preemptible context because local_lock becomes a
		 * sleeping lock on RT.
		 */
		return NULL;
retry:
	/*
	 * Checked on every pass: the retry below bumps size, which may push
	 * it beyond the largest kmalloc cache.
	 */
	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
		return NULL;
	s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_);

	if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
		/*
		 * kmalloc_nolock() is not supported on architectures that
		 * don't implement cmpxchg16b, but debug caches don't use
		 * per-cpu slab and per-cpu partial slabs. They rely on
		 * kmem_cache_node->list_lock, so kmalloc_nolock() can
		 * attempt to allocate from debug caches by
		 * spin_trylock_irqsave(&n->list_lock, ...)
		 */
		return NULL;

	/*
	 * Do not call slab_alloc_node(), since trylock mode isn't
	 * compatible with slab_pre_alloc_hook/should_failslab and
	 * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
	 * and slab_post_alloc_hook() directly.
	 *
	 * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
	 * in irq saved region. It assumes that the same cpu will not
	 * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
	 * Therefore use in_nmi() to check whether particular bucket is in
	 * irq protected section.
	 *
	 * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
	 * this cpu was interrupted somewhere inside ___slab_alloc() after
	 * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
	 * In this case fast path with __update_cpu_freelist_fast() is not safe.
	 */
	if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
		ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);

	/*
	 * -EBUSY here means either that the attempt above was skipped, or
	 * presumably that __slab_alloc_node() itself reported it could not
	 * proceed without waiting (its contract is not visible here).
	 */
	if (PTR_ERR(ret) == -EBUSY) {
		if (can_retry) {
			/* pick the next kmalloc bucket */
			size = s->object_size + 1;
			/*
			 * Another alternative is to
			 * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
			 * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
			 * to retry from bucket of the same size.
			 */
			can_retry = false;
			goto retry;
		}
		/* Second attempt was busy as well: report plain failure. */
		ret = NULL;
	}

	/*
	 * Note that after a retry, size is the bumped value used to select
	 * the larger bucket, and that is what the hooks below are given.
	 */
	maybe_wipe_obj_freeptr(s, ret);
	slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
			     slab_want_init_on_alloc(alloc_gfp, s), size);

	ret = kasan_kmalloc(s, ret, size, alloc_gfp);
	return ret;
}
EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof);
 5764
 5765void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
 5766					 int node, unsigned long caller)
 5767{
 5768	return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller);
 5769
 5770}
 5771EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof);
 5772
 5773void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size)
 5774{
 5775	void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE,
 5776					    _RET_IP_, size);
 5777
 5778	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE);
 5779
 5780	ret = kasan_kmalloc(s, ret, size, gfpflags);
 5781	return ret;
 5782}
 5783EXPORT_SYMBOL(__kmalloc_cache_noprof);
 5784
 5785void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags,
 5786				  int node, size_t size)
 5787{
 5788	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
 5789
 5790	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node);
 5791
 5792	ret = kasan_kmalloc(s, ret, size, gfpflags);
 5793	return ret;
 5794}
 5795EXPORT_SYMBOL(__kmalloc_cache_node_noprof);
 5796
 5797static noinline void free_to_partial_list(
 5798	struct kmem_cache *s, struct slab *slab,
 5799	void *head, void *tail, int bulk_cnt,
 5800	unsigned long addr)
 5801{
 5802	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
 5803	struct slab *slab_free = NULL;
 5804	int cnt = bulk_cnt;
 5805	unsigned long flags;
 5806	depot_stack_handle_t handle = 0;
 5807
 5808	/*
 5809	 * We cannot use GFP_NOWAIT as there are callsites where waking up
 5810	 * kswapd could deadlock
 5811	 */
 5812	if (s->flags & SLAB_STORE_USER)
 5813		handle = set_track_prepare(__GFP_NOWARN);
 5814
 5815	spin_lock_irqsave(&n->list_lock, flags);
 5816
 5817	if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) {
 5818		void *prior = slab->freelist;
 5819
 5820		/* Perform the actual freeing while we still hold the locks */
 5821		slab->inuse -= cnt;
 5822		set_freepointer(s, tail, prior);
 5823		slab->freelist = head;
 5824
 5825		/*
 5826		 * If the slab is empty, and node's partial list is full,
 5827		 * it should be discarded anyway no matter it's on full or
 5828		 * partial list.
 5829		 */
 5830		if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
 5831			slab_free = slab;
 5832
 5833		if (!prior) {
 5834			/* was on full list */
 5835			remove_full(s, n, slab);
 5836			if (!slab_free) {
 5837				add_partial(n, slab, DEACTIVATE_TO_TAIL);
 5838				stat(s, FREE_ADD_PARTIAL);
 5839			}
 5840		} else if (slab_free) {
 5841			remove_partial(n, slab);
 5842			stat(s, FREE_REMOVE_PARTIAL);
 5843		}
 5844	}
 5845
 5846	if (slab_free) {
 5847		/*
 5848		 * Update the counters while still holding n->list_lock to
 5849		 * prevent spurious validation warnings
 5850		 */
 5851		dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
 5852	}
 5853
 5854	spin_unlock_irqrestore(&n->list_lock, flags);
 5855
 5856	if (slab_free) {
 5857		stat(s, FREE_SLAB);
 5858		free_slab(s, slab_free);
 5859	}
 5860}
 5861
/*
 * Slow path handling. This may still be called frequently since objects
 * have a longer lifetime than the cpu slabs in most processing loads.
 *
 * So we still attempt to reduce cache line usage. Just take the slab
 * lock and free the item. If there is no additional partial slab
 * handling required then we can return immediately.
 *
 * @head and @tail are the first and last object of the chain being freed:
 * the chain is linked in front of the slab's current freelist and @head
 * becomes the new freelist. @cnt is the number of objects in the chain and is
 * subtracted from the slab's inuse count. @addr is only passed on to
 * free_to_partial_list().
 */
static void __slab_free(struct kmem_cache *s, struct slab *slab,
			void *head, void *tail, int cnt,
			unsigned long addr)

{
	bool was_frozen, was_full;
	struct freelist_counters old, new;
	/* Non-NULL only while the node's list_lock is held. */
	struct kmem_cache_node *n = NULL;
	unsigned long flags;
	/* Only assigned, and only read, when the list_lock was taken. */
	bool on_node_partial;

	stat(s, FREE_SLOWPATH);

	if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
		free_to_partial_list(s, slab, head, tail, cnt, addr);
		return;
	}

	/*
	 * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below
	 * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s)
	 * is the only other reason it can be false, and it is already handled
	 * above.
	 */

	/*
	 * Build the new freelist/counters from a snapshot of the slab and
	 * repeat until slab_update_freelist() reports success.
	 */
	do {
		/*
		 * A previous iteration took the list_lock but the update
		 * failed: drop the lock before starting over.
		 */
		if (unlikely(n)) {
			spin_unlock_irqrestore(&n->list_lock, flags);
			n = NULL;
		}

		old.freelist = slab->freelist;
		old.counters = slab->counters;

		was_full = (old.freelist == NULL);
		was_frozen = old.frozen;

		/* Chain the freed objects in front of the old freelist. */
		set_freepointer(s, tail, old.freelist);

		new.freelist = head;
		new.counters = old.counters;
		new.inuse -= cnt;

		/*
		 * Might need to be taken off (due to becoming empty) or added
		 * to (due to not being full anymore) the partial list.
		 * Unless it's frozen.
		 */
		if ((!new.inuse || was_full) && !was_frozen) {
			/*
			 * If slab becomes non-full and we have cpu partial
			 * lists, we put it there unconditionally to avoid
			 * taking the list_lock. Otherwise we need it.
			 */
			if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) {

				n = get_node(s, slab_nid(slab));
				/*
				 * Speculatively acquire the list_lock.
				 * If the cmpxchg does not succeed then we may
				 * drop the list_lock without any processing.
				 *
				 * Otherwise the list_lock will synchronize with
				 * other processors updating the list of slabs.
				 */
				spin_lock_irqsave(&n->list_lock, flags);

				on_node_partial = slab_test_node_partial(slab);
			}
		}

	} while (!slab_update_freelist(s, slab, &old, &new, "__slab_free"));

	/* The objects are freed now; what remains is list maintenance. */
	if (likely(!n)) {

		if (likely(was_frozen)) {
			/*
			 * The list lock was not taken therefore no list
			 * activity can be necessary.
			 */
			stat(s, FREE_FROZEN);
		} else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) {
			/*
			 * If we started with a full slab then put it onto the
			 * per cpu partial list.
			 */
			put_cpu_partial(s, slab, 1);
			stat(s, CPU_PARTIAL_FREE);
		}

		/*
		 * In other cases we didn't take the list_lock because the slab
		 * was already on the partial list and will remain there.
		 */

		return;
	}

	/* From here on the list_lock of node n is held. */

	/*
	 * This slab was partially empty but not on the per-node partial list,
	 * in which case we shouldn't manipulate its list, just return.
	 */
	if (!was_full && !on_node_partial) {
		spin_unlock_irqrestore(&n->list_lock, flags);
		return;
	}

	/*
	 * If slab became empty, should we add/keep it on the partial list or we
	 * have enough?
	 */
	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
		goto slab_empty;

	/*
	 * Objects left in the slab. If it was not on the partial list before
	 * then add it. This can only happen when cache has no per cpu partial
	 * list otherwise we would have put it there.
	 */
	if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) {
		add_partial(n, slab, DEACTIVATE_TO_TAIL);
		stat(s, FREE_ADD_PARTIAL);
	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return;

slab_empty:
	/*
	 * The slab could have a single object and thus go from full to empty in
	 * a single free, but more likely it was on the partial list. Remove it.
	 */
	if (likely(!was_full)) {
		remove_partial(n, slab);
		stat(s, FREE_REMOVE_PARTIAL);
	}

	/* The slab is discarded only after the list_lock has been dropped. */
	spin_unlock_irqrestore(&n->list_lock, flags);
	stat(s, FREE_SLAB);
	discard_slab(s, slab);
}
 6010
 6011/*
 6012 * pcs is locked. We should have get rid of the spare sheaf and obtained an
 6013 * empty sheaf, while the main sheaf is full. We want to install the empty sheaf
 6014 * as a main sheaf, and make the current main sheaf a spare sheaf.
 6015 *
 6016 * However due to having relinquished the cpu_sheaves lock when obtaining
 6017 * the empty sheaf, we need to handle some unlikely but possible cases.
 6018 *
 6019 * If we put any sheaf to barn here, it's because we were interrupted or have
 6020 * been migrated to a different cpu, which should be rare enough so just ignore
 6021 * the barn's limits to simplify the handling.
 6022 *
 6023 * An alternative scenario that gets us here is when we fail
 6024 * barn_replace_full_sheaf(), because there's no empty sheaf available in the
 6025 * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the
 6026 * limit on full sheaves was not exceeded, we assume it didn't change and just
 6027 * put the full sheaf there.
 6028 */
 6029static void __pcs_install_empty_sheaf(struct kmem_cache *s,
 6030		struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty,
 6031		struct node_barn *barn)
 6032{
 6033	lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
 6034
 6035	/* This is what we expect to find if nobody interrupted us. */
 6036	if (likely(!pcs->spare)) {
 6037		pcs->spare = pcs->main;
 6038		pcs->main = empty;
 6039		return;
 6040	}
 6041
 6042	/*
 6043	 * Unlikely because if the main sheaf had space, we would have just
 6044	 * freed to it. Get rid of our empty sheaf.
 6045	 */
 6046	if (pcs->main->size < s->sheaf_capacity) {
 6047		barn_put_empty_sheaf(barn, empty);
 6048		return;
 6049	}
 6050
 6051	/* Also unlikely for the same reason */
 6052	if (pcs->spare->size < s->sheaf_capacity) {
 6053		swap(pcs->main, pcs->spare);
 6054		barn_put_empty_sheaf(barn, empty);
 6055		return;
 6056	}
 6057
 6058	/*
 6059	 * We probably failed barn_replace_full_sheaf() due to no empty sheaf
 6060	 * available there, but we allocated one, so finish the job.
 6061	 */
 6062	barn_put_full_sheaf(barn, pcs->main);
 6063	stat(s, BARN_PUT);
 6064	pcs->main = empty;
 6065}
 6066
/*
 * Replace the full main sheaf with a (at least partially) empty sheaf.
 *
 * Must be called with the cpu_sheaves local lock locked. If successful, returns
 * the pcs pointer and the local lock locked (possibly on a different cpu than
 * initially called). If not successful, returns NULL and the local lock
 * unlocked.
 *
 * The options are tried in this order: an empty sheaf from the barn when
 * there is no spare, swapping with a spare that still has room, exchanging
 * the full main sheaf for an empty one in the barn, flushing the full spare,
 * allocating a new empty sheaf, and finally flushing the main sheaf itself.
 */
static struct slub_percpu_sheaves *
__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
{
	struct slab_sheaf *empty;
	struct node_barn *barn;
	/* Whether a failed allocation below counts as BARN_PUT_FAIL. */
	bool put_fail;

restart:
	lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));

	barn = get_barn(s);
	if (!barn) {
		local_unlock(&s->cpu_sheaves->lock);
		return NULL;
	}

	put_fail = false;

	/* No spare: the full main can become the spare once we have an empty. */
	if (!pcs->spare) {
		empty = barn_get_empty_sheaf(barn);
		if (empty) {
			pcs->spare = pcs->main;
			pcs->main = empty;
			return pcs;
		}
		goto alloc_empty;
	}

	/* The spare still has room: make it the main sheaf. */
	if (pcs->spare->size < s->sheaf_capacity) {
		swap(pcs->main, pcs->spare);
		return pcs;
	}

	/* Both sheaves are full: try to trade the main one in at the barn. */
	empty = barn_replace_full_sheaf(barn, pcs->main);

	if (!IS_ERR(empty)) {
		stat(s, BARN_PUT);
		pcs->main = empty;
		return pcs;
	}

	if (PTR_ERR(empty) == -E2BIG) {
		/* Since we got here, spare exists and is full */
		struct slab_sheaf *to_flush = pcs->spare;

		stat(s, BARN_PUT_FAIL);

		/* Detach the spare and empty it with the lock dropped. */
		pcs->spare = NULL;
		local_unlock(&s->cpu_sheaves->lock);

		sheaf_flush_unused(s, to_flush);
		empty = to_flush;
		goto got_empty;
	}

	/*
	 * We could not replace full sheaf because barn had no empty
	 * sheaves. We can still allocate it and put the full sheaf in
	 * __pcs_install_empty_sheaf(), but if we fail to allocate it,
	 * make sure to count the fail.
	 */
	put_fail = true;

alloc_empty:
	/* The allocation is done with the local lock dropped. */
	local_unlock(&s->cpu_sheaves->lock);

	empty = alloc_empty_sheaf(s, GFP_NOWAIT);
	if (empty)
		goto got_empty;

	if (put_fail)
		 stat(s, BARN_PUT_FAIL);

	/*
	 * No empty sheaf could be had. As a last resort flush the main sheaf
	 * (sheaf_flush_main() is not visible here; a false return is treated
	 * as failure).
	 */
	if (!sheaf_flush_main(s))
		return NULL;

	if (!local_trylock(&s->cpu_sheaves->lock))
		return NULL;

	pcs = this_cpu_ptr(s->cpu_sheaves);

	/*
	 * we flushed the main sheaf so it should be empty now,
	 * but in case we got preempted or migrated, we need to
	 * check again
	 */
	if (pcs->main->size == s->sheaf_capacity)
		goto restart;

	return pcs;

got_empty:
	/*
	 * Reached with the local lock dropped. If it cannot be retaken, the
	 * empty sheaf is handed to the barn looked up earlier.
	 */
	if (!local_trylock(&s->cpu_sheaves->lock)) {
		barn_put_empty_sheaf(barn, empty);
		return NULL;
	}

	/* pcs must be looked up again after the lock was dropped. */
	pcs = this_cpu_ptr(s->cpu_sheaves);
	__pcs_install_empty_sheaf(s, pcs, empty, barn);

	return pcs;
}
 6177
 6178/*
 6179 * Free an object to the percpu sheaves.
 6180 * The object is expected to have passed slab_free_hook() already.
 6181 */
 6182static __fastpath_inline
 6183bool free_to_pcs(struct kmem_cache *s, void *object)
 6184{
 6185	struct slub_percpu_sheaves *pcs;
 6186
 6187	if (!local_trylock(&s->cpu_sheaves->lock))
 6188		return false;
 6189
 6190	pcs = this_cpu_ptr(s->cpu_sheaves);
 6191
 6192	if (unlikely(pcs->main->size == s->sheaf_capacity)) {
 6193
 6194		pcs = __pcs_replace_full_main(s, pcs);
 6195		if (unlikely(!pcs))
 6196			return false;
 6197	}
 6198
 6199	pcs->main->objects[pcs->main->size++] = object;
 6200
 6201	local_unlock(&s->cpu_sheaves->lock);
 6202
 6203	stat(s, FREE_PCS);
 6204
 6205	return true;
 6206}
 6207
 6208static void rcu_free_sheaf(struct rcu_head *head)
 6209{
 6210	struct kmem_cache_node *n;
 6211	struct slab_sheaf *sheaf;
 6212	struct node_barn *barn = NULL;
 6213	struct kmem_cache *s;
 6214
 6215	sheaf = container_of(head, struct slab_sheaf, rcu_head);
 6216
 6217	s = sheaf->cache;
 6218
 6219	/*
 6220	 * This may remove some objects due to slab_free_hook() returning false,
 6221	 * so that the sheaf might no longer be completely full. But it's easier
 6222	 * to handle it as full (unless it became completely empty), as the code
 6223	 * handles it fine. The only downside is that sheaf will serve fewer
 6224	 * allocations when reused. It only happens due to debugging, which is a
 6225	 * performance hit anyway.
 6226	 *
 6227	 * If it returns true, there was at least one object from pfmemalloc
 6228	 * slab so simply flush everything.
 6229	 */
 6230	if (__rcu_free_sheaf_prepare(s, sheaf))
 6231		goto flush;
 6232
 6233	n = get_node(s, sheaf->node);
 6234	if (!n)
 6235		goto flush;
 6236
 6237	barn = n->barn;
 6238
 6239	/* due to slab_free_hook() */
 6240	if (unlikely(sheaf->size == 0))
 6241		goto empty;
 6242
 6243	/*
 6244	 * Checking nr_full/nr_empty outside lock avoids contention in case the
 6245	 * barn is at the respective limit. Due to the race we might go over the
 6246	 * limit but that should be rare and harmless.
 6247	 */
 6248
 6249	if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) {
 6250		stat(s, BARN_PUT);
 6251		barn_put_full_sheaf(barn, sheaf);
 6252		return;
 6253	}
 6254
 6255flush:
 6256	stat(s, BARN_PUT_FAIL);
 6257	sheaf_flush_unused(s, sheaf);
 6258
 6259empty:
 6260	if (barn && data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
 6261		barn_put_empty_sheaf(barn, sheaf);
 6262		return;
 6263	}
 6264
 6265	free_empty_sheaf(s, sheaf);
 6266}
 6267
 6268bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
 6269{
 6270	struct slub_percpu_sheaves *pcs;
 6271	struct slab_sheaf *rcu_sheaf;
 6272
 6273	if (!local_trylock(&s->cpu_sheaves->lock))
 6274		goto fail;
 6275
 6276	pcs = this_cpu_ptr(s->cpu_sheaves);
 6277
 6278	if (unlikely(!pcs->rcu_free)) {
 6279
 6280		struct slab_sheaf *empty;
 6281		struct node_barn *barn;
 6282
 6283		if (pcs->spare && pcs->spare->size == 0) {
 6284			pcs->rcu_free = pcs->spare;
 6285			pcs->spare = NULL;
 6286			goto do_free;
 6287		}
 6288
 6289		barn = get_barn(s);
 6290		if (!barn) {
 6291			local_unlock(&s->cpu_sheaves->lock);
 6292			goto fail;
 6293		}
 6294
 6295		empty = barn_get_empty_sheaf(barn);
 6296
 6297		if (empty) {
 6298			pcs->rcu_free = empty;
 6299			goto do_free;
 6300		}
 6301
 6302		local_unlock(&s->cpu_sheaves->lock);
 6303
 6304		empty = alloc_empty_sheaf(s, GFP_NOWAIT);
 6305
 6306		if (!empty)
 6307			goto fail;
 6308
 6309		if (!local_trylock(&s->cpu_sheaves->lock)) {
 6310			barn_put_empty_sheaf(barn, empty);
 6311			goto fail;
 6312		}
 6313
 6314		pcs = this_cpu_ptr(s->cpu_sheaves);
 6315
 6316		if (unlikely(pcs->rcu_free))
 6317			barn_put_empty_sheaf(barn, empty);
 6318		else
 6319			pcs->rcu_free = empty;
 6320	}
 6321
 6322do_free:
 6323
 6324	rcu_sheaf = pcs->rcu_free;
 6325
 6326	/*
 6327	 * Since we flush immediately when size reaches capacity, we never reach
 6328	 * this with size already at capacity, so no OOB write is possible.
 6329	 */
 6330	rcu_sheaf->objects[rcu_sheaf->size++] = obj;
 6331
 6332	if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
 6333		rcu_sheaf = NULL;
 6334	} else {
 6335		pcs->rcu_free = NULL;
 6336		rcu_sheaf->node = numa_mem_id();
 6337	}
 6338
 6339	/*
 6340	 * we flush before local_unlock to make sure a racing
 6341	 * flush_all_rcu_sheaves() doesn't miss this sheaf
 6342	 */
 6343	if (rcu_sheaf)
 6344		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
 6345
 6346	local_unlock(&s->cpu_sheaves->lock);
 6347
 6348	stat(s, FREE_RCU_SHEAF);
 6349	return true;
 6350
 6351fail:
 6352	stat(s, FREE_RCU_SHEAF_FAIL);
 6353	return false;
 6354}
 6355
 6356/*
 6357 * Bulk free objects to the percpu sheaves.
 6358 * Unlike free_to_pcs() this includes the calls to all necessary hooks
 6359 * and the fallback to freeing to slab pages.
 6360 */
 6361static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 6362{
 6363	struct slub_percpu_sheaves *pcs;
 6364	struct slab_sheaf *main, *empty;
 6365	bool init = slab_want_init_on_free(s);
 6366	unsigned int batch, i = 0;
 6367	struct node_barn *barn;
 6368	void *remote_objects[PCS_BATCH_MAX];
 6369	unsigned int remote_nr = 0;
 6370	int node = numa_mem_id();
 6371
 6372next_remote_batch:
 6373	while (i < size) {
 6374		struct slab *slab = virt_to_slab(p[i]);
 6375
 6376		memcg_slab_free_hook(s, slab, p + i, 1);
 6377		alloc_tagging_slab_free_hook(s, slab, p + i, 1);
 6378
 6379		if (unlikely(!slab_free_hook(s, p[i], init, false))) {
 6380			p[i] = p[--size];
 6381			continue;
 6382		}
 6383
 6384		if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)
 6385			     || slab_test_pfmemalloc(slab))) {
 6386			remote_objects[remote_nr] = p[i];
 6387			p[i] = p[--size];
 6388			if (++remote_nr >= PCS_BATCH_MAX)
 6389				goto flush_remote;
 6390			continue;
 6391		}
 6392
 6393		i++;
 6394	}
 6395
 6396	if (!size)
 6397		goto flush_remote;
 6398
 6399next_batch:
 6400	if (!local_trylock(&s->cpu_sheaves->lock))
 6401		goto fallback;
 6402
 6403	pcs = this_cpu_ptr(s->cpu_sheaves);
 6404
 6405	if (likely(pcs->main->size < s->sheaf_capacity))
 6406		goto do_free;
 6407
 6408	barn = get_barn(s);
 6409	if (!barn)
 6410		goto no_empty;
 6411
 6412	if (!pcs->spare) {
 6413		empty = barn_get_empty_sheaf(barn);
 6414		if (!empty)
 6415			goto no_empty;
 6416
 6417		pcs->spare = pcs->main;
 6418		pcs->main = empty;
 6419		goto do_free;
 6420	}
 6421
 6422	if (pcs->spare->size < s->sheaf_capacity) {
 6423		swap(pcs->main, pcs->spare);
 6424		goto do_free;
 6425	}
 6426
 6427	empty = barn_replace_full_sheaf(barn, pcs->main);
 6428	if (IS_ERR(empty)) {
 6429		stat(s, BARN_PUT_FAIL);
 6430		goto no_empty;
 6431	}
 6432
 6433	stat(s, BARN_PUT);
 6434	pcs->main = empty;
 6435
 6436do_free:
 6437	main = pcs->main;
 6438	batch = min(size, s->sheaf_capacity - main->size);
 6439
 6440	memcpy(main->objects + main->size, p, batch * sizeof(void *));
 6441	main->size += batch;
 6442
 6443	local_unlock(&s->cpu_sheaves->lock);
 6444
 6445	stat_add(s, FREE_PCS, batch);
 6446
 6447	if (batch < size) {
 6448		p += batch;
 6449		size -= batch;
 6450		goto next_batch;
 6451	}
 6452
 6453	if (remote_nr)
 6454		goto flush_remote;
 6455
 6456	return;
 6457
 6458no_empty:
 6459	local_unlock(&s->cpu_sheaves->lock);
 6460
 6461	/*
 6462	 * if we depleted all empty sheaves in the barn or there are too
 6463	 * many full sheaves, free the rest to slab pages
 6464	 */
 6465fallback:
 6466	__kmem_cache_free_bulk(s, size, p);
 6467
 6468flush_remote:
 6469	if (remote_nr) {
 6470		__kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]);
 6471		if (i < size) {
 6472			remote_nr = 0;
 6473			goto next_remote_batch;
 6474		}
 6475	}
 6476}
 6477
/*
 * Per-CPU bookkeeping for frees that are postponed and later completed by
 * free_deferred_objects(), which runs as the irq_work below.
 */
struct defer_free {
	/* objects queued by defer_free(), linked through their free pointer */
	struct llist_head objects;
	/* slabs queued by defer_deactivate_slab(), linked through slab->llnode */
	struct llist_head slabs;
	/* irq_work that drains both lists */
	struct irq_work work;
};

/* forward declaration: needed by the IRQ_WORK_INIT() initializer below */
static void free_deferred_objects(struct irq_work *work);

static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = {
	.objects = LLIST_HEAD_INIT(objects),
	.slabs = LLIST_HEAD_INIT(slabs),
	.work = IRQ_WORK_INIT(free_deferred_objects),
};
 6491
 6492/*
 6493 * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe
 6494 * to take sleeping spin_locks from __slab_free() and deactivate_slab().
 6495 * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore().
 6496 */
 6497static void free_deferred_objects(struct irq_work *work)
 6498{
 6499	struct defer_free *df = container_of(work, struct defer_free, work);
 6500	struct llist_head *objs = &df->objects;
 6501	struct llist_head *slabs = &df->slabs;
 6502	struct llist_node *llnode, *pos, *t;
 6503
 6504	if (llist_empty(objs) && llist_empty(slabs))
 6505		return;
 6506
 6507	llnode = llist_del_all(objs);
 6508	llist_for_each_safe(pos, t, llnode) {
 6509		struct kmem_cache *s;
 6510		struct slab *slab;
 6511		void *x = pos;
 6512
 6513		slab = virt_to_slab(x);
 6514		s = slab->slab_cache;
 6515
 6516		/* Point 'x' back to the beginning of allocated object */
 6517		x -= s->offset;
 6518
 6519		/*
 6520		 * We used freepointer in 'x' to link 'x' into df->objects.
 6521		 * Clear it to NULL to avoid false positive detection
 6522		 * of "Freepointer corruption".
 6523		 */
 6524		set_freepointer(s, x, NULL);
 6525
 6526		__slab_free(s, slab, x, x, 1, _THIS_IP_);
 6527	}
 6528
 6529	llnode = llist_del_all(slabs);
 6530	llist_for_each_safe(pos, t, llnode) {
 6531		struct slab *slab = container_of(pos, struct slab, llnode);
 6532
 6533		if (slab->frozen)
 6534			deactivate_slab(slab->slab_cache, slab, slab->flush_freelist);
 6535		else
 6536			free_slab(slab->slab_cache, slab);
 6537	}
 6538}
 6539
 6540static void defer_free(struct kmem_cache *s, void *head)
 6541{
 6542	struct defer_free *df;
 6543
 6544	guard(preempt)();
 6545
 6546	head = kasan_reset_tag(head);
 6547
 6548	df = this_cpu_ptr(&defer_free_objects);
 6549	if (llist_add(head + s->offset, &df->objects))
 6550		irq_work_queue(&df->work);
 6551}
 6552
 6553static void defer_deactivate_slab(struct slab *slab, void *flush_freelist)
 6554{
 6555	struct defer_free *df;
 6556
 6557	slab->flush_freelist = flush_freelist;
 6558
 6559	guard(preempt)();
 6560
 6561	df = this_cpu_ptr(&defer_free_objects);
 6562	if (llist_add(&slab->llnode, &df->slabs))
 6563		irq_work_queue(&df->work);
 6564}
 6565
 6566void defer_free_barrier(void)
 6567{
 6568	int cpu;
 6569
 6570	for_each_possible_cpu(cpu)
 6571		irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work);
 6572}
 6573
 6574/*
 6575 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
 6576 * can perform fastpath freeing without additional function calls.
 6577 *
 6578 * The fastpath is only possible if we are freeing to the current cpu slab
 6579 * of this processor. This typically the case if we have just allocated
 6580 * the item before.
 6581 *
 6582 * If fastpath is not possible then fall back to __slab_free where we deal
 6583 * with all sorts of special processing.
 6584 *
 6585 * Bulk free of a freelist with several objects (all pointing to the
 6586 * same slab) possible by specifying head and tail ptr, plus objects
 6587 * count (cnt). Bulk free indicated by tail pointer being set.
 6588 */
 6589static __always_inline void do_slab_free(struct kmem_cache *s,
 6590				struct slab *slab, void *head, void *tail,
 6591				int cnt, unsigned long addr)
 6592{
 6593	/* cnt == 0 signals that it's called from kfree_nolock() */
 6594	bool allow_spin = cnt;
 6595	struct kmem_cache_cpu *c;
 6596	unsigned long tid;
 6597	void **freelist;
 6598
 6599redo:
 6600	/*
 6601	 * Determine the currently cpus per cpu slab.
 6602	 * The cpu may change afterward. However that does not matter since
 6603	 * data is retrieved via this pointer. If we are on the same cpu
 6604	 * during the cmpxchg then the free will succeed.
 6605	 */
 6606	c = raw_cpu_ptr(s->cpu_slab);
 6607	tid = READ_ONCE(c->tid);
 6608
 6609	/* Same with comment on barrier() in __slab_alloc_node() */
 6610	barrier();
 6611
 6612	if (unlikely(slab != c->slab)) {
 6613		if (unlikely(!allow_spin)) {
 6614			/*
 6615			 * __slab_free() can locklessly cmpxchg16 into a slab,
 6616			 * but then it might need to take spin_lock or local_lock
 6617			 * in put_cpu_partial() for further processing.
 6618			 * Avoid the complexity and simply add to a deferred list.
 6619			 */
 6620			defer_free(s, head);
 6621		} else {
 6622			__slab_free(s, slab, head, tail, cnt, addr);
 6623		}
 6624		return;
 6625	}
 6626
 6627	if (unlikely(!allow_spin)) {
 6628		if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) &&
 6629		    local_lock_is_locked(&s->cpu_slab->lock)) {
 6630			defer_free(s, head);
 6631			return;
 6632		}
 6633		cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */
 6634	}
 6635
 6636	if (USE_LOCKLESS_FAST_PATH()) {
 6637		freelist = READ_ONCE(c->freelist);
 6638
 6639		set_freepointer(s, tail, freelist);
 6640
 6641		if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
 6642			note_cmpxchg_failure("slab_free", s, tid);
 6643			goto redo;
 6644		}
 6645	} else {
 6646		__maybe_unused unsigned long flags = 0;
 6647
 6648		/* Update the free list under the local lock */
 6649		local_lock_cpu_slab(s, flags);
 6650		c = this_cpu_ptr(s->cpu_slab);
 6651		if (unlikely(slab != c->slab)) {
 6652			local_unlock_cpu_slab(s, flags);
 6653			goto redo;
 6654		}
 6655		tid = c->tid;
 6656		freelist = c->freelist;
 6657
 6658		set_freepointer(s, tail, freelist);
 6659		c->freelist = head;
 6660		c->tid = next_tid(tid);
 6661
 6662		local_unlock_cpu_slab(s, flags);
 6663	}
 6664	stat_add(s, FREE_FASTPATH, cnt);
 6665}
 6666
 6667static __fastpath_inline
 6668void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
 6669	       unsigned long addr)
 6670{
 6671	memcg_slab_free_hook(s, slab, &object, 1);
 6672	alloc_tagging_slab_free_hook(s, slab, &object, 1);
 6673
 6674	if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
 6675		return;
 6676
 6677	if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) ||
 6678				     slab_nid(slab) == numa_mem_id())
 6679			   && likely(!slab_test_pfmemalloc(slab))) {
 6680		if (likely(free_to_pcs(s, object)))
 6681			return;
 6682	}
 6683
 6684	do_slab_free(s, slab, object, object, 1, addr);
 6685}
 6686
 6687#ifdef CONFIG_MEMCG
 6688/* Do not inline the rare memcg charging failed path into the allocation path */
 6689static noinline
 6690void memcg_alloc_abort_single(struct kmem_cache *s, void *object)
 6691{
 6692	struct slab *slab = virt_to_slab(object);
 6693
 6694	alloc_tagging_slab_free_hook(s, slab, &object, 1);
 6695
 6696	if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
 6697		do_slab_free(s, slab, object, object, 1, _RET_IP_);
 6698}
 6699#endif
 6700
 6701static __fastpath_inline
 6702void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head,
 6703		    void *tail, void **p, int cnt, unsigned long addr)
 6704{
 6705	memcg_slab_free_hook(s, slab, p, cnt);
 6706	alloc_tagging_slab_free_hook(s, slab, p, cnt);
 6707	/*
 6708	 * With KASAN enabled slab_free_freelist_hook modifies the freelist
 6709	 * to remove objects, whose reuse must be delayed.
 6710	 */
 6711	if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt)))
 6712		do_slab_free(s, slab, head, tail, cnt, addr);
 6713}
 6714
 6715#ifdef CONFIG_SLUB_RCU_DEBUG
 6716static void slab_free_after_rcu_debug(struct rcu_head *rcu_head)
 6717{
 6718	struct rcu_delayed_free *delayed_free =
 6719			container_of(rcu_head, struct rcu_delayed_free, head);
 6720	void *object = delayed_free->object;
 6721	struct slab *slab = virt_to_slab(object);
 6722	struct kmem_cache *s;
 6723
 6724	kfree(delayed_free);
 6725
 6726	if (WARN_ON(is_kfence_address(object)))
 6727		return;
 6728
 6729	/* find the object and the cache again */
 6730	if (WARN_ON(!slab))
 6731		return;
 6732	s = slab->slab_cache;
 6733	if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU)))
 6734		return;
 6735
 6736	/* resume freeing */
 6737	if (slab_free_hook(s, object, slab_want_init_on_free(s), true))
 6738		do_slab_free(s, slab, object, object, 1, _THIS_IP_);
 6739}
 6740#endif /* CONFIG_SLUB_RCU_DEBUG */
 6741
 6742#ifdef CONFIG_KASAN_GENERIC
 6743void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
 6744{
 6745	do_slab_free(cache, virt_to_slab(x), x, x, 1, addr);
 6746}
 6747#endif
 6748
 6749static inline struct kmem_cache *virt_to_cache(const void *obj)
 6750{
 6751	struct slab *slab;
 6752
 6753	slab = virt_to_slab(obj);
 6754	if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__))
 6755		return NULL;
 6756	return slab->slab_cache;
 6757}
 6758
 6759static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
 6760{
 6761	struct kmem_cache *cachep;
 6762
 6763	if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
 6764	    !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS))
 6765		return s;
 6766
 6767	cachep = virt_to_cache(x);
 6768	if (WARN(cachep && cachep != s,
 6769		 "%s: Wrong slab cache. %s but object is from %s\n",
 6770		 __func__, s->name, cachep->name))
 6771		print_tracking(cachep, x);
 6772	return cachep;
 6773}
 6774
 6775/**
 6776 * kmem_cache_free - Deallocate an object
 6777 * @s: The cache the allocation was from.
 6778 * @x: The previously allocated object.
 6779 *
 6780 * Free an object which was previously allocated from this
 6781 * cache.
 6782 */
 6783void kmem_cache_free(struct kmem_cache *s, void *x)
 6784{
 6785	s = cache_from_obj(s, x);
 6786	if (!s)
 6787		return;
 6788	trace_kmem_cache_free(_RET_IP_, x, s);
 6789	slab_free(s, virt_to_slab(x), x, _RET_IP_);
 6790}
 6791EXPORT_SYMBOL(kmem_cache_free);
 6792
 6793static void free_large_kmalloc(struct page *page, void *object)
 6794{
 6795	unsigned int order = compound_order(page);
 6796
 6797	if (WARN_ON_ONCE(!PageLargeKmalloc(page))) {
 6798		dump_page(page, "Not a kmalloc allocation");
 6799		return;
 6800	}
 6801
 6802	if (WARN_ON_ONCE(order == 0))
 6803		pr_warn_once("object pointer: 0x%p\n", object);
 6804
 6805	kmemleak_free(object);
 6806	kasan_kfree_large(object);
 6807	kmsan_kfree_large(object);
 6808
 6809	mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
 6810			      -(PAGE_SIZE << order));
 6811	__ClearPageLargeKmalloc(page);
 6812	free_frozen_pages(page, order);
 6813}
 6814
 6815/*
 6816 * Given an rcu_head embedded within an object obtained from kvmalloc at an
 6817 * offset < 4k, free the object in question.
 6818 */
 6819void kvfree_rcu_cb(struct rcu_head *head)
 6820{
 6821	void *obj = head;
 6822	struct page *page;
 6823	struct slab *slab;
 6824	struct kmem_cache *s;
 6825	void *slab_addr;
 6826
 6827	if (is_vmalloc_addr(obj)) {
 6828		obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
 6829		vfree(obj);
 6830		return;
 6831	}
 6832
 6833	page = virt_to_page(obj);
 6834	slab = page_slab(page);
 6835	if (!slab) {
 6836		/*
 6837		 * rcu_head offset can be only less than page size so no need to
 6838		 * consider allocation order
 6839		 */
 6840		obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
 6841		free_large_kmalloc(page, obj);
 6842		return;
 6843	}
 6844
 6845	s = slab->slab_cache;
 6846	slab_addr = slab_address(slab);
 6847
 6848	if (is_kfence_address(obj)) {
 6849		obj = kfence_object_start(obj);
 6850	} else {
 6851		unsigned int idx = __obj_to_index(s, slab_addr, obj);
 6852
 6853		obj = slab_addr + s->size * idx;
 6854		obj = fixup_red_left(s, obj);
 6855	}
 6856
 6857	slab_free(s, slab, obj, _RET_IP_);
 6858}
 6859
 6860/**
 6861 * kfree - free previously allocated memory
 6862 * @object: pointer returned by kmalloc() or kmem_cache_alloc()
 6863 *
 6864 * If @object is NULL, no operation is performed.
 6865 */
 6866void kfree(const void *object)
 6867{
 6868	struct page *page;
 6869	struct slab *slab;
 6870	struct kmem_cache *s;
 6871	void *x = (void *)object;
 6872
 6873	trace_kfree(_RET_IP_, object);
 6874
 6875	if (unlikely(ZERO_OR_NULL_PTR(object)))
 6876		return;
 6877
 6878	page = virt_to_page(object);
 6879	slab = page_slab(page);
 6880	if (!slab) {
 6881		free_large_kmalloc(page, (void *)object);
 6882		return;
 6883	}
 6884
 6885	s = slab->slab_cache;
 6886	slab_free(s, slab, x, _RET_IP_);
 6887}
 6888EXPORT_SYMBOL(kfree);
 6889
 6890/*
 6891 * Can be called while holding raw_spinlock_t or from IRQ and NMI,
 6892 * but ONLY for objects allocated by kmalloc_nolock().
 6893 * Debug checks (like kmemleak and kfence) were skipped on allocation,
 6894 * hence
 6895 * obj = kmalloc(); kfree_nolock(obj);
 6896 * will miss kmemleak/kfence book keeping and will cause false positives.
 6897 * large_kmalloc is not supported either.
 6898 */
 6899void kfree_nolock(const void *object)
 6900{
 6901	struct slab *slab;
 6902	struct kmem_cache *s;
 6903	void *x = (void *)object;
 6904
 6905	if (unlikely(ZERO_OR_NULL_PTR(object)))
 6906		return;
 6907
 6908	slab = virt_to_slab(object);
 6909	if (unlikely(!slab)) {
 6910		WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()");
 6911		return;
 6912	}
 6913
 6914	s = slab->slab_cache;
 6915
 6916	memcg_slab_free_hook(s, slab, &x, 1);
 6917	alloc_tagging_slab_free_hook(s, slab, &x, 1);
 6918	/*
 6919	 * Unlike slab_free() do NOT call the following:
 6920	 * kmemleak_free_recursive(x, s->flags);
 6921	 * debug_check_no_locks_freed(x, s->object_size);
 6922	 * debug_check_no_obj_freed(x, s->object_size);
 6923	 * __kcsan_check_access(x, s->object_size, ..);
 6924	 * kfence_free(x);
 6925	 * since they take spinlocks or not safe from any context.
 6926	 */
 6927	kmsan_slab_free(s, x);
 6928	/*
 6929	 * If KASAN finds a kernel bug it will do kasan_report_invalid_free()
 6930	 * which will call raw_spin_lock_irqsave() which is technically
 6931	 * unsafe from NMI, but take chance and report kernel bug.
 6932	 * The sequence of
 6933	 * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI
 6934	 *  -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU
 6935	 * is double buggy and deserves to deadlock.
 6936	 */
 6937	if (kasan_slab_pre_free(s, x))
 6938		return;
 6939	/*
 6940	 * memcg, kasan_slab_pre_free are done for 'x'.
 6941	 * The only thing left is kasan_poison without quarantine,
 6942	 * since kasan quarantine takes locks and not supported from NMI.
 6943	 */
 6944	kasan_slab_free(s, x, false, false, /* skip quarantine */true);
 6945	do_slab_free(s, slab, x, x, 0, _RET_IP_);
 6946}
 6947EXPORT_SYMBOL_GPL(kfree_nolock);
 6948
 6949static __always_inline __realloc_size(2) void *
 6950__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid)
 6951{
 6952	void *ret;
 6953	size_t ks = 0;
 6954	int orig_size = 0;
 6955	struct kmem_cache *s = NULL;
 6956
 6957	if (unlikely(ZERO_OR_NULL_PTR(p)))
 6958		goto alloc_new;
 6959
 6960	/* Check for double-free. */
 6961	if (!kasan_check_byte(p))
 6962		return NULL;
 6963
 6964	/*
 6965	 * If reallocation is not necessary (e. g. the new size is less
 6966	 * than the current allocated size), the current allocation will be
 6967	 * preserved unless __GFP_THISNODE is set. In the latter case a new
 6968	 * allocation on the requested node will be attempted.
 6969	 */
 6970	if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
 6971		     nid != page_to_nid(virt_to_page(p)))
 6972		goto alloc_new;
 6973
 6974	if (is_kfence_address(p)) {
 6975		ks = orig_size = kfence_ksize(p);
 6976	} else {
 6977		struct page *page = virt_to_page(p);
 6978		struct slab *slab = page_slab(page);
 6979
 6980		if (!slab) {
 6981			/* Big kmalloc object */
 6982			ks = page_size(page);
 6983			WARN_ON(ks <= KMALLOC_MAX_CACHE_SIZE);
 6984			WARN_ON(p != page_address(page));
 6985		} else {
 6986			s = slab->slab_cache;
 6987			orig_size = get_orig_size(s, (void *)p);
 6988			ks = s->object_size;
 6989		}
 6990	}
 6991
 6992	/* If the old object doesn't fit, allocate a bigger one */
 6993	if (new_size > ks)
 6994		goto alloc_new;
 6995
 6996	/* If the old object doesn't satisfy the new alignment, allocate a new one */
 6997	if (!IS_ALIGNED((unsigned long)p, align))
 6998		goto alloc_new;
 6999
 7000	/* Zero out spare memory. */
 7001	if (want_init_on_alloc(flags)) {
 7002		kasan_disable_current();
 7003		if (orig_size && orig_size < new_size)
 7004			memset(kasan_reset_tag(p) + orig_size, 0, new_size - orig_size);
 7005		else
 7006			memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
 7007		kasan_enable_current();
 7008	}
 7009
 7010	/* Setup kmalloc redzone when needed */
 7011	if (s && slub_debug_orig_size(s)) {
 7012		set_orig_size(s, (void *)p, new_size);
 7013		if (s->flags & SLAB_RED_ZONE && new_size < ks)
 7014			memset_no_sanitize_memory(kasan_reset_tag(p) + new_size,
 7015						SLUB_RED_ACTIVE, ks - new_size);
 7016	}
 7017
 7018	p = kasan_krealloc(p, new_size, flags);
 7019	return (void *)p;
 7020
 7021alloc_new:
 7022	ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_);
 7023	if (ret && p) {
 7024		/* Disable KASAN checks as the object's redzone is accessed. */
 7025		kasan_disable_current();
 7026		memcpy(ret, kasan_reset_tag(p), orig_size ?: ks);
 7027		kasan_enable_current();
 7028	}
 7029
 7030	return ret;
 7031}
 7032
 7033/**
 7034 * krealloc_node_align - reallocate memory. The contents will remain unchanged.
 7035 * @p: object to reallocate memory for.
 7036 * @new_size: how many bytes of memory are required.
 7037 * @align: desired alignment.
 7038 * @flags: the type of memory to allocate.
 7039 * @nid: NUMA node or NUMA_NO_NODE
 7040 *
 7041 * If @p is %NULL, krealloc() behaves exactly like kmalloc().  If @new_size
 7042 * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
 7043 *
 7044 * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
 7045 * Documentation/core-api/memory-allocation.rst for more details.
 7046 *
 7047 * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
 7048 * initial memory allocation, every subsequent call to this API for the same
 7049 * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
 7050 * __GFP_ZERO is not fully honored by this API.
 7051 *
 7052 * When slub_debug_orig_size() is off, krealloc() only knows about the bucket
 7053 * size of an allocation (but not the exact size it was allocated with) and
 7054 * hence implements the following semantics for shrinking and growing buffers
 7055 * with __GFP_ZERO::
 7056 *
 7057 *           new             bucket
 7058 *   0       size             size
 7059 *   |--------|----------------|
 7060 *   |  keep  |      zero      |
 7061 *
 7062 * Otherwise, the original allocation size 'orig_size' could be used to
 7063 * precisely clear the requested size, and the new size will also be stored
 7064 * as the new 'orig_size'.
 7065 *
 7066 * In any case, the contents of the object pointed to are preserved up to the
 7067 * lesser of the new and old sizes.
 7068 *
 7069 * Return: pointer to the allocated memory or %NULL in case of error
 7070 */
 7071void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align,
 7072				 gfp_t flags, int nid)
 7073{
 7074	void *ret;
 7075
 7076	if (unlikely(!new_size)) {
 7077		kfree(p);
 7078		return ZERO_SIZE_PTR;
 7079	}
 7080
 7081	ret = __do_krealloc(p, new_size, align, flags, nid);
 7082	if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
 7083		kfree(p);
 7084
 7085	return ret;
 7086}
 7087EXPORT_SYMBOL(krealloc_node_align_noprof);
 7088
 7089static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
 7090{
 7091	/*
 7092	 * We want to attempt a large physically contiguous block first because
 7093	 * it is less likely to fragment multiple larger blocks and therefore
 7094	 * contribute to a long term fragmentation less than vmalloc fallback.
 7095	 * However make sure that larger requests are not too disruptive - i.e.
 7096	 * do not direct reclaim unless physically continuous memory is preferred
 7097	 * (__GFP_RETRY_MAYFAIL mode). We still kick in kswapd/kcompactd to
 7098	 * start working in the background
 7099	 */
 7100	if (size > PAGE_SIZE) {
 7101		flags |= __GFP_NOWARN;
 7102
 7103		if (!(flags & __GFP_RETRY_MAYFAIL))
 7104			flags &= ~__GFP_DIRECT_RECLAIM;
 7105
 7106		/* nofail semantic is implemented by the vmalloc fallback */
 7107		flags &= ~__GFP_NOFAIL;
 7108	}
 7109
 7110	return flags;
 7111}
 7112
 7113/**
 7114 * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
 7115 * failure, fall back to non-contiguous (vmalloc) allocation.
 7116 * @size: size of the request.
 7117 * @b: which set of kmalloc buckets to allocate from.
 7118 * @align: desired alignment.
 7119 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
 7120 * @node: numa node to allocate from
 7121 *
 7122 * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
 7123 * Documentation/core-api/memory-allocation.rst for more details.
 7124 *
 7125 * Uses kmalloc to get the memory but if the allocation fails then falls back
 7126 * to the vmalloc allocator. Use kvfree for freeing the memory.
 7127 *
 7128 * GFP_NOWAIT and GFP_ATOMIC are supported, the __GFP_NORETRY modifier is not.
 7129 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
 7130 * preferable to the vmalloc fallback, due to visible performance drawbacks.
 7131 *
 * Return: pointer to the allocated memory or %NULL in case of failure
 7133 */
 7134void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
 7135			     gfp_t flags, int node)
 7136{
 7137	bool allow_block;
 7138	void *ret;
 7139
 7140	/*
 7141	 * It doesn't really make sense to fallback to vmalloc for sub page
 7142	 * requests
 7143	 */
 7144	ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b),
 7145				kmalloc_gfp_adjust(flags, size),
 7146				node, _RET_IP_);
 7147	if (ret || size <= PAGE_SIZE)
 7148		return ret;
 7149
 7150	/* Don't even allow crazy sizes */
 7151	if (unlikely(size > INT_MAX)) {
 7152		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
 7153		return NULL;
 7154	}
 7155
 7156	/*
 7157	 * For non-blocking the VM_ALLOW_HUGE_VMAP is not used
 7158	 * because the huge-mapping path in vmalloc contains at
 7159	 * least one might_sleep() call.
 7160	 *
 7161	 * TODO: Revise huge-mapping path to support non-blocking
 7162	 * flags.
 7163	 */
 7164	allow_block = gfpflags_allow_blocking(flags);
 7165
 7166	/*
 7167	 * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
 7168	 * since the callers already cannot assume anything
 7169	 * about the resulting pointer, and cannot play
 7170	 * protection games.
 7171	 */
 7172	return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
 7173			flags, PAGE_KERNEL, allow_block ? VM_ALLOW_HUGE_VMAP:0,
 7174			node, __builtin_return_address(0));
 7175}
 7176EXPORT_SYMBOL(__kvmalloc_node_noprof);
 7177
 7178/**
 7179 * kvfree() - Free memory.
 7180 * @addr: Pointer to allocated memory.
 7181 *
 7182 * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
 7183 * It is slightly more efficient to use kfree() or vfree() if you are certain
 7184 * that you know which one to use.
 7185 *
 7186 * Context: Either preemptible task context or not-NMI interrupt.
 7187 */
 7188void kvfree(const void *addr)
 7189{
 7190	if (is_vmalloc_addr(addr))
 7191		vfree(addr);
 7192	else
 7193		kfree(addr);
 7194}
 7195EXPORT_SYMBOL(kvfree);
 7196
 7197/**
 7198 * kvfree_sensitive - Free a data object containing sensitive information.
 7199 * @addr: address of the data object to be freed.
 7200 * @len: length of the data object.
 7201 *
 7202 * Use the special memzero_explicit() function to clear the content of a
 7203 * kvmalloc'ed object containing sensitive data to make sure that the
 7204 * compiler won't optimize out the data clearing.
 7205 */
 7206void kvfree_sensitive(const void *addr, size_t len)
 7207{
 7208	if (likely(!ZERO_OR_NULL_PTR(addr))) {
 7209		memzero_explicit((void *)addr, len);
 7210		kvfree(addr);
 7211	}
 7212}
 7213EXPORT_SYMBOL(kvfree_sensitive);
 7214
 7215/**
 7216 * kvrealloc_node_align - reallocate memory; contents remain unchanged
 7217 * @p: object to reallocate memory for
 7218 * @size: the size to reallocate
 7219 * @align: desired alignment
 7220 * @flags: the flags for the page level allocator
 7221 * @nid: NUMA node id
 7222 *
 7223 * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
 7224 * and @p is not a %NULL pointer, the object pointed to is freed.
 7225 *
 7226 * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
 7227 * Documentation/core-api/memory-allocation.rst for more details.
 7228 *
 7229 * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
 7230 * initial memory allocation, every subsequent call to this API for the same
 7231 * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
 7232 * __GFP_ZERO is not fully honored by this API.
 7233 *
 7234 * In any case, the contents of the object pointed to are preserved up to the
 7235 * lesser of the new and old sizes.
 7236 *
 7237 * This function must not be called concurrently with itself or kvfree() for the
 7238 * same memory allocation.
 7239 *
 7240 * Return: pointer to the allocated memory or %NULL in case of error
 7241 */
 7242void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
 7243				  gfp_t flags, int nid)
 7244{
 7245	void *n;
 7246
 7247	if (is_vmalloc_addr(p))
 7248		return vrealloc_node_align_noprof(p, size, align, flags, nid);
 7249
 7250	n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid);
 7251	if (!n) {
 7252		/* We failed to krealloc(), fall back to kvmalloc(). */
 7253		n = kvmalloc_node_align_noprof(size, align, flags, nid);
 7254		if (!n)
 7255			return NULL;
 7256
 7257		if (p) {
 7258			/* We already know that `p` is not a vmalloc address. */
 7259			kasan_disable_current();
 7260			memcpy(n, kasan_reset_tag(p), ksize(p));
 7261			kasan_enable_current();
 7262
 7263			kfree(p);
 7264		}
 7265	}
 7266
 7267	return n;
 7268}
 7269EXPORT_SYMBOL(kvrealloc_node_align_noprof);
 7270
/*
 * Result of one build_detached_freelist() pass: a chain of objects from a
 * single slab, linked through their free pointers, ready to be handed to
 * the slab in one go.
 */
struct detached_freelist {
	struct slab *slab;	/* slab the objects belong to; NULL if nothing to free */
	void *tail;		/* last object of the chain (the first one picked) */
	void *freelist;		/* head of the chain */
	int cnt;		/* number of objects in the chain */
	struct kmem_cache *s;	/* cache the objects are freed to */
};
 7278
 7279/*
 7280 * This function progressively scans the array with free objects (with
 7281 * a limited look ahead) and extract objects belonging to the same
 7282 * slab.  It builds a detached freelist directly within the given
 7283 * slab/objects.  This can happen without any need for
 7284 * synchronization, because the objects are owned by running process.
 7285 * The freelist is build up as a single linked list in the objects.
 7286 * The idea is, that this detached freelist can then be bulk
 7287 * transferred to the real freelist(s), but only requiring a single
 7288 * synchronization primitive.  Look ahead in the array is limited due
 7289 * to performance reasons.
 7290 */
 7291static inline
 7292int build_detached_freelist(struct kmem_cache *s, size_t size,
 7293			    void **p, struct detached_freelist *df)
 7294{
 7295	int lookahead = 3;
 7296	void *object;
 7297	struct page *page;
 7298	struct slab *slab;
 7299	size_t same;
 7300
 7301	object = p[--size];
 7302	page = virt_to_page(object);
 7303	slab = page_slab(page);
 7304	if (!s) {
 7305		/* Handle kalloc'ed objects */
 7306		if (!slab) {
 7307			free_large_kmalloc(page, object);
 7308			df->slab = NULL;
 7309			return size;
 7310		}
 7311		/* Derive kmem_cache from object */
 7312		df->slab = slab;
 7313		df->s = slab->slab_cache;
 7314	} else {
 7315		df->slab = slab;
 7316		df->s = cache_from_obj(s, object); /* Support for memcg */
 7317	}
 7318
 7319	/* Start new detached freelist */
 7320	df->tail = object;
 7321	df->freelist = object;
 7322	df->cnt = 1;
 7323
 7324	if (is_kfence_address(object))
 7325		return size;
 7326
 7327	set_freepointer(df->s, object, NULL);
 7328
 7329	same = size;
 7330	while (size) {
 7331		object = p[--size];
 7332		/* df->slab is always set at this point */
 7333		if (df->slab == virt_to_slab(object)) {
 7334			/* Opportunity build freelist */
 7335			set_freepointer(df->s, object, df->freelist);
 7336			df->freelist = object;
 7337			df->cnt++;
 7338			same--;
 7339			if (size != same)
 7340				swap(p[size], p[same]);
 7341			continue;
 7342		}
 7343
 7344		/* Limit look ahead search */
 7345		if (!--lookahead)
 7346			break;
 7347	}
 7348
 7349	return same;
 7350}
 7351
 7352/*
 7353 * Internal bulk free of objects that were not initialised by the post alloc
 7354 * hooks and thus should not be processed by the free hooks
 7355 */
 7356static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
 7357{
 7358	if (!size)
 7359		return;
 7360
 7361	do {
 7362		struct detached_freelist df;
 7363
 7364		size = build_detached_freelist(s, size, p, &df);
 7365		if (!df.slab)
 7366			continue;
 7367
 7368		if (kfence_free(df.freelist))
 7369			continue;
 7370
 7371		do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt,
 7372			     _RET_IP_);
 7373	} while (likely(size));
 7374}
 7375
 7376/* Note that interrupts must be enabled when calling this function. */
 7377void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
 7378{
 7379	if (!size)
 7380		return;
 7381
 7382	/*
 7383	 * freeing to sheaves is so incompatible with the detached freelist so
 7384	 * once we go that way, we have to do everything differently
 7385	 */
 7386	if (s && s->cpu_sheaves) {
 7387		free_to_pcs_bulk(s, size, p);
 7388		return;
 7389	}
 7390
 7391	do {
 7392		struct detached_freelist df;
 7393
 7394		size = build_detached_freelist(s, size, p, &df);
 7395		if (!df.slab)
 7396			continue;
 7397
 7398		slab_free_bulk(df.s, df.slab, df.freelist, df.tail, &p[size],
 7399			       df.cnt, _RET_IP_);
 7400	} while (likely(size));
 7401}
 7402EXPORT_SYMBOL(kmem_cache_free_bulk);
 7403
 7404static inline
 7405int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 7406			    void **p)
 7407{
 7408	struct kmem_cache_cpu *c;
 7409	unsigned long irqflags;
 7410	int i;
 7411
 7412	/*
 7413	 * Drain objects in the per cpu slab, while disabling local
 7414	 * IRQs, which protects against PREEMPT and interrupts
 7415	 * handlers invoking normal fastpath.
 7416	 */
 7417	c = slub_get_cpu_ptr(s->cpu_slab);
 7418	local_lock_irqsave(&s->cpu_slab->lock, irqflags);
 7419
 7420	for (i = 0; i < size; i++) {
 7421		void *object = c->freelist;
 7422
 7423		if (unlikely(!object)) {
 7424			/*
 7425			 * We may have removed an object from c->freelist using
 7426			 * the fastpath in the previous iteration; in that case,
 7427			 * c->tid has not been bumped yet.
 7428			 * Since ___slab_alloc() may reenable interrupts while
 7429			 * allocating memory, we should bump c->tid now.
 7430			 */
 7431			c->tid = next_tid(c->tid);
 7432
 7433			local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
 7434
 7435			/*
 7436			 * Invoking slow path likely have side-effect
 7437			 * of re-populating per CPU c->freelist
 7438			 */
 7439			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
 7440					    _RET_IP_, c, s->object_size);
 7441			if (unlikely(!p[i]))
 7442				goto error;
 7443
 7444			c = this_cpu_ptr(s->cpu_slab);
 7445			maybe_wipe_obj_freeptr(s, p[i]);
 7446
 7447			local_lock_irqsave(&s->cpu_slab->lock, irqflags);
 7448
 7449			continue; /* goto for-loop */
 7450		}
 7451		c->freelist = get_freepointer(s, object);
 7452		p[i] = object;
 7453		maybe_wipe_obj_freeptr(s, p[i]);
 7454		stat(s, ALLOC_FASTPATH);
 7455	}
 7456	c->tid = next_tid(c->tid);
 7457	local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
 7458	slub_put_cpu_ptr(s->cpu_slab);
 7459
 7460	return i;
 7461
 7462error:
 7463	slub_put_cpu_ptr(s->cpu_slab);
 7464	__kmem_cache_free_bulk(s, i, p);
 7465	return 0;
 7466
 7467}
 7468
 7469/* Note that interrupts must be enabled when calling this function. */
 7470int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
 7471				 void **p)
 7472{
 7473	unsigned int i = 0;
 7474	void *kfence_obj;
 7475
 7476	if (!size)
 7477		return 0;
 7478
 7479	s = slab_pre_alloc_hook(s, flags);
 7480	if (unlikely(!s))
 7481		return 0;
 7482
 7483	/*
 7484	 * to make things simpler, only assume at most once kfence allocated
 7485	 * object per bulk allocation and choose its index randomly
 7486	 */
 7487	kfence_obj = kfence_alloc(s, s->object_size, flags);
 7488
 7489	if (unlikely(kfence_obj)) {
 7490		if (unlikely(size == 1)) {
 7491			p[0] = kfence_obj;
 7492			goto out;
 7493		}
 7494		size--;
 7495	}
 7496
 7497	if (s->cpu_sheaves)
 7498		i = alloc_from_pcs_bulk(s, size, p);
 7499
 7500	if (i < size) {
 7501		/*
 7502		 * If we ran out of memory, don't bother with freeing back to
 7503		 * the percpu sheaves, we have bigger problems.
 7504		 */
 7505		if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) {
 7506			if (i > 0)
 7507				__kmem_cache_free_bulk(s, i, p);
 7508			if (kfence_obj)
 7509				__kfence_free(kfence_obj);
 7510			return 0;
 7511		}
 7512	}
 7513
 7514	if (unlikely(kfence_obj)) {
 7515		int idx = get_random_u32_below(size + 1);
 7516
 7517		if (idx != size)
 7518			p[size] = p[idx];
 7519		p[idx] = kfence_obj;
 7520
 7521		size++;
 7522	}
 7523
 7524out:
 7525	/*
 7526	 * memcg and kmem_cache debug support and memory initialization.
 7527	 * Done outside of the IRQ disabled fastpath loop.
 7528	 */
 7529	if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p,
 7530		    slab_want_init_on_alloc(flags, s), s->object_size))) {
 7531		return 0;
 7532	}
 7533
 7534	return size;
 7535}
 7536EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof);
 7537
 7538/*
 7539 * Object placement in a slab is made very easy because we always start at
 7540 * offset 0. If we tune the size of the object to the alignment then we can
 7541 * get the required alignment by putting one properly sized object after
 7542 * another.
 7543 *
 7544 * Notice that the allocation order determines the sizes of the per cpu
 7545 * caches. Each processor has always one slab available for allocations.
 7546 * Increasing the allocation order reduces the number of times that slabs
 7547 * must be moved on and off the partial lists and is therefore a factor in
 7548 * locking overhead.
 7549 */
 7550
/*
 * Minimum / Maximum order of slab pages. This influences locking overhead
 * and slab fragmentation. A higher order reduces the number of partial slabs
 * and increases the number of allocations possible without having to
 * take the list_lock.
 */
/* Lowest page order considered by calculate_order(); defaults to 0. */
static unsigned int slub_min_order;
/* Highest page order tried before falling back to the smallest fitting order. */
static unsigned int slub_max_order =
	IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER;
/* Desired objects per slab; 0 lets calculate_order() derive it from the CPU count. */
static unsigned int slub_min_objects;
 7561
 7562/*
 7563 * Calculate the order of allocation given an slab object size.
 7564 *
 7565 * The order of allocation has significant impact on performance and other
 7566 * system components. Generally order 0 allocations should be preferred since
 7567 * order 0 does not cause fragmentation in the page allocator. Larger objects
 7568 * be problematic to put into order 0 slabs because there may be too much
 7569 * unused space left. We go to a higher order if more than 1/16th of the slab
 7570 * would be wasted.
 7571 *
 7572 * In order to reach satisfactory performance we must ensure that a minimum
 7573 * number of objects is in one slab. Otherwise we may generate too much
 7574 * activity on the partial lists which requires taking the list_lock. This is
 7575 * less a concern for large slabs though which are rarely used.
 7576 *
 7577 * slab_max_order specifies the order where we begin to stop considering the
 7578 * number of objects in a slab as critical. If we reach slab_max_order then
 7579 * we try to keep the page order as low as possible. So we accept more waste
 7580 * of space in favor of a small page order.
 7581 *
 7582 * Higher order allocations also allow the placement of more objects in a
 7583 * slab and thereby reduce object handling overhead. If the user has
 7584 * requested a higher minimum order then we start with that one instead of
 7585 * the smallest order which will fit the object.
 7586 */
 7587static inline unsigned int calc_slab_order(unsigned int size,
 7588		unsigned int min_order, unsigned int max_order,
 7589		unsigned int fract_leftover)
 7590{
 7591	unsigned int order;
 7592
 7593	for (order = min_order; order <= max_order; order++) {
 7594
 7595		unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
 7596		unsigned int rem;
 7597
 7598		rem = slab_size % size;
 7599
 7600		if (rem <= slab_size / fract_leftover)
 7601			break;
 7602	}
 7603
 7604	return order;
 7605}
 7606
 7607static inline int calculate_order(unsigned int size)
 7608{
 7609	unsigned int order;
 7610	unsigned int min_objects;
 7611	unsigned int max_objects;
 7612	unsigned int min_order;
 7613
 7614	min_objects = slub_min_objects;
 7615	if (!min_objects) {
 7616		/*
 7617		 * Some architectures will only update present cpus when
 7618		 * onlining them, so don't trust the number if it's just 1. But
 7619		 * we also don't want to use nr_cpu_ids always, as on some other
 7620		 * architectures, there can be many possible cpus, but never
 7621		 * onlined. Here we compromise between trying to avoid too high
 7622		 * order on systems that appear larger than they are, and too
 7623		 * low order on systems that appear smaller than they are.
 7624		 */
 7625		unsigned int nr_cpus = num_present_cpus();
 7626		if (nr_cpus <= 1)
 7627			nr_cpus = nr_cpu_ids;
 7628		min_objects = 4 * (fls(nr_cpus) + 1);
 7629	}
 7630	/* min_objects can't be 0 because get_order(0) is undefined */
 7631	max_objects = max(order_objects(slub_max_order, size), 1U);
 7632	min_objects = min(min_objects, max_objects);
 7633
 7634	min_order = max_t(unsigned int, slub_min_order,
 7635			  get_order(min_objects * size));
 7636	if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
 7637		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
 7638
 7639	/*
 7640	 * Attempt to find best configuration for a slab. This works by first
 7641	 * attempting to generate a layout with the best possible configuration
 7642	 * and backing off gradually.
 7643	 *
 7644	 * We start with accepting at most 1/16 waste and try to find the
 7645	 * smallest order from min_objects-derived/slab_min_order up to
 7646	 * slab_max_order that will satisfy the constraint. Note that increasing
 7647	 * the order can only result in same or less fractional waste, not more.
 7648	 *
 7649	 * If that fails, we increase the acceptable fraction of waste and try
 7650	 * again. The last iteration with fraction of 1/2 would effectively
 7651	 * accept any waste and give us the order determined by min_objects, as
 7652	 * long as at least single object fits within slab_max_order.
 7653	 */
 7654	for (unsigned int fraction = 16; fraction > 1; fraction /= 2) {
 7655		order = calc_slab_order(size, min_order, slub_max_order,
 7656					fraction);
 7657		if (order <= slub_max_order)
 7658			return order;
 7659	}
 7660
 7661	/*
 7662	 * Doh this slab cannot be placed using slab_max_order.
 7663	 */
 7664	order = get_order(size);
 7665	if (order <= MAX_PAGE_ORDER)
 7666		return order;
 7667	return -ENOSYS;
 7668}
 7669
 7670static void
 7671init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn)
 7672{
 7673	n->nr_partial = 0;
 7674	spin_lock_init(&n->list_lock);
 7675	INIT_LIST_HEAD(&n->partial);
 7676#ifdef CONFIG_SLUB_DEBUG
 7677	atomic_long_set(&n->nr_slabs, 0);
 7678	atomic_long_set(&n->total_objects, 0);
 7679	INIT_LIST_HEAD(&n->full);
 7680#endif
 7681	n->barn = barn;
 7682	if (barn)
 7683		barn_init(barn);
 7684}
 7685
 7686static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
 7687{
 7688	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
 7689			NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH *
 7690			sizeof(struct kmem_cache_cpu));
 7691
 7692	/*
 7693	 * Must align to double word boundary for the double cmpxchg
 7694	 * instructions to work; see __pcpu_double_call_return_bool().
 7695	 */
 7696	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
 7697				     2 * sizeof(void *));
 7698
 7699	if (!s->cpu_slab)
 7700		return 0;
 7701
 7702	init_kmem_cache_cpus(s);
 7703
 7704	return 1;
 7705}
 7706
 7707static int init_percpu_sheaves(struct kmem_cache *s)
 7708{
 7709	int cpu;
 7710
 7711	for_each_possible_cpu(cpu) {
 7712		struct slub_percpu_sheaves *pcs;
 7713
 7714		pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 7715
 7716		local_trylock_init(&pcs->lock);
 7717
 7718		pcs->main = alloc_empty_sheaf(s, GFP_KERNEL);
 7719
 7720		if (!pcs->main)
 7721			return -ENOMEM;
 7722	}
 7723
 7724	return 0;
 7725}
 7726
 7727static struct kmem_cache *kmem_cache_node;
 7728
 7729/*
 7730 * No kmalloc_node yet so do it by hand. We know that this is the first
 7731 * slab on the node for this slabcache. There are no concurrent accesses
 7732 * possible.
 7733 *
 7734 * Note that this function only works on the kmem_cache_node
 7735 * when allocating for the kmem_cache_node. This is used for bootstrapping
 7736 * memory on a fresh node that has no slab structures yet.
 7737 */
 7738static void early_kmem_cache_node_alloc(int node)
 7739{
 7740	struct slab *slab;
 7741	struct kmem_cache_node *n;
 7742
 7743	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
 7744
 7745	slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
 7746
 7747	BUG_ON(!slab);
 7748	if (slab_nid(slab) != node) {
 7749		pr_err("SLUB: Unable to allocate memory from node %d\n", node);
 7750		pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
 7751	}
 7752
 7753	n = slab->freelist;
 7754	BUG_ON(!n);
 7755#ifdef CONFIG_SLUB_DEBUG
 7756	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
 7757#endif
 7758	n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
 7759	slab->freelist = get_freepointer(kmem_cache_node, n);
 7760	slab->inuse = 1;
 7761	kmem_cache_node->node[node] = n;
 7762	init_kmem_cache_node(n, NULL);
 7763	inc_slabs_node(kmem_cache_node, node, slab->objects);
 7764
 7765	/*
 7766	 * No locks need to be taken here as it has just been
 7767	 * initialized and there is no concurrent access.
 7768	 */
 7769	__add_partial(n, slab, DEACTIVATE_TO_HEAD);
 7770}
 7771
 7772static void free_kmem_cache_nodes(struct kmem_cache *s)
 7773{
 7774	int node;
 7775	struct kmem_cache_node *n;
 7776
 7777	for_each_kmem_cache_node(s, node, n) {
 7778		if (n->barn) {
 7779			WARN_ON(n->barn->nr_full);
 7780			WARN_ON(n->barn->nr_empty);
 7781			kfree(n->barn);
 7782			n->barn = NULL;
 7783		}
 7784
 7785		s->node[node] = NULL;
 7786		kmem_cache_free(kmem_cache_node, n);
 7787	}
 7788}
 7789
/*
 * Release the resources attached to cache @s: the random sequence, the
 * per cpu sheaves (when the cache has them), the per cpu slab structures
 * and finally the per node structures.
 */
void __kmem_cache_release(struct kmem_cache *s)
{
	cache_random_seq_destroy(s);
	if (s->cpu_sheaves)
		pcs_destroy(s);
#ifdef CONFIG_PREEMPT_RT
	/* Only unregister the lockdep key if the per cpu slabs were allocated. */
	if (s->cpu_slab)
		lockdep_unregister_key(&s->lock_key);
#endif
	free_percpu(s->cpu_slab);
	free_kmem_cache_nodes(s);
}
 7802
 7803static int init_kmem_cache_nodes(struct kmem_cache *s)
 7804{
 7805	int node;
 7806
 7807	for_each_node_mask(node, slab_nodes) {
 7808		struct kmem_cache_node *n;
 7809		struct node_barn *barn = NULL;
 7810
 7811		if (slab_state == DOWN) {
 7812			early_kmem_cache_node_alloc(node);
 7813			continue;
 7814		}
 7815
 7816		if (s->cpu_sheaves) {
 7817			barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
 7818
 7819			if (!barn)
 7820				return 0;
 7821		}
 7822
 7823		n = kmem_cache_alloc_node(kmem_cache_node,
 7824						GFP_KERNEL, node);
 7825		if (!n) {
 7826			kfree(barn);
 7827			return 0;
 7828		}
 7829
 7830		init_kmem_cache_node(n, barn);
 7831
 7832		s->node[node] = n;
 7833	}
 7834	return 1;
 7835}
 7836
 7837static void set_cpu_partial(struct kmem_cache *s)
 7838{
 7839#ifdef CONFIG_SLUB_CPU_PARTIAL
 7840	unsigned int nr_objects;
 7841
 7842	/*
 7843	 * cpu_partial determined the maximum number of objects kept in the
 7844	 * per cpu partial lists of a processor.
 7845	 *
 7846	 * Per cpu partial lists mainly contain slabs that just have one
 7847	 * object freed. If they are used for allocation then they can be
 7848	 * filled up again with minimal effort. The slab will never hit the
 7849	 * per node partial lists and therefore no locking will be required.
 7850	 *
 7851	 * For backwards compatibility reasons, this is determined as number
 7852	 * of objects, even though we now limit maximum number of pages, see
 7853	 * slub_set_cpu_partial()
 7854	 */
 7855	if (!kmem_cache_has_cpu_partial(s))
 7856		nr_objects = 0;
 7857	else if (s->size >= PAGE_SIZE)
 7858		nr_objects = 6;
 7859	else if (s->size >= 1024)
 7860		nr_objects = 24;
 7861	else if (s->size >= 256)
 7862		nr_objects = 52;
 7863	else
 7864		nr_objects = 120;
 7865
 7866	slub_set_cpu_partial(s, nr_objects);
 7867#endif
 7868}
 7869
 7870/*
 7871 * calculate_sizes() determines the order and the distribution of data within
 7872 * a slab object.
 7873 */
 7874static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
 7875{
 7876	slab_flags_t flags = s->flags;
 7877	unsigned int size = s->object_size;
 7878	unsigned int order;
 7879
 7880	/*
 7881	 * Round up object size to the next word boundary. We can only
 7882	 * place the free pointer at word boundaries and this determines
 7883	 * the possible location of the free pointer.
 7884	 */
 7885	size = ALIGN(size, sizeof(void *));
 7886
 7887#ifdef CONFIG_SLUB_DEBUG
 7888	/*
 7889	 * Determine if we can poison the object itself. If the user of
 7890	 * the slab may touch the object after free or before allocation
 7891	 * then we should never poison the object itself.
 7892	 */
 7893	if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
 7894			!s->ctor)
 7895		s->flags |= __OBJECT_POISON;
 7896	else
 7897		s->flags &= ~__OBJECT_POISON;
 7898
 7899
 7900	/*
 7901	 * If we are Redzoning then check if there is some space between the
 7902	 * end of the object and the free pointer. If not then add an
 7903	 * additional word to have some bytes to store Redzone information.
 7904	 */
 7905	if ((flags & SLAB_RED_ZONE) && size == s->object_size)
 7906		size += sizeof(void *);
 7907#endif
 7908
 7909	/*
 7910	 * With that we have determined the number of bytes in actual use
 7911	 * by the object and redzoning.
 7912	 */
 7913	s->inuse = size;
 7914
 7915	if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) ||
 7916	    (flags & SLAB_POISON) || s->ctor ||
 7917	    ((flags & SLAB_RED_ZONE) &&
 7918	     (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
 7919		/*
 7920		 * Relocate free pointer after the object if it is not
 7921		 * permitted to overwrite the first word of the object on
 7922		 * kmem_cache_free.
 7923		 *
 7924		 * This is the case if we do RCU, have a constructor, are
 7925		 * poisoning the objects, or are redzoning an object smaller
 7926		 * than sizeof(void *) or are redzoning an object with
 7927		 * slub_debug_orig_size() enabled, in which case the right
 7928		 * redzone may be extended.
 7929		 *
 7930		 * The assumption that s->offset >= s->inuse means free
 7931		 * pointer is outside of the object is used in the
 7932		 * freeptr_outside_object() function. If that is no
 7933		 * longer true, the function needs to be modified.
 7934		 */
 7935		s->offset = size;
 7936		size += sizeof(void *);
 7937	} else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) {
 7938		s->offset = args->freeptr_offset;
 7939	} else {
 7940		/*
 7941		 * Store freelist pointer near middle of object to keep
 7942		 * it away from the edges of the object to avoid small
 7943		 * sized over/underflows from neighboring allocations.
 7944		 */
 7945		s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
 7946	}
 7947
 7948#ifdef CONFIG_SLUB_DEBUG
 7949	if (flags & SLAB_STORE_USER) {
 7950		/*
 7951		 * Need to store information about allocs and frees after
 7952		 * the object.
 7953		 */
 7954		size += 2 * sizeof(struct track);
 7955
 7956		/* Save the original kmalloc request size */
 7957		if (flags & SLAB_KMALLOC)
 7958			size += sizeof(unsigned int);
 7959	}
 7960#endif
 7961
 7962	kasan_cache_create(s, &size, &s->flags);
 7963#ifdef CONFIG_SLUB_DEBUG
 7964	if (flags & SLAB_RED_ZONE) {
 7965		/*
 7966		 * Add some empty padding so that we can catch
 7967		 * overwrites from earlier objects rather than let
 7968		 * tracking information or the free pointer be
 7969		 * corrupted if a user writes before the start
 7970		 * of the object.
 7971		 */
 7972		size += sizeof(void *);
 7973
 7974		s->red_left_pad = sizeof(void *);
 7975		s->red_left_pad = ALIGN(s->red_left_pad, s->align);
 7976		size += s->red_left_pad;
 7977	}
 7978#endif
 7979
 7980	/*
 7981	 * SLUB stores one object immediately after another beginning from
 7982	 * offset 0. In order to align the objects we have to simply size
 7983	 * each object to conform to the alignment.
 7984	 */
 7985	size = ALIGN(size, s->align);
 7986	s->size = size;
 7987	s->reciprocal_size = reciprocal_value(size);
 7988	order = calculate_order(size);
 7989
 7990	if ((int)order < 0)
 7991		return 0;
 7992
 7993	s->allocflags = __GFP_COMP;
 7994
 7995	if (s->flags & SLAB_CACHE_DMA)
 7996		s->allocflags |= GFP_DMA;
 7997
 7998	if (s->flags & SLAB_CACHE_DMA32)
 7999		s->allocflags |= GFP_DMA32;
 8000
 8001	if (s->flags & SLAB_RECLAIM_ACCOUNT)
 8002		s->allocflags |= __GFP_RECLAIMABLE;
 8003
 8004	/*
 8005	 * Determine the number of objects per slab
 8006	 */
 8007	s->oo = oo_make(order, size);
 8008	s->min = oo_make(get_order(size), size);
 8009
 8010	return !!oo_objects(s->oo);
 8011}
 8012
 8013static void list_slab_objects(struct kmem_cache *s, struct slab *slab)
 8014{
 8015#ifdef CONFIG_SLUB_DEBUG
 8016	void *addr = slab_address(slab);
 8017	void *p;
 8018
 8019	if (!slab_add_kunit_errors())
 8020		slab_bug(s, "Objects remaining on __kmem_cache_shutdown()");
 8021
 8022	spin_lock(&object_map_lock);
 8023	__fill_map(object_map, s, slab);
 8024
 8025	for_each_object(p, s, addr, slab->objects) {
 8026
 8027		if (!test_bit(__obj_to_index(s, addr, p), object_map)) {
 8028			if (slab_add_kunit_errors())
 8029				continue;
 8030			pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
 8031			print_tracking(s, p);
 8032		}
 8033	}
 8034	spin_unlock(&object_map_lock);
 8035
 8036	__slab_err(slab);
 8037#endif
 8038}
 8039
 8040/*
 8041 * Attempt to free all partial slabs on a node.
 8042 * This is called from __kmem_cache_shutdown(). We must take list_lock
 8043 * because sysfs file might still access partial list after the shutdowning.
 8044 */
 8045static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 8046{
 8047	LIST_HEAD(discard);
 8048	struct slab *slab, *h;
 8049
 8050	BUG_ON(irqs_disabled());
 8051	spin_lock_irq(&n->list_lock);
 8052	list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
 8053		if (!slab->inuse) {
 8054			remove_partial(n, slab);
 8055			list_add(&slab->slab_list, &discard);
 8056		} else {
 8057			list_slab_objects(s, slab);
 8058		}
 8059	}
 8060	spin_unlock_irq(&n->list_lock);
 8061
 8062	list_for_each_entry_safe(slab, h, &discard, slab_list)
 8063		discard_slab(s, slab);
 8064}
 8065
 8066bool __kmem_cache_empty(struct kmem_cache *s)
 8067{
 8068	int node;
 8069	struct kmem_cache_node *n;
 8070
 8071	for_each_kmem_cache_node(s, node, n)
 8072		if (n->nr_partial || node_nr_slabs(n))
 8073			return false;
 8074	return true;
 8075}
 8076
 8077/*
 8078 * Release all resources used by a slab cache.
 8079 */
 8080int __kmem_cache_shutdown(struct kmem_cache *s)
 8081{
 8082	int node;
 8083	struct kmem_cache_node *n;
 8084
 8085	flush_all_cpus_locked(s);
 8086
 8087	/* we might have rcu sheaves in flight */
 8088	if (s->cpu_sheaves)
 8089		rcu_barrier();
 8090
 8091	/* Attempt to free all objects */
 8092	for_each_kmem_cache_node(s, node, n) {
 8093		if (n->barn)
 8094			barn_shrink(s, n->barn);
 8095		free_partial(s, n);
 8096		if (n->nr_partial || node_nr_slabs(n))
 8097			return 1;
 8098	}
 8099	return 0;
 8100}
 8101
 8102#ifdef CONFIG_PRINTK
 8103void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
 8104{
 8105	void *base;
 8106	int __maybe_unused i;
 8107	unsigned int objnr;
 8108	void *objp;
 8109	void *objp0;
 8110	struct kmem_cache *s = slab->slab_cache;
 8111	struct track __maybe_unused *trackp;
 8112
 8113	kpp->kp_ptr = object;
 8114	kpp->kp_slab = slab;
 8115	kpp->kp_slab_cache = s;
 8116	base = slab_address(slab);
 8117	objp0 = kasan_reset_tag(object);
 8118#ifdef CONFIG_SLUB_DEBUG
 8119	objp = restore_red_left(s, objp0);
 8120#else
 8121	objp = objp0;
 8122#endif
 8123	objnr = obj_to_index(s, slab, objp);
 8124	kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
 8125	objp = base + s->size * objnr;
 8126	kpp->kp_objp = objp;
 8127	if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size
 8128			 || (objp - base) % s->size) ||
 8129	    !(s->flags & SLAB_STORE_USER))
 8130		return;
 8131#ifdef CONFIG_SLUB_DEBUG
 8132	objp = fixup_red_left(s, objp);
 8133	trackp = get_track(s, objp, TRACK_ALLOC);
 8134	kpp->kp_ret = (void *)trackp->addr;
 8135#ifdef CONFIG_STACKDEPOT
 8136	{
 8137		depot_stack_handle_t handle;
 8138		unsigned long *entries;
 8139		unsigned int nr_entries;
 8140
 8141		handle = READ_ONCE(trackp->handle);
 8142		if (handle) {
 8143			nr_entries = stack_depot_fetch(handle, &entries);
 8144			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
 8145				kpp->kp_stack[i] = (void *)entries[i];
 8146		}
 8147
 8148		trackp = get_track(s, objp, TRACK_FREE);
 8149		handle = READ_ONCE(trackp->handle);
 8150		if (handle) {
 8151			nr_entries = stack_depot_fetch(handle, &entries);
 8152			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
 8153				kpp->kp_free_stack[i] = (void *)entries[i];
 8154		}
 8155	}
 8156#endif
 8157#endif
 8158}
 8159#endif
 8160
 8161/********************************************************************
 8162 *		Kmalloc subsystem
 8163 *******************************************************************/
 8164
 8165static int __init setup_slub_min_order(const char *str, const struct kernel_param *kp)
 8166{
 8167	int ret;
 8168
 8169	ret = kstrtouint(str, 0, &slub_min_order);
 8170	if (ret)
 8171		return ret;
 8172
 8173	if (slub_min_order > slub_max_order)
 8174		slub_max_order = slub_min_order;
 8175
 8176	return 0;
 8177}
 8178
 8179static const struct kernel_param_ops param_ops_slab_min_order __initconst = {
 8180	.set = setup_slub_min_order,
 8181};
 8182__core_param_cb(slab_min_order, &param_ops_slab_min_order, &slub_min_order, 0);
 8183__core_param_cb(slub_min_order, &param_ops_slab_min_order, &slub_min_order, 0);
 8184
 8185static int __init setup_slub_max_order(const char *str, const struct kernel_param *kp)
 8186{
 8187	int ret;
 8188
 8189	ret = kstrtouint(str, 0, &slub_max_order);
 8190	if (ret)
 8191		return ret;
 8192
 8193	slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER);
 8194
 8195	if (slub_min_order > slub_max_order)
 8196		slub_min_order = slub_max_order;
 8197
 8198	return 0;
 8199}
 8200
/*
 * "slab_max_order" and "slub_max_order" share one handler and one backing
 * variable (slub_max_order).
 */
static const struct kernel_param_ops param_ops_slab_max_order __initconst = {
	.set = setup_slub_max_order,
};
__core_param_cb(slab_max_order, &param_ops_slab_max_order, &slub_max_order, 0);
__core_param_cb(slub_max_order, &param_ops_slab_max_order, &slub_max_order, 0);

/* Two parameter names, both backed by the uint slub_min_objects. */
core_param(slab_min_objects, slub_min_objects, uint, 0);
core_param(slub_min_objects, slub_min_objects, uint, 0);
 8209
 8210#ifdef CONFIG_NUMA
 8211static int __init setup_slab_strict_numa(const char *str, const struct kernel_param *kp)
 8212{
 8213	if (nr_node_ids > 1) {
 8214		static_branch_enable(&strict_numa);
 8215		pr_info("SLUB: Strict NUMA enabled.\n");
 8216	} else {
 8217		pr_warn("slab_strict_numa parameter set on non NUMA system.\n");
 8218	}
 8219
 8220	return 0;
 8221}
 8222
/*
 * "slab_strict_numa" is registered with KERNEL_PARAM_OPS_FL_NOARG and
 * without a backing variable (NULL); the handler does all the work.
 */
static const struct kernel_param_ops param_ops_slab_strict_numa __initconst = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,
	.set = setup_slab_strict_numa,
};
__core_param_cb(slab_strict_numa, &param_ops_slab_strict_numa, NULL, 0);
 8228#endif
 8229
 8230
 8231#ifdef CONFIG_HARDENED_USERCOPY
/*
 * Rejects incorrectly sized objects and objects that are to be copied
 * to/from userspace but do not fall entirely within the containing slab
 * cache's usercopy region.
 *
 * @ptr:     start of the range being copied
 * @n:       length of the range in bytes
 * @slab:    slab that contains @ptr
 * @to_user: copy direction, passed through to usercopy_abort()
 *
 * There is no return value. When the range is acceptable the function
 * simply returns; every violation is reported through usercopy_abort().
 */
void __check_heap_object(const void *ptr, unsigned long n,
			 const struct slab *slab, bool to_user)
{
	struct kmem_cache *s;
	unsigned int offset;
	/* Evaluated on the pointer as passed in, before the tag is reset. */
	bool is_kfence = is_kfence_address(ptr);

	ptr = kasan_reset_tag(ptr);

	/* Find object and usable object size. */
	s = slab->slab_cache;

	/* Reject impossible pointers. */
	if (ptr < slab_address(slab))
		usercopy_abort("SLUB object not in SLUB page?!", NULL,
			       to_user, 0, n);

	/* Find offset within object. */
	if (is_kfence)
		offset = ptr - kfence_object_start(ptr);
	else
		offset = (ptr - slab_address(slab)) % s->size;

	/* Adjust for redzone and reject if within the redzone. */
	if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
		if (offset < s->red_left_pad)
			usercopy_abort("SLUB object in left red zone",
				       s->name, to_user, offset, n);
		offset -= s->red_left_pad;
	}

	/*
	 * Allow address range falling entirely within usercopy region:
	 * the start must not lie before useroffset, must not lie more than
	 * usersize bytes past it, and @n must fit in what remains of the
	 * region after the start.
	 */
	if (offset >= s->useroffset &&
	    offset - s->useroffset <= s->usersize &&
	    n <= s->useroffset - offset + s->usersize)
		return;

	usercopy_abort("SLUB object", s->name, to_user, offset, n);
}
 8279#endif /* CONFIG_HARDENED_USERCOPY */
 8280
/* Number of promote buckets; slabs with more free objects are left in place. */
#define SHRINK_PROMOTE_MAX 32

/*
 * kmem_cache_shrink discards empty slabs and promotes the slabs filled
 * up most to the head of the partial lists. New allocations will then
 * fill those up and thus they can be removed from the partial lists.
 *
 * The slabs with the least items are placed last. This results in them
 * being allocated from last increasing the chance that the last objects
 * are freed in them.
 *
 * Returns 1 if at least one node still reports slabs through
 * node_nr_slabs() after its pass, 0 otherwise.
 */
static int __kmem_cache_do_shrink(struct kmem_cache *s)
{
	int node;
	int i;
	struct kmem_cache_node *n;
	struct slab *slab;
	struct slab *t;
	struct list_head discard;
	/* promote[i] collects the slabs that have i + 1 free objects. */
	struct list_head promote[SHRINK_PROMOTE_MAX];
	unsigned long flags;
	int ret = 0;

	for_each_kmem_cache_node(s, node, n) {
		INIT_LIST_HEAD(&discard);
		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
			INIT_LIST_HEAD(promote + i);

		/* The barn is shrunk before list_lock is taken. */
		if (n->barn)
			barn_shrink(s, n->barn);

		spin_lock_irqsave(&n->list_lock, flags);

		/*
		 * Build lists of slabs to discard or promote.
		 *
		 * Note that concurrent frees may occur while we hold the
		 * list_lock. slab->inuse here is the upper limit.
		 */
		list_for_each_entry_safe(slab, t, &n->partial, slab_list) {
			int free = slab->objects - slab->inuse;

			/* Do not reread slab->inuse */
			barrier();

			/* We do not keep full slabs on the list */
			BUG_ON(free <= 0);

			if (free == slab->objects) {
				/* Completely empty: unaccount and queue for freeing. */
				list_move(&slab->slab_list, &discard);
				slab_clear_node_partial(slab);
				n->nr_partial--;
				dec_slabs_node(s, node, slab->objects);
			} else if (free <= SHRINK_PROMOTE_MAX)
				list_move(&slab->slab_list, promote + free - 1);
		}

		/*
		 * Promote the slabs filled up most to the head of the
		 * partial list.
		 */
		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
			list_splice(promote + i, &n->partial);

		spin_unlock_irqrestore(&n->list_lock, flags);

		/* Release empty slabs */
		list_for_each_entry_safe(slab, t, &discard, slab_list)
			free_slab(s, slab);

		if (node_nr_slabs(n))
			ret = 1;
	}

	return ret;
}
 8357
 8358int __kmem_cache_shrink(struct kmem_cache *s)
 8359{
 8360	flush_all(s);
 8361	return __kmem_cache_do_shrink(s);
 8362}
 8363
 8364static int slab_mem_going_offline_callback(void)
 8365{
 8366	struct kmem_cache *s;
 8367
 8368	mutex_lock(&slab_mutex);
 8369	list_for_each_entry(s, &slab_caches, list) {
 8370		flush_all_cpus_locked(s);
 8371		__kmem_cache_do_shrink(s);
 8372	}
 8373	mutex_unlock(&slab_mutex);
 8374
 8375	return 0;
 8376}
 8377
 8378static int slab_mem_going_online_callback(int nid)
 8379{
 8380	struct kmem_cache_node *n;
 8381	struct kmem_cache *s;
 8382	int ret = 0;
 8383
 8384	/*
 8385	 * We are bringing a node online. No memory is available yet. We must
 8386	 * allocate a kmem_cache_node structure in order to bring the node
 8387	 * online.
 8388	 */
 8389	mutex_lock(&slab_mutex);
 8390	list_for_each_entry(s, &slab_caches, list) {
 8391		struct node_barn *barn = NULL;
 8392
 8393		/*
 8394		 * The structure may already exist if the node was previously
 8395		 * onlined and offlined.
 8396		 */
 8397		if (get_node(s, nid))
 8398			continue;
 8399
 8400		if (s->cpu_sheaves) {
 8401			barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid);
 8402
 8403			if (!barn) {
 8404				ret = -ENOMEM;
 8405				goto out;
 8406			}
 8407		}
 8408
 8409		/*
 8410		 * XXX: kmem_cache_alloc_node will fallback to other nodes
 8411		 *      since memory is not yet available from the node that
 8412		 *      is brought up.
 8413		 */
 8414		n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
 8415		if (!n) {
 8416			kfree(barn);
 8417			ret = -ENOMEM;
 8418			goto out;
 8419		}
 8420
 8421		init_kmem_cache_node(n, barn);
 8422
 8423		s->node[nid] = n;
 8424	}
 8425	/*
 8426	 * Any cache created after this point will also have kmem_cache_node
 8427	 * initialized for the new node.
 8428	 */
 8429	node_set(nid, slab_nodes);
 8430out:
 8431	mutex_unlock(&slab_mutex);
 8432	return ret;
 8433}
 8434
 8435static int slab_memory_callback(struct notifier_block *self,
 8436				unsigned long action, void *arg)
 8437{
 8438	struct node_notify *nn = arg;
 8439	int nid = nn->nid;
 8440	int ret = 0;
 8441
 8442	switch (action) {
 8443	case NODE_ADDING_FIRST_MEMORY:
 8444		ret = slab_mem_going_online_callback(nid);
 8445		break;
 8446	case NODE_REMOVING_LAST_MEMORY:
 8447		ret = slab_mem_going_offline_callback();
 8448		break;
 8449	}
 8450	if (ret)
 8451		ret = notifier_from_errno(ret);
 8452	else
 8453		ret = NOTIFY_OK;
 8454	return ret;
 8455}
 8456
 8457/********************************************************************
 8458 *			Basic setup of slabs
 8459 *******************************************************************/
 8460
/*
 * Used for early kmem_cache structures that were allocated using
 * the page allocator. Allocate them properly then fix up the pointers
 * that may be pointing to the wrong kmem_cache structure.
 *
 * @static_cache is the statically allocated boot-time descriptor. The
 * return value is its replacement, allocated from kmem_cache and already
 * linked into slab_caches.
 */

static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
{
	int node;
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
	struct kmem_cache_node *n;

	/* Only kmem_cache->object_size bytes of the descriptor are copied. */
	memcpy(s, static_cache, kmem_cache->object_size);

	/*
	 * This runs very early, and only the boot processor is supposed to be
	 * up.  Even if it weren't true, IRQs are not up so we couldn't fire
	 * IPIs around.
	 */
	__flush_cpu_slab(s, smp_processor_id());
	for_each_kmem_cache_node(s, node, n) {
		struct slab *p;

		/* Repoint every slab on the node lists at the new descriptor. */
		list_for_each_entry(p, &n->partial, slab_list)
			p->slab_cache = s;

#ifdef CONFIG_SLUB_DEBUG
		list_for_each_entry(p, &n->full, slab_list)
			p->slab_cache = s;
#endif
	}
	list_add(&s->list, &slab_caches);
	return s;
}
 8495
/*
 * Boot-time initialisation of the allocator.
 *
 * Two statically allocated descriptors are used to create the
 * "kmem_cache_node" and "kmem_cache" caches, after which bootstrap()
 * replaces both with properly allocated copies. The kmalloc caches,
 * freelist randomization and the CPU hotplug state are set up afterwards,
 * and a one-line summary of the configuration is printed.
 */
void __init kmem_cache_init(void)
{
	static __initdata struct kmem_cache boot_kmem_cache,
		boot_kmem_cache_node;
	int node;

	/* A non-zero debug_guardpage_minorder() forces slub_max_order to 0. */
	if (debug_guardpage_minorder())
		slub_max_order = 0;

	/* Inform pointer hashing choice about slub debugging state. */
	hash_pointers_finalize(__slub_debug_enabled());

	kmem_cache_node = &boot_kmem_cache_node;
	kmem_cache = &boot_kmem_cache;

	/*
	 * Initialize the nodemask for which we will allocate per node
	 * structures. Here we don't need taking slab_mutex yet.
	 */
	for_each_node_state(node, N_MEMORY)
		node_set(node, slab_nodes);

	create_boot_cache(kmem_cache_node, "kmem_cache_node",
			sizeof(struct kmem_cache_node),
			SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);

	hotplug_node_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);

	/* Able to allocate the per node structures */
	slab_state = PARTIAL;

	/*
	 * The object size covers the descriptor up to its node[] member
	 * plus one kmem_cache_node pointer per possible node id.
	 */
	create_boot_cache(kmem_cache, "kmem_cache",
			offsetof(struct kmem_cache, node) +
				nr_node_ids * sizeof(struct kmem_cache_node *),
			SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);

	kmem_cache = bootstrap(&boot_kmem_cache);
	kmem_cache_node = bootstrap(&boot_kmem_cache_node);

	/* Now we can use the kmem_cache to allocate kmalloc slabs */
	setup_kmalloc_cache_index_table();
	create_kmalloc_caches();

	/* Setup random freelists for each cache */
	init_freelist_randomization();

	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
				  slub_cpu_dead);

	pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
		cache_line_size(),
		slub_min_order, slub_max_order, slub_min_objects,
		nr_cpu_ids, nr_node_ids);
}
 8550
/*
 * Late initialisation: create the "slub_flushwq" workqueue with
 * WQ_MEM_RECLAIM and store it in flushwq. A failed allocation only
 * triggers a warning.
 */
void __init kmem_cache_init_late(void)
{
	flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
	WARN_ON(!flushwq);
}
 8556
 8557struct kmem_cache *
 8558__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
 8559		   slab_flags_t flags, void (*ctor)(void *))
 8560{
 8561	struct kmem_cache *s;
 8562
 8563	s = find_mergeable(size, align, flags, name, ctor);
 8564	if (s) {
 8565		if (sysfs_slab_alias(s, name))
 8566			pr_err("SLUB: Unable to add cache alias %s to sysfs\n",
 8567			       name);
 8568
 8569		s->refcount++;
 8570
 8571		/*
 8572		 * Adjust the object sizes so that we clear
 8573		 * the complete object on kzalloc.
 8574		 */
 8575		s->object_size = max(s->object_size, size);
 8576		s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
 8577	}
 8578
 8579	return s;
 8580}
 8581
/*
 * Initialise the cache descriptor @s for a new cache.
 *
 * @s:     descriptor to fill in
 * @name:  cache name, stored as-is in s->name
 * @size:  object size; initial value of both s->size and s->object_size
 * @args:  align, ctor, sheaf_capacity and (with CONFIG_HARDENED_USERCOPY)
 *         useroffset/usersize are read from here
 * @flags: requested flags, filtered through kmem_cache_flags()
 *
 * Returns 0 on success or a negative errno. err starts out as -EINVAL, so
 * every "goto out" that does not set it explicitly reports -EINVAL. On any
 * failure __kmem_cache_release() is called on @s before returning.
 */
int do_kmem_cache_create(struct kmem_cache *s, const char *name,
			 unsigned int size, struct kmem_cache_args *args,
			 slab_flags_t flags)
{
	int err = -EINVAL;

	s->name = name;
	s->size = s->object_size = size;

	s->flags = kmem_cache_flags(flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
	s->random = get_random_long();
#endif
	s->align = args->align;
	s->ctor = args->ctor;
#ifdef CONFIG_HARDENED_USERCOPY
	s->useroffset = args->useroffset;
	s->usersize = args->usersize;
#endif

	if (!calculate_sizes(args, s))
		goto out;
	if (disable_higher_order_debug) {
		/*
		 * Disable debugging flags that store metadata if the min slab
		 * order increased.
		 */
		if (get_order(s->size) > get_order(s->object_size)) {
			s->flags &= ~DEBUG_METADATA_FLAGS;
			s->offset = 0;
			/* Redo the layout now that the metadata flags are gone. */
			if (!calculate_sizes(args, s))
				goto out;
		}
	}

#ifdef system_has_freelist_aba
	if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
		/* Enable fast mode */
		s->flags |= __CMPXCHG_DOUBLE;
	}
#endif

	/*
	 * The larger the object size is, the more slabs we want on the partial
	 * list to avoid pounding the page allocator excessively.
	 */
	s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
	s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);

	set_cpu_partial(s);

	/*
	 * Sheaves are only set up when a capacity was requested, the build
	 * is not CONFIG_SLUB_TINY and no SLAB_DEBUG_FLAGS are set.
	 */
	if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY)
					&& !(s->flags & SLAB_DEBUG_FLAGS)) {
		s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
		if (!s->cpu_sheaves) {
			err = -ENOMEM;
			goto out;
		}
		// TODO: increase capacity to grow slab_sheaf up to next kmalloc size?
		s->sheaf_capacity = args->sheaf_capacity;
	}

#ifdef CONFIG_NUMA
	s->remote_node_defrag_ratio = 1000;
#endif

	/* Initialize the pre-computed randomized freelist if slab is up */
	if (slab_state >= UP) {
		if (init_cache_random_seq(s))
			goto out;
	}

	if (!init_kmem_cache_nodes(s))
		goto out;

	if (!alloc_kmem_cache_cpus(s))
		goto out;

	if (s->cpu_sheaves) {
		err = init_percpu_sheaves(s);
		if (err)
			goto out;
	}

	/* From here on the cache is usable; what follows is best effort. */
	err = 0;

	/* Mutex is not taken during early boot */
	if (slab_state <= UP)
		goto out;

	/*
	 * Failing to create sysfs files is not critical to SLUB functionality.
	 * If it fails, proceed with cache creation without these files.
	 */
	if (sysfs_slab_add(s))
		pr_err("SLUB: Unable to add cache %s to sysfs\n", s->name);

	if (s->flags & SLAB_STORE_USER)
		debugfs_slab_add(s);

out:
	if (err)
		__kmem_cache_release(s);
	return err;
}
 8687
 8688#ifdef SLAB_SUPPORTS_SYSFS
 8689static int count_inuse(struct slab *slab)
 8690{
 8691	return slab->inuse;
 8692}
 8693
 8694static int count_total(struct slab *slab)
 8695{
 8696	return slab->objects;
 8697}
 8698#endif
 8699
 8700#ifdef CONFIG_SLUB_DEBUG
 8701static void validate_slab(struct kmem_cache *s, struct slab *slab,
 8702			  unsigned long *obj_map)
 8703{
 8704	void *p;
 8705	void *addr = slab_address(slab);
 8706
 8707	if (!validate_slab_ptr(slab)) {
 8708		slab_err(s, slab, "Not a valid slab page");
 8709		return;
 8710	}
 8711
 8712	if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
 8713		return;
 8714
 8715	/* Now we know that a valid freelist exists */
 8716	__fill_map(obj_map, s, slab);
 8717	for_each_object(p, s, addr, slab->objects) {
 8718		u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
 8719			 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
 8720
 8721		if (!check_object(s, slab, p, val))
 8722			break;
 8723	}
 8724}
 8725
 8726static int validate_slab_node(struct kmem_cache *s,
 8727		struct kmem_cache_node *n, unsigned long *obj_map)
 8728{
 8729	unsigned long count = 0;
 8730	struct slab *slab;
 8731	unsigned long flags;
 8732
 8733	spin_lock_irqsave(&n->list_lock, flags);
 8734
 8735	list_for_each_entry(slab, &n->partial, slab_list) {
 8736		validate_slab(s, slab, obj_map);
 8737		count++;
 8738	}
 8739	if (count != n->nr_partial) {
 8740		pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
 8741		       s->name, count, n->nr_partial);
 8742		slab_add_kunit_errors();
 8743	}
 8744
 8745	if (!(s->flags & SLAB_STORE_USER))
 8746		goto out;
 8747
 8748	list_for_each_entry(slab, &n->full, slab_list) {
 8749		validate_slab(s, slab, obj_map);
 8750		count++;
 8751	}
 8752	if (count != node_nr_slabs(n)) {
 8753		pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
 8754		       s->name, count, node_nr_slabs(n));
 8755		slab_add_kunit_errors();
 8756	}
 8757
 8758out:
 8759	spin_unlock_irqrestore(&n->list_lock, flags);
 8760	return count;
 8761}
 8762
 8763long validate_slab_cache(struct kmem_cache *s)
 8764{
 8765	int node;
 8766	unsigned long count = 0;
 8767	struct kmem_cache_node *n;
 8768	unsigned long *obj_map;
 8769
 8770	obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
 8771	if (!obj_map)
 8772		return -ENOMEM;
 8773
 8774	flush_all(s);
 8775	for_each_kmem_cache_node(s, node, n)
 8776		count += validate_slab_node(s, n, obj_map);
 8777
 8778	bitmap_free(obj_map);
 8779
 8780	return count;
 8781}
 8782EXPORT_SYMBOL(validate_slab_cache);
 8783
 8784#ifdef CONFIG_DEBUG_FS
/*
 * Generate lists of code addresses where slabcache objects are allocated
 * and freed.
 */

/*
 * One aggregated entry of a loc_track table. Entries are keyed by
 * (addr, handle, waste); see add_location() for how they are filled in.
 */
struct location {
	depot_stack_handle_t handle;	/* stack depot handle from the track (0 without CONFIG_STACKDEPOT) */
	unsigned long count;		/* number of tracks merged into this entry */
	unsigned long addr;		/* track->addr */
	unsigned long waste;		/* s->object_size minus the original request size */
	long long sum_time;		/* sum of the ages (jiffies - track->when) */
	long min_time;			/* smallest age seen */
	long max_time;			/* largest age seen */
	long min_pid;			/* smallest track->pid seen */
	long max_pid;			/* largest track->pid seen */
	DECLARE_BITMAP(cpus, NR_CPUS);	/* every track->cpu seen */
	nodemask_t nodes;		/* nodes of the pages holding the tracks */
};
 8803
/* Growable table of struct location, kept sorted by add_location(). */
struct loc_track {
	unsigned long max;	/* capacity of loc[] in entries */
	unsigned long count;	/* entries currently in use */
	struct location *loc;	/* the table, allocated with __get_free_pages() */
	loff_t idx;		/* NOTE(review): not used in this part of the file; presumably an iteration cursor */
};

/* NOTE(review): presumably the root debugfs directory for slab; set up elsewhere. */
static struct dentry *slab_debugfs_root;
 8812
 8813static void free_loc_track(struct loc_track *t)
 8814{
 8815	if (t->max)
 8816		free_pages((unsigned long)t->loc,
 8817			get_order(sizeof(struct location) * t->max));
 8818}
 8819
 8820static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
 8821{
 8822	struct location *l;
 8823	int order;
 8824
 8825	order = get_order(sizeof(struct location) * max);
 8826
 8827	l = (void *)__get_free_pages(flags, order);
 8828	if (!l)
 8829		return 0;
 8830
 8831	if (t->count) {
 8832		memcpy(l, t->loc, sizeof(struct location) * t->count);
 8833		free_loc_track(t);
 8834	}
 8835	t->max = max;
 8836	t->loc = l;
 8837	return 1;
 8838}
 8839
/*
 * Account one track record in the location table @t.
 *
 * @t:         table to update
 * @s:         cache the object belongs to
 * @track:     the alloc or free track of the object
 * @orig_size: originally requested size, used to compute the waste
 *
 * The table is searched by bisection on the key (addr, handle, waste). A
 * matching entry has its statistics updated; otherwise a new entry is
 * inserted at the position where the search ended, growing the table
 * (doubling t->max, GFP_ATOMIC) when it is full.
 *
 * Returns 1 on success, 0 if the table could not be grown.
 */
static int add_location(struct loc_track *t, struct kmem_cache *s,
				const struct track *track,
				unsigned int orig_size)
{
	long start, end, pos;
	struct location *l;
	unsigned long caddr, chandle, cwaste;
	unsigned long age = jiffies - track->when;
	depot_stack_handle_t handle = 0;
	unsigned int waste = s->object_size - orig_size;

#ifdef CONFIG_STACKDEPOT
	handle = READ_ONCE(track->handle);
#endif
	start = -1;
	end = t->count;

	for ( ; ; ) {
		pos = start + (end - start + 1) / 2;

		/*
		 * There is nothing at "end". If we end up there
		 * we need to add something to before end.
		 */
		if (pos == end)
			break;

		l = &t->loc[pos];
		caddr = l->addr;
		chandle = l->handle;
		cwaste = l->waste;
		if ((track->addr == caddr) && (handle == chandle) &&
			(waste == cwaste)) {

			/* Exact key match: fold this track into the entry. */
			l->count++;
			if (track->when) {
				l->sum_time += age;
				if (age < l->min_time)
					l->min_time = age;
				if (age > l->max_time)
					l->max_time = age;

				if (track->pid < l->min_pid)
					l->min_pid = track->pid;
				if (track->pid > l->max_pid)
					l->max_pid = track->pid;

				cpumask_set_cpu(track->cpu,
						to_cpumask(l->cpus));
			}
			node_set(page_to_nid(virt_to_page(track)), l->nodes);
			return 1;
		}

		/* Narrow the range: compare addr, then handle, then waste. */
		if (track->addr < caddr)
			end = pos;
		else if (track->addr == caddr && handle < chandle)
			end = pos;
		else if (track->addr == caddr && handle == chandle &&
				waste < cwaste)
			end = pos;
		else
			start = pos;
	}

	/*
	 * Not found. Insert new tracking element.
	 */
	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
		return 0;

	/* Computed after the possible reallocation, so t->loc is current. */
	l = t->loc + pos;
	if (pos < t->count)
		memmove(l + 1, l,
			(t->count - pos) * sizeof(struct location));
	t->count++;
	l->count = 1;
	l->addr = track->addr;
	l->sum_time = age;
	l->min_time = age;
	l->max_time = age;
	l->min_pid = track->pid;
	l->max_pid = track->pid;
	l->handle = handle;
	l->waste = waste;
	cpumask_clear(to_cpumask(l->cpus));
	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
	nodes_clear(l->nodes);
	node_set(page_to_nid(virt_to_page(track)), l->nodes);
	return 1;
}
 8931
 8932static void process_slab(struct loc_track *t, struct kmem_cache *s,
 8933		struct slab *slab, enum track_item alloc,
 8934		unsigned long *obj_map)
 8935{
 8936	void *addr = slab_address(slab);
 8937	bool is_alloc = (alloc == TRACK_ALLOC);
 8938	void *p;
 8939
 8940	__fill_map(obj_map, s, slab);
 8941
 8942	for_each_object(p, s, addr, slab->objects)
 8943		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
 8944			add_location(t, s, get_track(s, p, alloc),
 8945				     is_alloc ? get_orig_size(s, p) :
 8946						s->object_size);
 8947}
 8948#endif  /* CONFIG_DEBUG_FS   */
 8949#endif	/* CONFIG_SLUB_DEBUG */
 8950
 8951#ifdef SLAB_SUPPORTS_SYSFS
/*
 * Bit positions of the selectors understood by show_slab_objects(); the
 * SO_* masks below are what callers actually pass.
 */
enum slab_stat_type {
	SL_ALL,			/* All slabs */
	SL_PARTIAL,		/* Only partially allocated slabs */
	SL_CPU,			/* Only slabs used for cpu caches */
	SL_OBJECTS,		/* Determine allocated objects not slabs */
	SL_TOTAL		/* Determine object capacity not slabs */
};

#define SO_ALL		(1 << SL_ALL)
#define SO_PARTIAL	(1 << SL_PARTIAL)
#define SO_CPU		(1 << SL_CPU)
#define SO_OBJECTS	(1 << SL_OBJECTS)
#define SO_TOTAL	(1 << SL_TOTAL)
 8965
/*
 * Common backend of the counting sysfs attributes.
 *
 * @s:     cache to report on
 * @buf:   sysfs output buffer
 * @flags: combination of SO_* bits. SO_CPU, SO_ALL and SO_PARTIAL select
 *         which slabs are looked at; SO_TOTAL and SO_OBJECTS switch the
 *         unit from slabs to object capacity or objects in use.
 *
 * The output is the grand total followed, with CONFIG_NUMA, by one
 * " N<node>=<value>" item for each node with a non-zero value, then a
 * newline.
 *
 * Returns the number of bytes emitted, or -ENOMEM if the per-node scratch
 * array cannot be allocated.
 */
static ssize_t show_slab_objects(struct kmem_cache *s,
				 char *buf, unsigned long flags)
{
	unsigned long total = 0;
	int node;
	int x;
	unsigned long *nodes;
	int len = 0;

	nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	if (flags & SO_CPU) {
		int cpu;

		for_each_possible_cpu(cpu) {
			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
							       cpu);
			int node;
			struct slab *slab;

			slab = READ_ONCE(c->slab);
			if (!slab)
				continue;

			node = slab_nid(slab);
			if (flags & SO_TOTAL)
				x = slab->objects;
			else if (flags & SO_OBJECTS)
				x = slab->inuse;
			else
				x = 1;

			total += x;
			nodes[node] += x;

#ifdef CONFIG_SLUB_CPU_PARTIAL
			slab = slub_percpu_partial_read_once(c);
			if (slab) {
				node = slab_nid(slab);
				/*
				 * Only a slab count is available here; the
				 * object-based selectors trigger a warning
				 * and x keeps its previous value.
				 */
				if (flags & SO_TOTAL)
					WARN_ON_ONCE(1);
				else if (flags & SO_OBJECTS)
					WARN_ON_ONCE(1);
				else
					x = data_race(slab->slabs);
				total += x;
				nodes[node] += x;
			}
#endif
		}
	}

	/*
	 * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
	 * already held which will conflict with an existing lock order:
	 *
	 * mem_hotplug_lock->slab_mutex->kernfs_mutex
	 *
	 * We don't really need mem_hotplug_lock (to hold off
	 * slab_mem_going_offline_callback) here because slab's memory hot
	 * unplug code doesn't destroy the kmem_cache->node[] data.
	 */

	/* SO_ALL is honoured only with CONFIG_SLUB_DEBUG and wins over SO_PARTIAL. */
#ifdef CONFIG_SLUB_DEBUG
	if (flags & SO_ALL) {
		struct kmem_cache_node *n;

		for_each_kmem_cache_node(s, node, n) {

			if (flags & SO_TOTAL)
				x = node_nr_objs(n);
			else if (flags & SO_OBJECTS)
				x = node_nr_objs(n) - count_partial(n, count_free);
			else
				x = node_nr_slabs(n);
			total += x;
			nodes[node] += x;
		}

	} else
#endif
	if (flags & SO_PARTIAL) {
		struct kmem_cache_node *n;

		for_each_kmem_cache_node(s, node, n) {
			if (flags & SO_TOTAL)
				x = count_partial(n, count_total);
			else if (flags & SO_OBJECTS)
				x = count_partial(n, count_inuse);
			else
				x = n->nr_partial;
			total += x;
			nodes[node] += x;
		}
	}

	len += sysfs_emit_at(buf, len, "%lu", total);
#ifdef CONFIG_NUMA
	for (node = 0; node < nr_node_ids; node++) {
		if (nodes[node])
			len += sysfs_emit_at(buf, len, " N%d=%lu",
					     node, nodes[node]);
	}
#endif
	len += sysfs_emit_at(buf, len, "\n");
	kfree(nodes);

	return len;
}
 9077
/* Map an embedded attribute / kobject back to its containing structure. */
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)

/* A sysfs attribute of a cache together with its typed show/store handlers. */
struct slab_attribute {
	struct attribute attr;
	ssize_t (*show)(struct kmem_cache *s, char *buf);
	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
};

/* Define <_name>_attr as a read-only attribute with mode 0400. */
#define SLAB_ATTR_RO(_name) \
	static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)

/* Define <_name>_attr as a read-write attribute with mode 0600. */
#define SLAB_ATTR(_name) \
	static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
 9092
 9093static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
 9094{
 9095	return sysfs_emit(buf, "%u\n", s->size);
 9096}
 9097SLAB_ATTR_RO(slab_size);
 9098
 9099static ssize_t align_show(struct kmem_cache *s, char *buf)
 9100{
 9101	return sysfs_emit(buf, "%u\n", s->align);
 9102}
 9103SLAB_ATTR_RO(align);
 9104
 9105static ssize_t object_size_show(struct kmem_cache *s, char *buf)
 9106{
 9107	return sysfs_emit(buf, "%u\n", s->object_size);
 9108}
 9109SLAB_ATTR_RO(object_size);
 9110
 9111static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
 9112{
 9113	return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
 9114}
 9115SLAB_ATTR_RO(objs_per_slab);
 9116
 9117static ssize_t order_show(struct kmem_cache *s, char *buf)
 9118{
 9119	return sysfs_emit(buf, "%u\n", oo_order(s->oo));
 9120}
 9121SLAB_ATTR_RO(order);
 9122
 9123static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf)
 9124{
 9125	return sysfs_emit(buf, "%u\n", s->sheaf_capacity);
 9126}
 9127SLAB_ATTR_RO(sheaf_capacity);
 9128
 9129static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
 9130{
 9131	return sysfs_emit(buf, "%lu\n", s->min_partial);
 9132}
 9133
 9134static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
 9135				 size_t length)
 9136{
 9137	unsigned long min;
 9138	int err;
 9139
 9140	err = kstrtoul(buf, 10, &min);
 9141	if (err)
 9142		return err;
 9143
 9144	s->min_partial = min;
 9145	return length;
 9146}
 9147SLAB_ATTR(min_partial);
 9148
 9149static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
 9150{
 9151	unsigned int nr_partial = 0;
 9152#ifdef CONFIG_SLUB_CPU_PARTIAL
 9153	nr_partial = s->cpu_partial;
 9154#endif
 9155
 9156	return sysfs_emit(buf, "%u\n", nr_partial);
 9157}
 9158
 9159static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
 9160				 size_t length)
 9161{
 9162	unsigned int objects;
 9163	int err;
 9164
 9165	err = kstrtouint(buf, 10, &objects);
 9166	if (err)
 9167		return err;
 9168	if (objects && !kmem_cache_has_cpu_partial(s))
 9169		return -EINVAL;
 9170
 9171	slub_set_cpu_partial(s, objects);
 9172	flush_all(s);
 9173	return length;
 9174}
 9175SLAB_ATTR(cpu_partial);
 9176
 9177static ssize_t ctor_show(struct kmem_cache *s, char *buf)
 9178{
 9179	if (!s->ctor)
 9180		return 0;
 9181	return sysfs_emit(buf, "%pS\n", s->ctor);
 9182}
 9183SLAB_ATTR_RO(ctor);
 9184
 9185static ssize_t aliases_show(struct kmem_cache *s, char *buf)
 9186{
 9187	return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
 9188}
 9189SLAB_ATTR_RO(aliases);
 9190
 9191static ssize_t partial_show(struct kmem_cache *s, char *buf)
 9192{
 9193	return show_slab_objects(s, buf, SO_PARTIAL);
 9194}
 9195SLAB_ATTR_RO(partial);
 9196
 9197static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
 9198{
 9199	return show_slab_objects(s, buf, SO_CPU);
 9200}
 9201SLAB_ATTR_RO(cpu_slabs);
 9202
 9203static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
 9204{
 9205	return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
 9206}
 9207SLAB_ATTR_RO(objects_partial);
 9208
/*
 * Report the per-cpu partial slabs as "<objects>(<slabs>)", followed by
 * one " C<cpu>=<objects>(<slabs>)" item for each online CPU that has a
 * per-cpu partial list.
 *
 * Object counts are estimates (half of a slab's capacity per slab).
 * Without CONFIG_SLUB_CPU_PARTIAL the output is always "0(0)".
 */
static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
{
	int objects = 0;
	int slabs = 0;
	int cpu __maybe_unused;
	int len = 0;

	/* First pass: total number of slabs over all online CPUs. */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	for_each_online_cpu(cpu) {
		struct slab *slab;

		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));

		if (slab)
			slabs += data_race(slab->slabs);
	}
#endif

	/* Approximate half-full slabs, see slub_set_cpu_partial() */
	objects = (slabs * oo_objects(s->oo)) / 2;
	len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);

	/* Second pass: per-CPU breakdown; the lists are read again here. */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	for_each_online_cpu(cpu) {
		struct slab *slab;

		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
		if (slab) {
			slabs = data_race(slab->slabs);
			objects = (slabs * oo_objects(s->oo)) / 2;
			len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
					     cpu, objects, slabs);
		}
	}
#endif
	len += sysfs_emit_at(buf, len, "\n");

	return len;
}
SLAB_ATTR_RO(slabs_cpu_partial);
 9249
 9250static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
 9251{
 9252	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
 9253}
 9254SLAB_ATTR_RO(reclaim_account);
 9255
 9256static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
 9257{
 9258	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
 9259}
 9260SLAB_ATTR_RO(hwcache_align);
 9261
 9262#ifdef CONFIG_ZONE_DMA
 9263static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
 9264{
 9265	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
 9266}
 9267SLAB_ATTR_RO(cache_dma);
 9268#endif
 9269
 9270#ifdef CONFIG_HARDENED_USERCOPY
 9271static ssize_t usersize_show(struct kmem_cache *s, char *buf)
 9272{
 9273	return sysfs_emit(buf, "%u\n", s->usersize);
 9274}
 9275SLAB_ATTR_RO(usersize);
 9276#endif
 9277
 9278static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
 9279{
 9280	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
 9281}
 9282SLAB_ATTR_RO(destroy_by_rcu);
 9283
 9284#ifdef CONFIG_SLUB_DEBUG
 9285static ssize_t slabs_show(struct kmem_cache *s, char *buf)
 9286{
 9287	return show_slab_objects(s, buf, SO_ALL);
 9288}
 9289SLAB_ATTR_RO(slabs);
 9290
 9291static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
 9292{
 9293	return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
 9294}
 9295SLAB_ATTR_RO(total_objects);
 9296
 9297static ssize_t objects_show(struct kmem_cache *s, char *buf)
 9298{
 9299	return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
 9300}
 9301SLAB_ATTR_RO(objects);
 9302
 9303static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
 9304{
 9305	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
 9306}
 9307SLAB_ATTR_RO(sanity_checks);
 9308
 9309static ssize_t trace_show(struct kmem_cache *s, char *buf)
 9310{
 9311	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
 9312}
 9313SLAB_ATTR_RO(trace);
 9314
 9315static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
 9316{
 9317	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
 9318}
 9319
 9320SLAB_ATTR_RO(red_zone);
 9321
 9322static ssize_t poison_show(struct kmem_cache *s, char *buf)
 9323{
 9324	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
 9325}
 9326
 9327SLAB_ATTR_RO(poison);
 9328
 9329static ssize_t store_user_show(struct kmem_cache *s, char *buf)
 9330{
 9331	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
 9332}
 9333
 9334SLAB_ATTR_RO(store_user);
 9335
 9336static ssize_t validate_show(struct kmem_cache *s, char *buf)
 9337{
 9338	return 0;
 9339}
 9340
 9341static ssize_t validate_store(struct kmem_cache *s,
 9342			const char *buf, size_t length)
 9343{
 9344	int ret = -EINVAL;
 9345
 9346	if (buf[0] == '1' && kmem_cache_debug(s)) {
 9347		ret = validate_slab_cache(s);
 9348		if (ret >= 0)
 9349			ret = length;
 9350	}
 9351	return ret;
 9352}
 9353SLAB_ATTR(validate);
 9354
 9355#endif /* CONFIG_SLUB_DEBUG */
 9356
 9357#ifdef CONFIG_FAILSLAB
 9358static ssize_t failslab_show(struct kmem_cache *s, char *buf)
 9359{
 9360	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
 9361}
 9362
 9363static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
 9364				size_t length)
 9365{
 9366	if (s->refcount > 1)
 9367		return -EINVAL;
 9368
 9369	if (buf[0] == '1')
 9370		WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB);
 9371	else
 9372		WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB);
 9373
 9374	return length;
 9375}
 9376SLAB_ATTR(failslab);
 9377#endif
 9378
 9379static ssize_t shrink_show(struct kmem_cache *s, char *buf)
 9380{
 9381	return 0;
 9382}
 9383
 9384static ssize_t shrink_store(struct kmem_cache *s,
 9385			const char *buf, size_t length)
 9386{
 9387	if (buf[0] == '1')
 9388		kmem_cache_shrink(s);
 9389	else
 9390		return -EINVAL;
 9391	return length;
 9392}
 9393SLAB_ATTR(shrink);
 9394
 9395#ifdef CONFIG_NUMA
 9396static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
 9397{
 9398	return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
 9399}
 9400
 9401static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
 9402				const char *buf, size_t length)
 9403{
 9404	unsigned int ratio;
 9405	int err;
 9406
 9407	err = kstrtouint(buf, 10, &ratio);
 9408	if (err)
 9409		return err;
 9410	if (ratio > 100)
 9411		return -ERANGE;
 9412
 9413	s->remote_node_defrag_ratio = ratio * 10;
 9414
 9415	return length;
 9416}
 9417SLAB_ATTR(remote_node_defrag_ratio);
 9418#endif
 9419
 9420#ifdef CONFIG_SLUB_STATS
 9421static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
 9422{
 9423	unsigned long sum  = 0;
 9424	int cpu;
 9425	int len = 0;
 9426	int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
 9427
 9428	if (!data)
 9429		return -ENOMEM;
 9430
 9431	for_each_online_cpu(cpu) {
 9432		unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
 9433
 9434		data[cpu] = x;
 9435		sum += x;
 9436	}
 9437
 9438	len += sysfs_emit_at(buf, len, "%lu", sum);
 9439
 9440#ifdef CONFIG_SMP
 9441	for_each_online_cpu(cpu) {
 9442		if (data[cpu])
 9443			len += sysfs_emit_at(buf, len, " C%d=%u",
 9444					     cpu, data[cpu]);
 9445	}
 9446#endif
 9447	kfree(data);
 9448	len += sysfs_emit_at(buf, len, "\n");
 9449
 9450	return len;
 9451}
 9452
 9453static void clear_stat(struct kmem_cache *s, enum stat_item si)
 9454{
 9455	int cpu;
 9456
 9457	for_each_online_cpu(cpu)
 9458		per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
 9459}
 9460
 9461#define STAT_ATTR(si, text) 					\
 9462static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
 9463{								\
 9464	return show_stat(s, buf, si);				\
 9465}								\
 9466static ssize_t text##_store(struct kmem_cache *s,		\
 9467				const char *buf, size_t length)	\
 9468{								\
 9469	if (buf[0] != '0')					\
 9470		return -EINVAL;					\
 9471	clear_stat(s, si);					\
 9472	return length;						\
 9473}								\
 9474SLAB_ATTR(text);						\
 9475
/*
 * One STAT_ATTR() instantiation per statistics counter. Each line defines
 * <name>_show() and <name>_store() for the given stat_item and passes
 * <name> to SLAB_ATTR(); the resulting <name>_attr objects are listed in
 * slab_attrs[] under CONFIG_SLUB_STATS.
 */
STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf);
STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
STAT_ATTR(FREE_PCS, free_cpu_sheaf);
STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf);
STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail);
STAT_ATTR(FREE_FASTPATH, free_fastpath);
STAT_ATTR(FREE_SLOWPATH, free_slowpath);
STAT_ATTR(FREE_FROZEN, free_frozen);
STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
STAT_ATTR(ALLOC_SLAB, alloc_slab);
STAT_ATTR(ALLOC_REFILL, alloc_refill);
STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
STAT_ATTR(FREE_SLAB, free_slab);
STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
STAT_ATTR(SHEAF_FLUSH, sheaf_flush);
STAT_ATTR(SHEAF_REFILL, sheaf_refill);
STAT_ATTR(SHEAF_ALLOC, sheaf_alloc);
STAT_ATTR(SHEAF_FREE, sheaf_free);
STAT_ATTR(BARN_GET, barn_get);
STAT_ATTR(BARN_GET_FAIL, barn_get_fail);
STAT_ATTR(BARN_PUT, barn_put);
STAT_ATTR(BARN_PUT_FAIL, barn_put_fail);
STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast);
STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow);
STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize);
STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast);
STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow);
 9519#endif	/* CONFIG_SLUB_STATS */
 9520
 9521#ifdef CONFIG_KFENCE
 9522static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf)
 9523{
 9524	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE));
 9525}
 9526
 9527static ssize_t skip_kfence_store(struct kmem_cache *s,
 9528			const char *buf, size_t length)
 9529{
 9530	int ret = length;
 9531
 9532	if (buf[0] == '0')
 9533		s->flags &= ~SLAB_SKIP_KFENCE;
 9534	else if (buf[0] == '1')
 9535		s->flags |= SLAB_SKIP_KFENCE;
 9536	else
 9537		ret = -EINVAL;
 9538
 9539	return ret;
 9540}
 9541SLAB_ATTR(skip_kfence);
 9542#endif
 9543
/*
 * NULL-terminated list of the attributes grouped in slab_attr_group, which
 * sysfs_slab_add() creates on each cache's kobject. Entries inside an
 * #ifdef are present only when that configuration option is enabled.
 */
static struct attribute *slab_attrs[] = {
	&slab_size_attr.attr,
	&object_size_attr.attr,
	&objs_per_slab_attr.attr,
	&order_attr.attr,
	&sheaf_capacity_attr.attr,
	&min_partial_attr.attr,
	&cpu_partial_attr.attr,
	&objects_partial_attr.attr,
	&partial_attr.attr,
	&cpu_slabs_attr.attr,
	&ctor_attr.attr,
	&aliases_attr.attr,
	&align_attr.attr,
	&hwcache_align_attr.attr,
	&reclaim_account_attr.attr,
	&destroy_by_rcu_attr.attr,
	&shrink_attr.attr,
	&slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
	&total_objects_attr.attr,
	&objects_attr.attr,
	&slabs_attr.attr,
	&sanity_checks_attr.attr,
	&trace_attr.attr,
	&red_zone_attr.attr,
	&poison_attr.attr,
	&store_user_attr.attr,
	&validate_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
	&cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
	&remote_node_defrag_ratio_attr.attr,
#endif
#ifdef CONFIG_SLUB_STATS
	&alloc_cpu_sheaf_attr.attr,
	&alloc_fastpath_attr.attr,
	&alloc_slowpath_attr.attr,
	&free_cpu_sheaf_attr.attr,
	&free_rcu_sheaf_attr.attr,
	&free_rcu_sheaf_fail_attr.attr,
	&free_fastpath_attr.attr,
	&free_slowpath_attr.attr,
	&free_frozen_attr.attr,
	&free_add_partial_attr.attr,
	&free_remove_partial_attr.attr,
	&alloc_from_partial_attr.attr,
	&alloc_slab_attr.attr,
	&alloc_refill_attr.attr,
	&alloc_node_mismatch_attr.attr,
	&free_slab_attr.attr,
	&cpuslab_flush_attr.attr,
	&deactivate_full_attr.attr,
	&deactivate_empty_attr.attr,
	&deactivate_to_head_attr.attr,
	&deactivate_to_tail_attr.attr,
	&deactivate_remote_frees_attr.attr,
	&deactivate_bypass_attr.attr,
	&order_fallback_attr.attr,
	&cmpxchg_double_fail_attr.attr,
	&cmpxchg_double_cpu_fail_attr.attr,
	&cpu_partial_alloc_attr.attr,
	&cpu_partial_free_attr.attr,
	&cpu_partial_node_attr.attr,
	&cpu_partial_drain_attr.attr,
	&sheaf_flush_attr.attr,
	&sheaf_refill_attr.attr,
	&sheaf_alloc_attr.attr,
	&sheaf_free_attr.attr,
	&barn_get_attr.attr,
	&barn_get_fail_attr.attr,
	&barn_put_attr.attr,
	&barn_put_fail_attr.attr,
	&sheaf_prefill_fast_attr.attr,
	&sheaf_prefill_slow_attr.attr,
	&sheaf_prefill_oversize_attr.attr,
	&sheaf_return_fast_attr.attr,
	&sheaf_return_slow_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
	&failslab_attr.attr,
#endif
#ifdef CONFIG_HARDENED_USERCOPY
	&usersize_attr.attr,
#endif
#ifdef CONFIG_KFENCE
	&skip_kfence_attr.attr,
#endif

	NULL
};

/* Attribute group wrapping slab_attrs[]; passed to sysfs_create_group(). */
static const struct attribute_group slab_attr_group = {
	.attrs = slab_attrs,
};
 9641
 9642static ssize_t slab_attr_show(struct kobject *kobj,
 9643				struct attribute *attr,
 9644				char *buf)
 9645{
 9646	struct slab_attribute *attribute;
 9647	struct kmem_cache *s;
 9648
 9649	attribute = to_slab_attr(attr);
 9650	s = to_slab(kobj);
 9651
 9652	if (!attribute->show)
 9653		return -EIO;
 9654
 9655	return attribute->show(s, buf);
 9656}
 9657
 9658static ssize_t slab_attr_store(struct kobject *kobj,
 9659				struct attribute *attr,
 9660				const char *buf, size_t len)
 9661{
 9662	struct slab_attribute *attribute;
 9663	struct kmem_cache *s;
 9664
 9665	attribute = to_slab_attr(attr);
 9666	s = to_slab(kobj);
 9667
 9668	if (!attribute->store)
 9669		return -EIO;
 9670
 9671	return attribute->store(s, buf, len);
 9672}
 9673
 9674static void kmem_cache_release(struct kobject *k)
 9675{
 9676	slab_kmem_cache_release(to_slab(k));
 9677}
 9678
/* Routes sysfs reads and writes on slab kobjects to the per-attribute handlers. */
static const struct sysfs_ops slab_sysfs_ops = {
	.show = slab_attr_show,
	.store = slab_attr_store,
};

/* kobject type used for every cache's kobject in sysfs_slab_add(). */
static const struct kobj_type slab_ktype = {
	.sysfs_ops = &slab_sysfs_ops,
	.release = kmem_cache_release,
};

/* The "slab" kset; created by slab_sysfs_init(). */
static struct kset *slab_kset;

/* Returns the kset a cache's kobject belongs to; currently always slab_kset. */
static inline struct kset *cache_kset(struct kmem_cache *s)
{
	return slab_kset;
}
 9695
 9696#define ID_STR_LENGTH 32
 9697
 9698/* Create a unique string id for a slab cache:
 9699 *
 9700 * Format	:[flags-]size
 9701 */
 9702static char *create_unique_id(struct kmem_cache *s)
 9703{
 9704	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
 9705	char *p = name;
 9706
 9707	if (!name)
 9708		return ERR_PTR(-ENOMEM);
 9709
 9710	*p++ = ':';
 9711	/*
 9712	 * First flags affecting slabcache operations. We will only
 9713	 * get here for aliasable slabs so we do not need to support
 9714	 * too many flags. The flags here must cover all flags that
 9715	 * are matched during merging to guarantee that the id is
 9716	 * unique.
 9717	 */
 9718	if (s->flags & SLAB_CACHE_DMA)
 9719		*p++ = 'd';
 9720	if (s->flags & SLAB_CACHE_DMA32)
 9721		*p++ = 'D';
 9722	if (s->flags & SLAB_RECLAIM_ACCOUNT)
 9723		*p++ = 'a';
 9724	if (s->flags & SLAB_CONSISTENCY_CHECKS)
 9725		*p++ = 'F';
 9726	if (s->flags & SLAB_ACCOUNT)
 9727		*p++ = 'A';
 9728	if (p != name + 1)
 9729		*p++ = '-';
 9730	p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size);
 9731
 9732	if (WARN_ON(p > name + ID_STR_LENGTH - 1)) {
 9733		kfree(name);
 9734		return ERR_PTR(-EINVAL);
 9735	}
 9736	kmsan_unpoison_memory(name, p - name);
 9737	return name;
 9738}
 9739
 9740static int sysfs_slab_add(struct kmem_cache *s)
 9741{
 9742	int err;
 9743	const char *name;
 9744	struct kset *kset = cache_kset(s);
 9745	int unmergeable = slab_unmergeable(s);
 9746
 9747	if (!unmergeable && disable_higher_order_debug &&
 9748			(slub_debug & DEBUG_METADATA_FLAGS))
 9749		unmergeable = 1;
 9750
 9751	if (unmergeable) {
 9752		/*
 9753		 * Slabcache can never be merged so we can use the name proper.
 9754		 * This is typically the case for debug situations. In that
 9755		 * case we can catch duplicate names easily.
 9756		 */
 9757		sysfs_remove_link(&slab_kset->kobj, s->name);
 9758		name = s->name;
 9759	} else {
 9760		/*
 9761		 * Create a unique name for the slab as a target
 9762		 * for the symlinks.
 9763		 */
 9764		name = create_unique_id(s);
 9765		if (IS_ERR(name))
 9766			return PTR_ERR(name);
 9767	}
 9768
 9769	s->kobj.kset = kset;
 9770	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
 9771	if (err)
 9772		goto out;
 9773
 9774	err = sysfs_create_group(&s->kobj, &slab_attr_group);
 9775	if (err)
 9776		goto out_del_kobj;
 9777
 9778	if (!unmergeable) {
 9779		/* Setup first alias */
 9780		sysfs_slab_alias(s, s->name);
 9781	}
 9782out:
 9783	if (!unmergeable)
 9784		kfree(name);
 9785	return err;
 9786out_del_kobj:
 9787	kobject_del(&s->kobj);
 9788	goto out;
 9789}
 9790
 9791void sysfs_slab_unlink(struct kmem_cache *s)
 9792{
 9793	if (s->kobj.state_in_sysfs)
 9794		kobject_del(&s->kobj);
 9795}
 9796
 9797void sysfs_slab_release(struct kmem_cache *s)
 9798{
 9799	kobject_put(&s->kobj);
 9800}
 9801
/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 *
 * Entries are pushed by sysfs_slab_alias() while slab_state is not FULL
 * and are replayed and freed by slab_sysfs_init().
 */
struct saved_alias {
	struct kmem_cache *s;		/* cache the alias points at */
	const char *name;		/* name of the alias link */
	struct saved_alias *next;	/* next buffered alias, or NULL */
};

/* Head of the singly linked list of buffered aliases. */
static struct saved_alias *alias_list;
 9813
 9814static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
 9815{
 9816	struct saved_alias *al;
 9817
 9818	if (slab_state == FULL) {
 9819		/*
 9820		 * If we have a leftover link then remove it.
 9821		 */
 9822		sysfs_remove_link(&slab_kset->kobj, name);
 9823		/*
 9824		 * The original cache may have failed to generate sysfs file.
 9825		 * In that case, sysfs_create_link() returns -ENOENT and
 9826		 * symbolic link creation is skipped.
 9827		 */
 9828		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
 9829	}
 9830
 9831	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
 9832	if (!al)
 9833		return -ENOMEM;
 9834
 9835	al->s = s;
 9836	al->name = name;
 9837	al->next = alias_list;
 9838	alias_list = al;
 9839	kmsan_unpoison_memory(al, sizeof(*al));
 9840	return 0;
 9841}
 9842
 9843static int __init slab_sysfs_init(void)
 9844{
 9845	struct kmem_cache *s;
 9846	int err;
 9847
 9848	mutex_lock(&slab_mutex);
 9849
 9850	slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
 9851	if (!slab_kset) {
 9852		mutex_unlock(&slab_mutex);
 9853		pr_err("Cannot register slab subsystem.\n");
 9854		return -ENOMEM;
 9855	}
 9856
 9857	slab_state = FULL;
 9858
 9859	list_for_each_entry(s, &slab_caches, list) {
 9860		err = sysfs_slab_add(s);
 9861		if (err)
 9862			pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
 9863			       s->name);
 9864	}
 9865
 9866	while (alias_list) {
 9867		struct saved_alias *al = alias_list;
 9868
 9869		alias_list = alias_list->next;
 9870		err = sysfs_slab_alias(al->s, al->name);
 9871		if (err)
 9872			pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
 9873			       al->name);
 9874		kfree(al);
 9875	}
 9876
 9877	mutex_unlock(&slab_mutex);
 9878	return 0;
 9879}
 9880late_initcall(slab_sysfs_init);
 9881#endif /* SLAB_SUPPORTS_SYSFS */
 9882
 9883#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
 9884static int slab_debugfs_show(struct seq_file *seq, void *v)
 9885{
 9886	struct loc_track *t = seq->private;
 9887	struct location *l;
 9888	unsigned long idx;
 9889
 9890	idx = (unsigned long) t->idx;
 9891	if (idx < t->count) {
 9892		l = &t->loc[idx];
 9893
 9894		seq_printf(seq, "%7ld ", l->count);
 9895
 9896		if (l->addr)
 9897			seq_printf(seq, "%pS", (void *)l->addr);
 9898		else
 9899			seq_puts(seq, "<not-available>");
 9900
 9901		if (l->waste)
 9902			seq_printf(seq, " waste=%lu/%lu",
 9903				l->count * l->waste, l->waste);
 9904
 9905		if (l->sum_time != l->min_time) {
 9906			seq_printf(seq, " age=%ld/%llu/%ld",
 9907				l->min_time, div_u64(l->sum_time, l->count),
 9908				l->max_time);
 9909		} else
 9910			seq_printf(seq, " age=%ld", l->min_time);
 9911
 9912		if (l->min_pid != l->max_pid)
 9913			seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
 9914		else
 9915			seq_printf(seq, " pid=%ld",
 9916				l->min_pid);
 9917
 9918		if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
 9919			seq_printf(seq, " cpus=%*pbl",
 9920				 cpumask_pr_args(to_cpumask(l->cpus)));
 9921
 9922		if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
 9923			seq_printf(seq, " nodes=%*pbl",
 9924				 nodemask_pr_args(&l->nodes));
 9925
 9926#ifdef CONFIG_STACKDEPOT
 9927		{
 9928			depot_stack_handle_t handle;
 9929			unsigned long *entries;
 9930			unsigned int nr_entries, j;
 9931
 9932			handle = READ_ONCE(l->handle);
 9933			if (handle) {
 9934				nr_entries = stack_depot_fetch(handle, &entries);
 9935				seq_puts(seq, "\n");
 9936				for (j = 0; j < nr_entries; j++)
 9937					seq_printf(seq, "        %pS\n", (void *)entries[j]);
 9938			}
 9939		}
 9940#endif
 9941		seq_puts(seq, "\n");
 9942	}
 9943
 9944	if (!idx && !t->count)
 9945		seq_puts(seq, "No data\n");
 9946
 9947	return 0;
 9948}
 9949
 9950static void slab_debugfs_stop(struct seq_file *seq, void *v)
 9951{
 9952}
 9953
 9954static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
 9955{
 9956	struct loc_track *t = seq->private;
 9957
 9958	t->idx = ++(*ppos);
 9959	if (*ppos <= t->count)
 9960		return ppos;
 9961
 9962	return NULL;
 9963}
 9964
 9965static int cmp_loc_by_count(const void *a, const void *b)
 9966{
 9967	struct location *loc1 = (struct location *)a;
 9968	struct location *loc2 = (struct location *)b;
 9969
 9970	return cmp_int(loc2->count, loc1->count);
 9971}
 9972
 9973static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
 9974{
 9975	struct loc_track *t = seq->private;
 9976
 9977	t->idx = *ppos;
 9978	return ppos;
 9979}
 9980
/* seq_file iterator over the location records of a loc_track. */
static const struct seq_operations slab_debugfs_sops = {
	.start  = slab_debugfs_start,
	.next   = slab_debugfs_next,
	.stop   = slab_debugfs_stop,
	.show   = slab_debugfs_show,
};
 9987
 9988static int slab_debug_trace_open(struct inode *inode, struct file *filep)
 9989{
 9990
 9991	struct kmem_cache_node *n;
 9992	enum track_item alloc;
 9993	int node;
 9994	struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
 9995						sizeof(struct loc_track));
 9996	struct kmem_cache *s = file_inode(filep)->i_private;
 9997	unsigned long *obj_map;
 9998
 9999	if (!t)
10000		return -ENOMEM;
10001
10002	obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
10003	if (!obj_map) {
10004		seq_release_private(inode, filep);
10005		return -ENOMEM;
10006	}
10007
10008	alloc = debugfs_get_aux_num(filep);
10009
10010	if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
10011		bitmap_free(obj_map);
10012		seq_release_private(inode, filep);
10013		return -ENOMEM;
10014	}
10015
10016	for_each_kmem_cache_node(s, node, n) {
10017		unsigned long flags;
10018		struct slab *slab;
10019
10020		if (!node_nr_slabs(n))
10021			continue;
10022
10023		spin_lock_irqsave(&n->list_lock, flags);
10024		list_for_each_entry(slab, &n->partial, slab_list)
10025			process_slab(t, s, slab, alloc, obj_map);
10026		list_for_each_entry(slab, &n->full, slab_list)
10027			process_slab(t, s, slab, alloc, obj_map);
10028		spin_unlock_irqrestore(&n->list_lock, flags);
10029	}
10030
10031	/* Sort locations by count */
10032	sort(t->loc, t->count, sizeof(struct location),
10033	     cmp_loc_by_count, NULL);
10034
10035	bitmap_free(obj_map);
10036	return 0;
10037}
10038
10039static int slab_debug_trace_release(struct inode *inode, struct file *file)
10040{
10041	struct seq_file *seq = file->private_data;
10042	struct loc_track *t = seq->private;
10043
10044	free_loc_track(t);
10045	return seq_release_private(inode, file);
10046}
10047
/* File operations shared by the "alloc_traces" and "free_traces" debugfs files. */
static const struct file_operations slab_debugfs_fops = {
	.open    = slab_debug_trace_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = slab_debug_trace_release,
};
10054
10055static void debugfs_slab_add(struct kmem_cache *s)
10056{
10057	struct dentry *slab_cache_dir;
10058
10059	if (unlikely(!slab_debugfs_root))
10060		return;
10061
10062	slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
10063
10064	debugfs_create_file_aux_num("alloc_traces", 0400, slab_cache_dir, s,
10065					TRACK_ALLOC, &slab_debugfs_fops);
10066
10067	debugfs_create_file_aux_num("free_traces", 0400, slab_cache_dir, s,
10068					TRACK_FREE, &slab_debugfs_fops);
10069}
10070
10071void debugfs_slab_release(struct kmem_cache *s)
10072{
10073	debugfs_lookup_and_remove(s->name, slab_debugfs_root);
10074}
10075
10076static int __init slab_debugfs_init(void)
10077{
10078	struct kmem_cache *s;
10079
10080	slab_debugfs_root = debugfs_create_dir("slab", NULL);
10081
10082	list_for_each_entry(s, &slab_caches, list)
10083		if (s->flags & SLAB_STORE_USER)
10084			debugfs_slab_add(s);
10085
10086	return 0;
10087
10088}
10089__initcall(slab_debugfs_init);
10090#endif
10091/*
10092 * The /proc/slabinfo ABI
10093 */
10094#ifdef CONFIG_SLUB_DEBUG
10095void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
10096{
10097	unsigned long nr_slabs = 0;
10098	unsigned long nr_objs = 0;
10099	unsigned long nr_free = 0;
10100	int node;
10101	struct kmem_cache_node *n;
10102
10103	for_each_kmem_cache_node(s, node, n) {
10104		nr_slabs += node_nr_slabs(n);
10105		nr_objs += node_nr_objs(n);
10106		nr_free += count_partial_free_approx(n);
10107	}
10108
10109	sinfo->active_objs = nr_objs - nr_free;
10110	sinfo->num_objs = nr_objs;
10111	sinfo->active_slabs = nr_slabs;
10112	sinfo->num_slabs = nr_slabs;
10113	sinfo->objects_per_slab = oo_objects(s->oo);
10114	sinfo->cache_order = oo_order(s->oo);
10115}
10116#endif /* CONFIG_SLUB_DEBUG */