mm/slub.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / mm / slub.c
at master 260 kB view raw
    1// SPDX-License-Identifier: GPL-2.0
    2/*
    3 * SLUB: A slab allocator that limits cache line use instead of queuing
    4 * objects in per cpu and per node lists.
    5 *
    6 * The allocator synchronizes using per slab locks or atomic operations
    7 * and only uses a centralized lock to manage a pool of partial slabs.
    8 *
    9 * (C) 2007 SGI, Christoph Lameter
   10 * (C) 2011 Linux Foundation, Christoph Lameter
   11 */
   12
   13#include <linux/mm.h>
   14#include <linux/swap.h> /* mm_account_reclaimed_pages() */
   15#include <linux/module.h>
   16#include <linux/bit_spinlock.h>
   17#include <linux/interrupt.h>
   18#include <linux/swab.h>
   19#include <linux/bitops.h>
   20#include <linux/slab.h>
   21#include "slab.h"
   22#include <linux/vmalloc.h>
   23#include <linux/proc_fs.h>
   24#include <linux/seq_file.h>
   25#include <linux/kasan.h>
   26#include <linux/node.h>
   27#include <linux/kmsan.h>
   28#include <linux/cpu.h>
   29#include <linux/cpuset.h>
   30#include <linux/mempolicy.h>
   31#include <linux/ctype.h>
   32#include <linux/stackdepot.h>
   33#include <linux/debugobjects.h>
   34#include <linux/kallsyms.h>
   35#include <linux/kfence.h>
   36#include <linux/memory.h>
   37#include <linux/math64.h>
   38#include <linux/fault-inject.h>
   39#include <linux/kmemleak.h>
   40#include <linux/stacktrace.h>
   41#include <linux/prefetch.h>
   42#include <linux/memcontrol.h>
   43#include <linux/random.h>
   44#include <kunit/test.h>
   45#include <kunit/test-bug.h>
   46#include <linux/sort.h>
   47#include <linux/irq_work.h>
   48#include <linux/kprobes.h>
   49#include <linux/debugfs.h>
   50#include <trace/events/kmem.h>
   51
   52#include "internal.h"
   53
   54/*
   55 * Lock order:
   56 *   1. slab_mutex (Global Mutex)
   57 *   2. node->list_lock (Spinlock)
   58 *   3. kmem_cache->cpu_slab->lock (Local lock)
   59 *   4. slab_lock(slab) (Only on some arches)
   60 *   5. object_map_lock (Only for debugging)
   61 *
   62 *   slab_mutex
   63 *
   64 *   The role of the slab_mutex is to protect the list of all the slabs
   65 *   and to synchronize major metadata changes to slab cache structures.
   66 *   Also synchronizes memory hotplug callbacks.
   67 *
   68 *   slab_lock
   69 *
   70 *   The slab_lock is a wrapper around the page lock, thus it is a bit
   71 *   spinlock.
   72 *
   73 *   The slab_lock is only used on arches that do not have the ability
   74 *   to do a cmpxchg_double. It only protects:
   75 *
   76 *	A. slab->freelist	-> List of free objects in a slab
   77 *	B. slab->inuse		-> Number of objects in use
   78 *	C. slab->objects	-> Number of objects in slab
   79 *	D. slab->frozen		-> frozen state
   80 *
   81 *   Frozen slabs
   82 *
   83 *   If a slab is frozen then it is exempt from list management. It is
   84 *   the cpu slab which is actively allocated from by the processor that
   85 *   froze it and it is not on any list. The processor that froze the
   86 *   slab is the one who can perform list operations on the slab. Other
   87 *   processors may put objects onto the freelist but the processor that
   88 *   froze the slab is the only one that can retrieve the objects from the
   89 *   slab's freelist.
   90 *
   91 *   CPU partial slabs
   92 *
   93 *   The partially empty slabs cached on the CPU partial list are used
   94 *   for performance reasons, which speeds up the allocation process.
   95 *   These slabs are not frozen, but are also exempt from list management,
   96 *   by clearing the SL_partial flag when moving out of the node
   97 *   partial list. Please see __slab_free() for more details.
   98 *
   99 *   To sum up, the current scheme is:
  100 *   - node partial slab: SL_partial && !frozen
  101 *   - cpu partial slab: !SL_partial && !frozen
  102 *   - cpu slab: !SL_partial && frozen
  103 *   - full slab: !SL_partial && !frozen
  104 *
  105 *   list_lock
  106 *
  107 *   The list_lock protects the partial and full list on each node and
 *   the partial slab counter. If taken then no slabs may be added to or
 *   removed from the lists, nor may the number of partial slabs be modified.
  110 *   (Note that the total number of slabs is an atomic value that may be
  111 *   modified without taking the list lock).
  112 *
  113 *   The list_lock is a centralized lock and thus we avoid taking it as
  114 *   much as possible. As long as SLUB does not have to handle partial
  115 *   slabs, operations can continue without any centralized lock. F.e.
  116 *   allocating a long series of objects that fill up slabs does not require
  117 *   the list lock.
  118 *
  119 *   For debug caches, all allocations are forced to go through a list_lock
  120 *   protected region to serialize against concurrent validation.
  121 *
  122 *   cpu_slab->lock local lock
  123 *
 *   This lock protects slowpath manipulation of all kmem_cache_cpu fields
  125 *   except the stat counters. This is a percpu structure manipulated only by
  126 *   the local cpu, so the lock protects against being preempted or interrupted
  127 *   by an irq. Fast path operations rely on lockless operations instead.
  128 *
  129 *   On PREEMPT_RT, the local lock neither disables interrupts nor preemption
  130 *   which means the lockless fastpath cannot be used as it might interfere with
 *   in-progress slow path operations. In this case the local lock is always
  132 *   taken but it still utilizes the freelist for the common operations.
  133 *
  134 *   lockless fastpaths
  135 *
  136 *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
  137 *   are fully lockless when satisfied from the percpu slab (and when
  138 *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
  139 *   They also don't disable preemption or migration or irqs. They rely on
  140 *   the transaction id (tid) field to detect being preempted or moved to
  141 *   another cpu.
  142 *
  143 *   irq, preemption, migration considerations
  144 *
  145 *   Interrupts are disabled as part of list_lock or local_lock operations, or
  146 *   around the slab_lock operation, in order to make the slab allocator safe
  147 *   to use in the context of an irq.
  148 *
  149 *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
  150 *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
  151 *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
  152 *   doesn't have to be revalidated in each section protected by the local lock.
  153 *
  154 * SLUB assigns one slab for allocation to each processor.
  155 * Allocations only occur from these slabs called cpu slabs.
  156 *
  157 * Slabs with free elements are kept on a partial list and during regular
  158 * operations no list for full slabs is used. If an object in a full slab is
  159 * freed then the slab will show up again on the partial lists.
  160 * We track full slabs for debugging purposes though because otherwise we
  161 * cannot scan all objects.
  162 *
  163 * Slabs are freed when they become empty. Teardown and setup is
  164 * minimal so we rely on the page allocators per cpu caches for
  165 * fast frees and allocs.
  166 *
  167 * slab->frozen		The slab is frozen and exempt from list processing.
  168 * 			This means that the slab is dedicated to a purpose
  169 * 			such as satisfying allocations for a specific
  170 * 			processor. Objects may be freed in the slab while
  171 * 			it is frozen but slab_free will then skip the usual
  172 * 			list operations. It is up to the processor holding
  173 * 			the slab to integrate the slab into the slab lists
  174 * 			when the slab is no longer needed.
  175 *
  176 * 			One use of this flag is to mark slabs that are
  177 * 			used for allocations. Then such a slab becomes a cpu
  178 * 			slab. The cpu slab may be equipped with an additional
  179 * 			freelist that allows lockless access to
  180 * 			free objects in addition to the regular freelist
  181 * 			that requires the slab lock.
  182 *
  183 * SLAB_DEBUG_FLAGS	Slab requires special handling due to debug
  184 * 			options set. This moves	slab handling out of
  185 * 			the fast path and disables lockless freelists.
  186 */
  187
  188/**
  189 * enum slab_flags - How the slab flags bits are used.
  190 * @SL_locked: Is locked with slab_lock()
  191 * @SL_partial: On the per-node partial list
  192 * @SL_pfmemalloc: Was allocated from PF_MEMALLOC reserves
  193 *
  194 * The slab flags share space with the page flags but some bits have
  195 * different interpretations.  The high bits are used for information
  196 * like zone/node/section.
  197 */
enum slab_flags {
	SL_locked = PG_locked,		/* Bit taken by slab_lock()/slab_unlock() */
	SL_partial = PG_workingset,	/* Historical reasons for this bit */
	SL_pfmemalloc = PG_active,	/* Historical reasons for this bit */
};
  203
/*
 * Helpers for obtaining and releasing a percpu pointer; every
 * slub_get_cpu_ptr() has to be balanced by a slub_put_cpu_ptr().
 *
 * We could simply use migrate_disable()/enable() but as long as it's a
 * function call even on !PREEMPT_RT, use inline preempt_disable() there.
 *
 * On PREEMPT_RT the put side still evaluates its argument through a (void)
 * cast before re-enabling migration.
 *
 * USE_LOCKLESS_FAST_PATH() is a constant: true on !PREEMPT_RT, false on
 * PREEMPT_RT.
 */
#ifndef CONFIG_PREEMPT_RT
#define slub_get_cpu_ptr(var)		get_cpu_ptr(var)
#define slub_put_cpu_ptr(var)		put_cpu_ptr(var)
#define USE_LOCKLESS_FAST_PATH()	(true)
#else
#define slub_get_cpu_ptr(var)		\
({					\
	migrate_disable();		\
	this_cpu_ptr(var);		\
})
#define slub_put_cpu_ptr(var)		\
do {					\
	(void)(var);			\
	migrate_enable();		\
} while (0)
#define USE_LOCKLESS_FAST_PATH()	(false)
#endif
  225
/*
 * Inlining annotation: expands to __always_inline unless CONFIG_SLUB_TINY
 * is set, in which case it expands to nothing.
 */
#ifndef CONFIG_SLUB_TINY
#define __fastpath_inline __always_inline
#else
#define __fastpath_inline
#endif
  231
/*
 * Static key for slab debugging, only defined with CONFIG_SLUB_DEBUG.
 * Its initial state is true with CONFIG_SLUB_DEBUG_ON and false otherwise.
 */
#ifdef CONFIG_SLUB_DEBUG
#ifdef CONFIG_SLUB_DEBUG_ON
DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
#endif		/* CONFIG_SLUB_DEBUG */
  239
#ifdef CONFIG_NUMA
/* Static key, initially false; only exists on CONFIG_NUMA builds. */
static DEFINE_STATIC_KEY_FALSE(strict_numa);
#endif
  243
/* Structure holding parameters for get_partial() call chain */
struct partial_context {
	gfp_t flags;		/* gfp flags, presumably those of the allocation request */
	unsigned int orig_size;	/* presumably the originally requested size - confirm at the callers */
	void *object;		/* object pointer carried along the call chain */
};
  250
  251static inline bool kmem_cache_debug(struct kmem_cache *s)
  252{
  253	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
  254}
  255
  256void *fixup_red_left(struct kmem_cache *s, void *p)
  257{
  258	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
  259		p += s->red_left_pad;
  260
  261	return p;
  262}
  263
  264static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
  265{
  266#ifdef CONFIG_SLUB_CPU_PARTIAL
  267	return !kmem_cache_debug(s);
  268#else
  269	return false;
  270#endif
  271}
  272
  273/*
  274 * Issues still to be resolved:
  275 *
  276 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
  277 *
  278 * - Variable sizing of the per node arrays
  279 */
  280
/*
 * Enable to log cmpxchg failures: turning this #undef into a #define
 * compiles in the pr_info() calls guarded by SLUB_DEBUG_CMPXCHG.
 */
#undef SLUB_DEBUG_CMPXCHG
  283
#ifndef CONFIG_SLUB_TINY
/*
 * Minimum number of partial slabs. These will be left on the partial
 * lists even if they are empty. kmem_cache_shrink may reclaim them.
 */
#define MIN_PARTIAL 5

/*
 * Maximum number of desirable partial slabs.
 * The existence of more partial slabs makes kmem_cache_shrink
 * sort the partial list by the number of objects in use.
 */
#define MAX_PARTIAL 10
#else
/* CONFIG_SLUB_TINY: both partial slab limits are zero. */
#define MIN_PARTIAL 0
#define MAX_PARTIAL 0
#endif
  301
/*
 * Default set of debug flags; it is the initial value of slub_debug when
 * CONFIG_SLUB_DEBUG_ON is set.
 */
#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
				SLAB_POISON | SLAB_STORE_USER)

/*
 * These debug flags cannot use CMPXCHG because there might be consistency
 * issues when checking or reading debug information
 */
#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
				SLAB_TRACE)


/*
 * Debugging flags that require metadata to be stored in the slab.  These get
 * disabled when slab_debug=O is used and a cache's min order increases with
 * metadata.
 */
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
  319
/*
 * Packing of struct kmem_cache_order_objects: the page order is stored
 * from bit OO_SHIFT upwards and the object count in the low OO_SHIFT bits
 * (see oo_make(), oo_order() and oo_objects()).
 */
#define OO_SHIFT	16
#define OO_MASK		((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE	32767 /* since slab.objects is u15 */

/* Internal SLUB flags */
/* Poison object */
#define __OBJECT_POISON		__SLAB_FLAG_BIT(_SLAB_OBJECT_POISON)
/* Use cmpxchg_double */

#ifdef system_has_freelist_aba
#define __CMPXCHG_DOUBLE	__SLAB_FLAG_BIT(_SLAB_CMPXCHG_DOUBLE)
#else
/* Without system_has_freelist_aba the flag maps to __SLAB_FLAG_UNUSED. */
#define __CMPXCHG_DOUBLE	__SLAB_FLAG_UNUSED
#endif
  334
/*
 * Tracking user of a slab.
 *
 * get_orig_size()/set_orig_size() skip two of these records per object
 * when locating the stored original request size.
 */
#define TRACK_ADDRS_COUNT 16
struct track {
	unsigned long addr;	/* Called from address */
#ifdef CONFIG_STACKDEPOT
	depot_stack_handle_t handle;	/* Stack depot handle, presumably of the caller's stack trace */
#endif
	int cpu;		/* Was running on cpu */
	int pid;		/* Pid context */
	unsigned long when;	/* When did the operation occur */
};
  348
/* Kind of tracking record: one for the allocation, one for the free. */
enum track_item { TRACK_ALLOC, TRACK_FREE };
  350
/*
 * sysfs hooks: forward declarations when SLAB_SUPPORTS_SYSFS is defined,
 * otherwise stubs that do nothing and return 0.
 */
#ifdef SLAB_SUPPORTS_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
							{ return 0; }
#endif
  359
/*
 * debugfs hook: forward declaration when both CONFIG_DEBUG_FS and
 * CONFIG_SLUB_DEBUG are set, otherwise an empty stub.
 */
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
static void debugfs_slab_add(struct kmem_cache *);
#else
static inline void debugfs_slab_add(struct kmem_cache *s) { }
#endif
  365
/*
 * Event counters. With CONFIG_SLUB_STATS each kmem_cache_cpu carries a
 * stat[] array indexed by these items, updated through stat() and
 * stat_add(). NR_SLUB_STAT_ITEMS sizes that array and must stay last.
 */
enum stat_item {
	ALLOC_PCS,		/* Allocation from percpu sheaf */
	ALLOC_FASTPATH,		/* Allocation from cpu slab */
	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
	FREE_PCS,		/* Free to percpu sheaf */
	FREE_RCU_SHEAF,		/* Free to rcu_free sheaf */
	FREE_RCU_SHEAF_FAIL,	/* Failed to free to a rcu_free sheaf */
	FREE_FASTPATH,		/* Free to cpu slab */
	FREE_SLOWPATH,		/* Freeing not to cpu slab */
	FREE_FROZEN,		/* Freeing to frozen slab */
	FREE_ADD_PARTIAL,	/* Freeing moves slab to partial list */
	FREE_REMOVE_PARTIAL,	/* Freeing removes last object */
	ALLOC_FROM_PARTIAL,	/* Cpu slab acquired from node partial list */
	ALLOC_SLAB,		/* Cpu slab acquired from page allocator */
	ALLOC_REFILL,		/* Refill cpu slab from slab freelist */
	ALLOC_NODE_MISMATCH,	/* Switching cpu slab */
	FREE_SLAB,		/* Slab freed to the page allocator */
	CPUSLAB_FLUSH,		/* Abandoning of the cpu slab */
	DEACTIVATE_FULL,	/* Cpu slab was full when deactivated */
	DEACTIVATE_EMPTY,	/* Cpu slab was empty when deactivated */
	DEACTIVATE_TO_HEAD,	/* Cpu slab was moved to the head of partials */
	DEACTIVATE_TO_TAIL,	/* Cpu slab was moved to the tail of partials */
	DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
	DEACTIVATE_BYPASS,	/* Implicit deactivation */
	ORDER_FALLBACK,		/* Number of times fallback was necessary */
	CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */
	CMPXCHG_DOUBLE_FAIL,	/* Failures of slab freelist update */
	CPU_PARTIAL_ALLOC,	/* Used cpu partial on alloc */
	CPU_PARTIAL_FREE,	/* Refill cpu partial on free */
	CPU_PARTIAL_NODE,	/* Refill cpu partial from node partial */
	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
	SHEAF_FLUSH,		/* Objects flushed from a sheaf */
	SHEAF_REFILL,		/* Objects refilled to a sheaf */
	SHEAF_ALLOC,		/* Allocation of an empty sheaf */
	SHEAF_FREE,		/* Freeing of an empty sheaf */
	BARN_GET,		/* Got full sheaf from barn */
	BARN_GET_FAIL,		/* Failed to get full sheaf from barn */
	BARN_PUT,		/* Put full sheaf to barn */
	BARN_PUT_FAIL,		/* Failed to put full sheaf to barn */
	SHEAF_PREFILL_FAST,	/* Sheaf prefill grabbed the spare sheaf */
	SHEAF_PREFILL_SLOW,	/* Sheaf prefill found no spare sheaf */
	SHEAF_PREFILL_OVERSIZE,	/* Allocation of oversize sheaf for prefill */
	SHEAF_RETURN_FAST,	/* Sheaf return reattached spare sheaf */
	SHEAF_RETURN_SLOW,	/* Sheaf return could not reattach spare */
	NR_SLUB_STAT_ITEMS
};
  412
/*
 * Freelist pointer and transaction id, overlaid through a union with a
 * single freelist_full_t so that the pair can also be accessed as one value.
 */
struct freelist_tid {
	union {
		struct {
			void *freelist;		/* Pointer to next available object */
			unsigned long tid;	/* Globally unique transaction id */
		};
		freelist_full_t freelist_tid;
	};
};
  422
/*
 * Per cpu allocation state of a cache.
 *
 * When changing the layout, make sure freelist and tid are still compatible
 * with this_cpu_cmpxchg_double() alignment requirements.
 */
struct kmem_cache_cpu {
	struct freelist_tid;	/* Unnamed member: provides freelist and tid */
	struct slab *slab;	/* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct slab *partial;	/* Partially allocated slabs */
#endif
	local_trylock_t lock;	/* Protects the fields above */
#ifdef CONFIG_SLUB_STATS
	unsigned int stat[NR_SLUB_STAT_ITEMS];	/* Event counters, indexed by enum stat_item */
#endif
};
  438
/*
 * Count one occurrence of event @si for cache @s in this cpu's counters.
 * Compiles to nothing without CONFIG_SLUB_STATS.
 */
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
	/*
	 * The rmw is racy on a preemptible kernel but this is acceptable, so
	 * avoid this_cpu_add()'s irq-disable overhead.
	 */
	raw_cpu_inc(s->cpu_slab->stat[si]);
#endif
}
  449
/*
 * Add @v to the counter of event @si for cache @s in this cpu's counters.
 * Uses the same raw (unprotected) percpu update as stat(). Compiles to
 * nothing without CONFIG_SLUB_STATS.
 */
static inline
void stat_add(const struct kmem_cache *s, enum stat_item si, int v)
{
#ifdef CONFIG_SLUB_STATS
	raw_cpu_add(s->cpu_slab->stat[si], v);
#endif
}
  457
/* Limits on full and empty sheaves, presumably per barn - confirm at the users */
#define MAX_FULL_SHEAVES	10
#define MAX_EMPTY_SHEAVES	10

/* Per node store of sheaves; struct kmem_cache_node points to one. */
struct node_barn {
	spinlock_t lock;		/* presumably protects the lists and counters below */
	struct list_head sheaves_full;
	struct list_head sheaves_empty;
	unsigned int nr_full;		/* presumably number of sheaves on sheaves_full */
	unsigned int nr_empty;		/* presumably number of sheaves on sheaves_empty */
};
  468
/*
 * A sheaf: an array of object pointers belonging to one cache. The leading
 * union overlays fields of which only one set is in use at a time.
 */
struct slab_sheaf {
	union {
		struct rcu_head rcu_head;
		struct list_head barn_list;
		/* only used for prefilled sheafs */
		struct {
			unsigned int capacity;
			bool pfmemalloc;
		};
	};
	struct kmem_cache *cache;
	unsigned int size;	/* presumably number of entries used in objects[] - confirm */
	int node; /* only used for rcu_sheaf */
	void *objects[];	/* flexible array of object pointers */
};
  484
/* Per cpu set of sheaves together with the local lock declared alongside them. */
struct slub_percpu_sheaves {
	local_trylock_t lock;
	struct slab_sheaf *main; /* never NULL when unlocked */
	struct slab_sheaf *spare; /* empty or full, may be NULL */
	struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
};
  491
/*
 * The slab lists for all objects.
 *
 * The full list and the slab/object counters only exist with
 * CONFIG_SLUB_DEBUG.
 */
struct kmem_cache_node {
	spinlock_t list_lock;
	unsigned long nr_partial;	/* partial slab counter */
	struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
	atomic_long_t nr_slabs;
	atomic_long_t total_objects;
	struct list_head full;
#endif
	struct node_barn *barn;		/* this node's barn, see get_barn() */
};
  506
/*
 * Return the kmem_cache_node of @s for @node, read straight from s->node[].
 * No checking is done here; callers such as get_barn() handle a NULL result.
 */
static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
{
	return s->node[node];
}
  511
  512/*
  513 * Get the barn of the current cpu's closest memory node. It may not exist on
  514 * systems with memoryless nodes but without CONFIG_HAVE_MEMORYLESS_NODES
  515 */
  516static inline struct node_barn *get_barn(struct kmem_cache *s)
  517{
  518	struct kmem_cache_node *n = get_node(s, numa_mem_id());
  519
  520	if (!n)
  521		return NULL;
  522
  523	return n->barn;
  524}
  525
/*
 * Iterator over all nodes. The body will be executed for each node that has
 * a kmem_cache_node structure allocated (which is true for all online nodes)
 *
 * @__node is the loop counter and @__n receives the node structure.
 * Note that this expands to a for statement followed by an unbraced if, so
 * an else placed directly after the loop body would attach to that if.
 */
#define for_each_kmem_cache_node(__s, __node, __n) \
	for (__node = 0; __node < nr_node_ids; __node++) \
		 if ((__n = get_node(__s, __node)))
  533
/*
 * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
 * Corresponds to node_state[N_MEMORY], but can temporarily
 * differ during memory hotplug/hotremove operations.
 * Protected by slab_mutex.
 */
static nodemask_t slab_nodes;

/*
 * Workqueue used for flush_cpu_slab().
 */
static struct workqueue_struct *flushwq;

/* Per cpu work item for flushing; one instance per cpu is defined below. */
struct slub_flush_work {
	struct work_struct work;
	struct kmem_cache *s;	/* presumably the cache to flush - confirm at the users */
	bool skip;		/* presumably set when this cpu needs no flush - confirm */
};

/* NOTE(review): flush_lock presumably serializes users of slub_flush - confirm */
static DEFINE_MUTEX(flush_lock);
static DEFINE_PER_CPU(struct slub_flush_work, slub_flush);
  555
  556/********************************************************************
  557 * 			Core slab cache functions
  558 *******************************************************************/
  559
  560/*
  561 * Returns freelist pointer (ptr). With hardening, this is obfuscated
  562 * with an XOR of the address where the pointer is held and a per-cache
  563 * random number.
  564 */
  565static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s,
  566					    void *ptr, unsigned long ptr_addr)
  567{
  568	unsigned long encoded;
  569
  570#ifdef CONFIG_SLAB_FREELIST_HARDENED
  571	encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr);
  572#else
  573	encoded = (unsigned long)ptr;
  574#endif
  575	return (freeptr_t){.v = encoded};
  576}
  577
  578static inline void *freelist_ptr_decode(const struct kmem_cache *s,
  579					freeptr_t ptr, unsigned long ptr_addr)
  580{
  581	void *decoded;
  582
  583#ifdef CONFIG_SLAB_FREELIST_HARDENED
  584	decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr));
  585#else
  586	decoded = (void *)ptr.v;
  587#endif
  588	return decoded;
  589}
  590
  591static inline void *get_freepointer(struct kmem_cache *s, void *object)
  592{
  593	unsigned long ptr_addr;
  594	freeptr_t p;
  595
  596	object = kasan_reset_tag(object);
  597	ptr_addr = (unsigned long)object + s->offset;
  598	p = *(freeptr_t *)(ptr_addr);
  599	return freelist_ptr_decode(s, p, ptr_addr);
  600}
  601
/*
 * Issue a prefetchw() on the location of @object's free pointer
 * (s->offset bytes into the object).
 */
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
	prefetchw(object + s->offset);
}
  606
  607/*
  608 * When running under KMSAN, get_freepointer_safe() may return an uninitialized
  609 * pointer value in the case the current thread loses the race for the next
  610 * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
  611 * slab_alloc_node() will fail, so the uninitialized value won't be used, but
  612 * KMSAN will still check all arguments of cmpxchg because of imperfect
  613 * handling of inline assembly.
  614 * To work around this problem, we apply __no_kmsan_checks to ensure that
  615 * get_freepointer_safe() returns initialized memory.
  616 */
  617__no_kmsan_checks
  618static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
  619{
  620	unsigned long freepointer_addr;
  621	freeptr_t p;
  622
  623	if (!debug_pagealloc_enabled_static())
  624		return get_freepointer(s, object);
  625
  626	object = kasan_reset_tag(object);
  627	freepointer_addr = (unsigned long)object + s->offset;
  628	copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
  629	return freelist_ptr_decode(s, p, freepointer_addr);
  630}
  631
  632static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
  633{
  634	unsigned long freeptr_addr = (unsigned long)object + s->offset;
  635
  636#ifdef CONFIG_SLAB_FREELIST_HARDENED
  637	BUG_ON(object == fp); /* naive detection of double free or corruption */
  638#endif
  639
  640	freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
  641	*(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr);
  642}
  643
/*
 * Return true if the free pointer offset is at or beyond s->inuse.
 * See comment in calculate_sizes().
 */
static inline bool freeptr_outside_object(struct kmem_cache *s)
{
	return s->offset >= s->inuse;
}
  651
  652/*
  653 * Return offset of the end of info block which is inuse + free pointer if
  654 * not overlapping with object.
  655 */
  656static inline unsigned int get_info_end(struct kmem_cache *s)
  657{
  658	if (freeptr_outside_object(s))
  659		return s->inuse + sizeof(void *);
  660	else
  661		return s->inuse;
  662}
  663
/*
 * Loop over all objects in a slab.
 *
 * @__p is the cursor; it starts at fixup_red_left(@__s, @__addr) and
 * advances in steps of (@__s)->size while it stays below
 * @__addr + @__objects * (@__s)->size.
 */
#define for_each_object(__p, __s, __addr, __objects) \
	for (__p = fixup_red_left(__s, __addr); \
		__p < (__addr) + (__objects) * (__s)->size; \
		__p += (__s)->size)
  669
  670static inline unsigned int order_objects(unsigned int order, unsigned int size)
  671{
  672	return ((unsigned int)PAGE_SIZE << order) / size;
  673}
  674
  675static inline struct kmem_cache_order_objects oo_make(unsigned int order,
  676		unsigned int size)
  677{
  678	struct kmem_cache_order_objects x = {
  679		(order << OO_SHIFT) + order_objects(order, size)
  680	};
  681
  682	return x;
  683}
  684
/* Extract the page order (the bits from OO_SHIFT upwards) from @x. */
static inline unsigned int oo_order(struct kmem_cache_order_objects x)
{
	return x.x >> OO_SHIFT;
}
  689
/* Extract the object count (the low OO_SHIFT bits) from @x. */
static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
{
	return x.x & OO_MASK;
}
  694
  695#ifdef CONFIG_SLUB_CPU_PARTIAL
  696static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
  697{
  698	unsigned int nr_slabs;
  699
  700	s->cpu_partial = nr_objects;
  701
  702	/*
  703	 * We take the number of objects but actually limit the number of
  704	 * slabs on the per cpu partial list, in order to limit excessive
  705	 * growth of the list. For simplicity we assume that the slabs will
  706	 * be half-full.
  707	 */
  708	nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
  709	s->cpu_partial_slabs = nr_slabs;
  710}
  711
/* Return the per cpu partial limit of @s, expressed in slabs. */
static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
{
	return s->cpu_partial_slabs;
}
  716#else
#ifdef SLAB_SUPPORTS_SYSFS
/* !CONFIG_SLUB_CPU_PARTIAL: nothing to configure, the request is ignored. */
static inline void
slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
{
}
#endif

/* !CONFIG_SLUB_CPU_PARTIAL: the per cpu partial slab limit is always 0. */
static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
{
	return 0;
}
  728#endif /* CONFIG_SLUB_CPU_PARTIAL */
  729
/*
 * If network-based swap is enabled, slub must keep track of whether memory
 * was allocated from pfmemalloc reserves.
 *
 * Return true if the SL_pfmemalloc bit is set in the slab's flags.
 */
static inline bool slab_test_pfmemalloc(const struct slab *slab)
{
	return test_bit(SL_pfmemalloc, &slab->flags.f);
}
  738
/* Set the SL_pfmemalloc bit in the slab's flags using set_bit(). */
static inline void slab_set_pfmemalloc(struct slab *slab)
{
	set_bit(SL_pfmemalloc, &slab->flags.f);
}
  743
/*
 * Clear the SL_pfmemalloc bit in the slab's flags.
 * NOTE(review): this uses __clear_bit(), presumably the non-atomic variant,
 * so the caller likely needs exclusive access to the slab - confirm.
 */
static inline void __slab_clear_pfmemalloc(struct slab *slab)
{
	__clear_bit(SL_pfmemalloc, &slab->flags.f);
}
  748
/*
 * Per slab locking using the pagelock
 *
 * slab_lock() takes the SL_locked bit of the slab's flags as a bit spinlock.
 */
static __always_inline void slab_lock(struct slab *slab)
{
	bit_spin_lock(SL_locked, &slab->flags.f);
}
  756
/* Release the bit spinlock taken by slab_lock(). */
static __always_inline void slab_unlock(struct slab *slab)
{
	bit_spin_unlock(SL_locked, &slab->flags.f);
}
  761
/*
 * Try to switch the slab's combined freelist/counters value from @old to
 * @new with try_cmpxchg_freelist(). Returns whether the exchange happened.
 * Without system_has_freelist_aba this always returns false.
 */
static inline bool
__update_freelist_fast(struct slab *slab, struct freelist_counters *old,
		       struct freelist_counters *new)
{
#ifdef system_has_freelist_aba
	return try_cmpxchg_freelist(&slab->freelist_counters,
				    &old->freelist_counters,
				    new->freelist_counters);
#else
	return false;
#endif
}
  774
  775static inline bool
  776__update_freelist_slow(struct slab *slab, struct freelist_counters *old,
  777		       struct freelist_counters *new)
  778{
  779	bool ret = false;
  780
  781	slab_lock(slab);
  782	if (slab->freelist == old->freelist &&
  783	    slab->counters == old->counters) {
  784		slab->freelist = new->freelist;
  785		slab->counters = new->counters;
  786		ret = true;
  787	}
  788	slab_unlock(slab);
  789
  790	return ret;
  791}
  792
  793/*
  794 * Interrupts must be disabled (for the fallback code to work right), typically
  795 * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
  796 * part of bit_spin_lock(), is sufficient because the policy is not to allow any
  797 * allocation/ free operation in hardirq context. Therefore nothing can
  798 * interrupt the operation.
  799 */
  800static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab,
  801		struct freelist_counters *old, struct freelist_counters *new, const char *n)
  802{
  803	bool ret;
  804
  805	if (USE_LOCKLESS_FAST_PATH())
  806		lockdep_assert_irqs_disabled();
  807
  808	if (s->flags & __CMPXCHG_DOUBLE)
  809		ret = __update_freelist_fast(slab, old, new);
  810	else
  811		ret = __update_freelist_slow(slab, old, new);
  812
  813	if (likely(ret))
  814		return true;
  815
  816	cpu_relax();
  817	stat(s, CMPXCHG_DOUBLE_FAIL);
  818
  819#ifdef SLUB_DEBUG_CMPXCHG
  820	pr_info("%s %s: cmpxchg double redo ", n, s->name);
  821#endif
  822
  823	return false;
  824}
  825
  826static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
  827		struct freelist_counters *old, struct freelist_counters *new, const char *n)
  828{
  829	bool ret;
  830
  831	if (s->flags & __CMPXCHG_DOUBLE) {
  832		ret = __update_freelist_fast(slab, old, new);
  833	} else {
  834		unsigned long flags;
  835
  836		local_irq_save(flags);
  837		ret = __update_freelist_slow(slab, old, new);
  838		local_irq_restore(flags);
  839	}
  840	if (likely(ret))
  841		return true;
  842
  843	cpu_relax();
  844	stat(s, CMPXCHG_DOUBLE_FAIL);
  845
  846#ifdef SLUB_DEBUG_CMPXCHG
  847	pr_info("%s %s: cmpxchg double redo ", n, s->name);
  848#endif
  849
  850	return false;
  851}
  852
  853/*
  854 * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API
  855 * family will round up the real request size to these fixed ones, so
  856 * there could be an extra area than what is requested. Save the original
  857 * request size in the meta data area, for better debug and sanity check.
  858 */
  859static inline void set_orig_size(struct kmem_cache *s,
  860				void *object, unsigned int orig_size)
  861{
  862	void *p = kasan_reset_tag(object);
  863
  864	if (!slub_debug_orig_size(s))
  865		return;
  866
  867	p += get_info_end(s);
  868	p += sizeof(struct track) * 2;
  869
  870	*(unsigned int *)p = orig_size;
  871}
  872
  873static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
  874{
  875	void *p = kasan_reset_tag(object);
  876
  877	if (is_kfence_address(object))
  878		return kfence_ksize(object);
  879
  880	if (!slub_debug_orig_size(s))
  881		return s->object_size;
  882
  883	p += get_info_end(s);
  884	p += sizeof(struct track) * 2;
  885
  886	return *(unsigned int *)p;
  887}
  888
  889#ifdef CONFIG_SLUB_DEBUG
  890
/*
 * For debugging context when we want to check if the struct slab pointer
 * appears to be valid.
 *
 * Returns the result of PageSlab() on the page backing @slab.
 */
static inline bool validate_slab_ptr(struct slab *slab)
{
	return PageSlab(slab_page(slab));
}
  899
/*
 * Bitmap with one bit per object, sized for MAX_OBJS_PER_PAGE objects.
 * NOTE(review): object_map_lock presumably guards this shared bitmap - confirm.
 */
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_SPINLOCK(object_map_lock);
  902
  903static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
  904		       struct slab *slab)
  905{
  906	void *addr = slab_address(slab);
  907	void *p;
  908
  909	bitmap_zero(obj_map, slab->objects);
  910
  911	for (p = slab->freelist; p; p = get_freepointer(s, p))
  912		set_bit(__obj_to_index(s, addr, p), obj_map);
  913}
  914
  915#if IS_ENABLED(CONFIG_KUNIT)
  916static bool slab_add_kunit_errors(void)
  917{
  918	struct kunit_resource *resource;
  919
  920	if (!kunit_get_current_test())
  921		return false;
  922
  923	resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
  924	if (!resource)
  925		return false;
  926
  927	(*(int *)resource->data)++;
  928	kunit_put_resource(resource);
  929	return true;
  930}
  931
  932bool slab_in_kunit_test(void)
  933{
  934	struct kunit_resource *resource;
  935
  936	if (!kunit_get_current_test())
  937		return false;
  938
  939	resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
  940	if (!resource)
  941		return false;
  942
  943	kunit_put_resource(resource);
  944	return true;
  945}
  946#else
  947static inline bool slab_add_kunit_errors(void) { return false; }
  948#endif
  949
  950static inline unsigned int size_from_object(struct kmem_cache *s)
  951{
  952	if (s->flags & SLAB_RED_ZONE)
  953		return s->size - s->red_left_pad;
  954
  955	return s->size;
  956}
  957
  958static inline void *restore_red_left(struct kmem_cache *s, void *p)
  959{
  960	if (s->flags & SLAB_RED_ZONE)
  961		p -= s->red_left_pad;
  962
  963	return p;
  964}
  965
/*
 * Debug settings:
 *
 * slub_debug holds the global debug flags. It starts out as
 * DEBUG_DEFAULT_FLAGS when CONFIG_SLUB_DEBUG_ON is set and as zero
 * otherwise; setup_slub_debug() overwrites it from the boot parameter.
 *
 * slub_debug_string is set by setup_slub_debug() to the boot option string
 * when that string names specific slabs; kmem_cache_flags() re-parses it.
 *
 * disable_higher_order_debug is set by parse_slub_debug_flags() when the
 * 'o' option is seen during initial parsing.
 */
#if defined(CONFIG_SLUB_DEBUG_ON)
static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
#else
static slab_flags_t slub_debug;
#endif

static const char *slub_debug_string __ro_after_init;
static int disable_higher_order_debug;
  977
  978/*
  979 * slub is about to manipulate internal object metadata.  This memory lies
  980 * outside the range of the allocated object, so accessing it would normally
  981 * be reported by kasan as a bounds error.  metadata_access_enable() is used
  982 * to tell kasan that these accesses are OK.
  983 */
  984static inline void metadata_access_enable(void)
  985{
  986	kasan_disable_current();
  987	kmsan_disable_current();
  988}
  989
  990static inline void metadata_access_disable(void)
  991{
  992	kmsan_enable_current();
  993	kasan_enable_current();
  994}
  995
  996/*
  997 * Object debugging
  998 */
  999
 1000/* Verify that a pointer has an address that is valid within a slab page */
 1001static inline int check_valid_pointer(struct kmem_cache *s,
 1002				struct slab *slab, void *object)
 1003{
 1004	void *base;
 1005
 1006	if (!object)
 1007		return 1;
 1008
 1009	base = slab_address(slab);
 1010	object = kasan_reset_tag(object);
 1011	object = restore_red_left(s, object);
 1012	if (object < base || object >= base + slab->objects * s->size ||
 1013		(object - base) % s->size) {
 1014		return 0;
 1015	}
 1016
 1017	return 1;
 1018}
 1019
 1020static void print_section(char *level, char *text, u8 *addr,
 1021			  unsigned int length)
 1022{
 1023	metadata_access_enable();
 1024	print_hex_dump(level, text, DUMP_PREFIX_ADDRESS,
 1025			16, 1, kasan_reset_tag((void *)addr), length, 1);
 1026	metadata_access_disable();
 1027}
 1028
 1029static struct track *get_track(struct kmem_cache *s, void *object,
 1030	enum track_item alloc)
 1031{
 1032	struct track *p;
 1033
 1034	p = object + get_info_end(s);
 1035
 1036	return kasan_reset_tag(p + alloc);
 1037}
 1038
 1039#ifdef CONFIG_STACKDEPOT
 1040static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
 1041{
 1042	depot_stack_handle_t handle;
 1043	unsigned long entries[TRACK_ADDRS_COUNT];
 1044	unsigned int nr_entries;
 1045
 1046	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
 1047	handle = stack_depot_save(entries, nr_entries, gfp_flags);
 1048
 1049	return handle;
 1050}
 1051#else
 1052static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags)
 1053{
 1054	return 0;
 1055}
 1056#endif
 1057
 1058static void set_track_update(struct kmem_cache *s, void *object,
 1059			     enum track_item alloc, unsigned long addr,
 1060			     depot_stack_handle_t handle)
 1061{
 1062	struct track *p = get_track(s, object, alloc);
 1063
 1064#ifdef CONFIG_STACKDEPOT
 1065	p->handle = handle;
 1066#endif
 1067	p->addr = addr;
 1068	p->cpu = smp_processor_id();
 1069	p->pid = current->pid;
 1070	p->when = jiffies;
 1071}
 1072
 1073static __always_inline void set_track(struct kmem_cache *s, void *object,
 1074				      enum track_item alloc, unsigned long addr, gfp_t gfp_flags)
 1075{
 1076	depot_stack_handle_t handle = set_track_prepare(gfp_flags);
 1077
 1078	set_track_update(s, object, alloc, addr, handle);
 1079}
 1080
 1081static void init_tracking(struct kmem_cache *s, void *object)
 1082{
 1083	struct track *p;
 1084
 1085	if (!(s->flags & SLAB_STORE_USER))
 1086		return;
 1087
 1088	p = get_track(s, object, TRACK_ALLOC);
 1089	memset(p, 0, 2*sizeof(struct track));
 1090}
 1091
 1092static void print_track(const char *s, struct track *t, unsigned long pr_time)
 1093{
 1094	depot_stack_handle_t handle __maybe_unused;
 1095
 1096	if (!t->addr)
 1097		return;
 1098
 1099	pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
 1100	       s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
 1101#ifdef CONFIG_STACKDEPOT
 1102	handle = READ_ONCE(t->handle);
 1103	if (handle)
 1104		stack_depot_print(handle);
 1105	else
 1106		pr_err("object allocation/free stack trace missing\n");
 1107#endif
 1108}
 1109
 1110void print_tracking(struct kmem_cache *s, void *object)
 1111{
 1112	unsigned long pr_time = jiffies;
 1113	if (!(s->flags & SLAB_STORE_USER))
 1114		return;
 1115
 1116	print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
 1117	print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
 1118}
 1119
 1120static void print_slab_info(const struct slab *slab)
 1121{
 1122	pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
 1123	       slab, slab->objects, slab->inuse, slab->freelist,
 1124	       &slab->flags.f);
 1125}
 1126
 1127void skip_orig_size_check(struct kmem_cache *s, const void *object)
 1128{
 1129	set_orig_size(s, (void *)object, s->object_size);
 1130}
 1131
 1132static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp)
 1133{
 1134	struct va_format vaf;
 1135	va_list args;
 1136
 1137	va_copy(args, argsp);
 1138	vaf.fmt = fmt;
 1139	vaf.va = &args;
 1140	pr_err("=============================================================================\n");
 1141	pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf);
 1142	pr_err("-----------------------------------------------------------------------------\n\n");
 1143	va_end(args);
 1144}
 1145
 1146static void slab_bug(struct kmem_cache *s, const char *fmt, ...)
 1147{
 1148	va_list args;
 1149
 1150	va_start(args, fmt);
 1151	__slab_bug(s, fmt, args);
 1152	va_end(args);
 1153}
 1154
 1155__printf(2, 3)
 1156static void slab_fix(struct kmem_cache *s, const char *fmt, ...)
 1157{
 1158	struct va_format vaf;
 1159	va_list args;
 1160
 1161	if (slab_add_kunit_errors())
 1162		return;
 1163
 1164	va_start(args, fmt);
 1165	vaf.fmt = fmt;
 1166	vaf.va = &args;
 1167	pr_err("FIX %s: %pV\n", s->name, &vaf);
 1168	va_end(args);
 1169}
 1170
 1171static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
 1172{
 1173	unsigned int off;	/* Offset of last byte */
 1174	u8 *addr = slab_address(slab);
 1175
 1176	print_tracking(s, p);
 1177
 1178	print_slab_info(slab);
 1179
 1180	pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
 1181	       p, p - addr, get_freepointer(s, p));
 1182
 1183	if (s->flags & SLAB_RED_ZONE)
 1184		print_section(KERN_ERR, "Redzone  ", p - s->red_left_pad,
 1185			      s->red_left_pad);
 1186	else if (p > addr + 16)
 1187		print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
 1188
 1189	print_section(KERN_ERR,         "Object   ", p,
 1190		      min_t(unsigned int, s->object_size, PAGE_SIZE));
 1191	if (s->flags & SLAB_RED_ZONE)
 1192		print_section(KERN_ERR, "Redzone  ", p + s->object_size,
 1193			s->inuse - s->object_size);
 1194
 1195	off = get_info_end(s);
 1196
 1197	if (s->flags & SLAB_STORE_USER)
 1198		off += 2 * sizeof(struct track);
 1199
 1200	if (slub_debug_orig_size(s))
 1201		off += sizeof(unsigned int);
 1202
 1203	off += kasan_metadata_size(s, false);
 1204
 1205	if (off != size_from_object(s))
 1206		/* Beginning of the filler is the free pointer */
 1207		print_section(KERN_ERR, "Padding  ", p + off,
 1208			      size_from_object(s) - off);
 1209}
 1210
 1211static void object_err(struct kmem_cache *s, struct slab *slab,
 1212			u8 *object, const char *reason)
 1213{
 1214	if (slab_add_kunit_errors())
 1215		return;
 1216
 1217	slab_bug(s, reason);
 1218	if (!object || !check_valid_pointer(s, slab, object)) {
 1219		print_slab_info(slab);
 1220		pr_err("Invalid pointer 0x%p\n", object);
 1221	} else {
 1222		print_trailer(s, slab, object);
 1223	}
 1224	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 1225
 1226	WARN_ON(1);
 1227}
 1228
 1229static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
 1230			       void **freelist, void *nextfree)
 1231{
 1232	if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
 1233	    !check_valid_pointer(s, slab, nextfree) && freelist) {
 1234		object_err(s, slab, *freelist, "Freechain corrupt");
 1235		*freelist = NULL;
 1236		slab_fix(s, "Isolate corrupted freechain");
 1237		return true;
 1238	}
 1239
 1240	return false;
 1241}
 1242
 1243static void __slab_err(struct slab *slab)
 1244{
 1245	if (slab_in_kunit_test())
 1246		return;
 1247
 1248	print_slab_info(slab);
 1249	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 1250
 1251	WARN_ON(1);
 1252}
 1253
 1254static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
 1255			const char *fmt, ...)
 1256{
 1257	va_list args;
 1258
 1259	if (slab_add_kunit_errors())
 1260		return;
 1261
 1262	va_start(args, fmt);
 1263	__slab_bug(s, fmt, args);
 1264	va_end(args);
 1265
 1266	__slab_err(slab);
 1267}
 1268
 1269static void init_object(struct kmem_cache *s, void *object, u8 val)
 1270{
 1271	u8 *p = kasan_reset_tag(object);
 1272	unsigned int poison_size = s->object_size;
 1273
 1274	if (s->flags & SLAB_RED_ZONE) {
 1275		/*
 1276		 * Here and below, avoid overwriting the KMSAN shadow. Keeping
 1277		 * the shadow makes it possible to distinguish uninit-value
 1278		 * from use-after-free.
 1279		 */
 1280		memset_no_sanitize_memory(p - s->red_left_pad, val,
 1281					  s->red_left_pad);
 1282
 1283		if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
 1284			/*
 1285			 * Redzone the extra allocated space by kmalloc than
 1286			 * requested, and the poison size will be limited to
 1287			 * the original request size accordingly.
 1288			 */
 1289			poison_size = get_orig_size(s, object);
 1290		}
 1291	}
 1292
 1293	if (s->flags & __OBJECT_POISON) {
 1294		memset_no_sanitize_memory(p, POISON_FREE, poison_size - 1);
 1295		memset_no_sanitize_memory(p + poison_size - 1, POISON_END, 1);
 1296	}
 1297
 1298	if (s->flags & SLAB_RED_ZONE)
 1299		memset_no_sanitize_memory(p + poison_size, val,
 1300					  s->inuse - poison_size);
 1301}
 1302
 1303static void restore_bytes(struct kmem_cache *s, const char *message, u8 data,
 1304						void *from, void *to)
 1305{
 1306	slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data);
 1307	memset(from, data, to - from);
 1308}
 1309
/*
 * Function attributes applied to the pad-checking helpers below: with
 * CONFIG_KMSAN they are marked noinline and __no_kmsan_checks, otherwise
 * the macro expands to nothing.
 */
#ifdef CONFIG_KMSAN
#define pad_check_attributes noinline __no_kmsan_checks
#else
#define pad_check_attributes
#endif
 1315
 1316static pad_check_attributes int
 1317check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
 1318		       u8 *object, const char *what, u8 *start, unsigned int value,
 1319		       unsigned int bytes, bool slab_obj_print)
 1320{
 1321	u8 *fault;
 1322	u8 *end;
 1323	u8 *addr = slab_address(slab);
 1324
 1325	metadata_access_enable();
 1326	fault = memchr_inv(kasan_reset_tag(start), value, bytes);
 1327	metadata_access_disable();
 1328	if (!fault)
 1329		return 1;
 1330
 1331	end = start + bytes;
 1332	while (end > fault && end[-1] == value)
 1333		end--;
 1334
 1335	if (slab_add_kunit_errors())
 1336		goto skip_bug_print;
 1337
 1338	pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
 1339	       what, fault, end - 1, fault - addr, fault[0], value);
 1340
 1341	if (slab_obj_print)
 1342		object_err(s, slab, object, "Object corrupt");
 1343
 1344skip_bug_print:
 1345	restore_bytes(s, what, value, fault, end);
 1346	return 0;
 1347}
 1348
 1349/*
 1350 * Object layout:
 1351 *
 1352 * object address
 1353 * 	Bytes of the object to be managed.
 1354 * 	If the freepointer may overlay the object then the free
 1355 *	pointer is at the middle of the object.
 1356 *
 1357 * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
 1358 * 	0xa5 (POISON_END)
 1359 *
 1360 * object + s->object_size
 1361 * 	Padding to reach word boundary. This is also used for Redzoning.
 1362 * 	Padding is extended by another word if Redzoning is enabled and
 1363 * 	object_size == inuse.
 1364 *
 1365 * 	We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with
 1366 * 	0xcc (SLUB_RED_ACTIVE) for objects in use.
 1367 *
 1368 * object + s->inuse
 1369 * 	Meta data starts here.
 1370 *
 1371 * 	A. Free pointer (if we cannot overwrite object on free)
 1372 * 	B. Tracking data for SLAB_STORE_USER
 1373 *	C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
 1374 *	D. Padding to reach required alignment boundary or at minimum
 1375 * 		one word if debugging is on to be able to detect writes
 1376 * 		before the word boundary.
 1377 *
 1378 *	Padding is done using 0x5a (POISON_INUSE)
 1379 *
 1380 * object + s->size
 1381 * 	Nothing is used beyond s->size.
 1382 *
 1383 * If slabcaches are merged then the object_size and inuse boundaries are mostly
 1384 * ignored. And therefore no slab options that rely on these boundaries
 1385 * may be used with merged slabcaches.
 1386 */
 1387
 1388static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
 1389{
 1390	unsigned long off = get_info_end(s);	/* The end of info */
 1391
 1392	if (s->flags & SLAB_STORE_USER) {
 1393		/* We also have user information there */
 1394		off += 2 * sizeof(struct track);
 1395
 1396		if (s->flags & SLAB_KMALLOC)
 1397			off += sizeof(unsigned int);
 1398	}
 1399
 1400	off += kasan_metadata_size(s, false);
 1401
 1402	if (size_from_object(s) == off)
 1403		return 1;
 1404
 1405	return check_bytes_and_report(s, slab, p, "Object padding",
 1406			p + off, POISON_INUSE, size_from_object(s) - off, true);
 1407}
 1408
 1409/* Check the pad bytes at the end of a slab page */
 1410static pad_check_attributes void
 1411slab_pad_check(struct kmem_cache *s, struct slab *slab)
 1412{
 1413	u8 *start;
 1414	u8 *fault;
 1415	u8 *end;
 1416	u8 *pad;
 1417	int length;
 1418	int remainder;
 1419
 1420	if (!(s->flags & SLAB_POISON))
 1421		return;
 1422
 1423	start = slab_address(slab);
 1424	length = slab_size(slab);
 1425	end = start + length;
 1426	remainder = length % s->size;
 1427	if (!remainder)
 1428		return;
 1429
 1430	pad = end - remainder;
 1431	metadata_access_enable();
 1432	fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
 1433	metadata_access_disable();
 1434	if (!fault)
 1435		return;
 1436	while (end > fault && end[-1] == POISON_INUSE)
 1437		end--;
 1438
 1439	slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu",
 1440		 fault, end - 1, fault - start);
 1441	print_section(KERN_ERR, "Padding ", pad, remainder);
 1442	__slab_err(slab);
 1443
 1444	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
 1445}
 1446
/*
 * Run the debug checks selected by the cache's flags on a single object:
 * the red zones (or, without red zoning, the alignment padding), the free
 * poison, the trailing pad bytes and the free pointer.
 *
 * @val is the red zone byte the object is expected to carry. With
 * SLUB_RED_ACTIVE the poison check is skipped, and the free pointer is only
 * checked when freeptr_outside_object() holds.
 *
 * Returns 1 if every check passed, 0 otherwise. A corrupt free pointer is
 * reset to NULL.
 */
static int check_object(struct kmem_cache *s, struct slab *slab,
					void *object, u8 val)
{
	u8 *p = object;
	u8 *endobject = object + s->object_size;
	unsigned int orig_size, kasan_meta_size;
	int ret = 1;

	/*
	 * ret is also passed as the slab_obj_print argument: once one check
	 * has failed, later check_bytes_and_report() calls in this function
	 * no longer dump the whole object.
	 */
	if (s->flags & SLAB_RED_ZONE) {
		if (!check_bytes_and_report(s, slab, object, "Left Redzone",
			object - s->red_left_pad, val, s->red_left_pad, ret))
			ret = 0;

		if (!check_bytes_and_report(s, slab, object, "Right Redzone",
			endobject, val, s->inuse - s->object_size, ret))
			ret = 0;

		if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) {
			orig_size = get_orig_size(s, object);

			/* Bytes past the requested size are red zoned too. */
			if (s->object_size > orig_size  &&
				!check_bytes_and_report(s, slab, object,
					"kmalloc Redzone", p + orig_size,
					val, s->object_size - orig_size, ret)) {
				ret = 0;
			}
		}
	} else {
		if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
			if (!check_bytes_and_report(s, slab, p, "Alignment padding",
				endobject, POISON_INUSE,
				s->inuse - s->object_size, ret))
				ret = 0;
		}
	}

	if (s->flags & SLAB_POISON) {
		if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON)) {
			/*
			 * KASAN can save its free meta data inside of the
			 * object at offset 0. Thus, skip checking the part of
			 * the redzone that overlaps with the meta data.
			 */
			kasan_meta_size = kasan_metadata_size(s, true);
			if (kasan_meta_size < s->object_size - 1 &&
			    !check_bytes_and_report(s, slab, p, "Poison",
					p + kasan_meta_size, POISON_FREE,
					s->object_size - kasan_meta_size - 1, ret))
				ret = 0;
			if (kasan_meta_size < s->object_size &&
			    !check_bytes_and_report(s, slab, p, "End Poison",
					p + s->object_size - 1, POISON_END, 1, ret))
				ret = 0;
		}
		/*
		 * check_pad_bytes cleans up on its own.
		 */
		if (!check_pad_bytes(s, slab, p))
			ret = 0;
	}

	/*
	 * Cannot check freepointer while object is allocated if
	 * object and freepointer overlap.
	 */
	if ((freeptr_outside_object(s) || val != SLUB_RED_ACTIVE) &&
	    !check_valid_pointer(s, slab, get_freepointer(s, p))) {
		object_err(s, slab, p, "Freepointer corrupt");
		/*
		 * No choice but to zap it and thus lose the remainder
		 * of the free objects in this slab. May cause
		 * another error because the object count is now wrong.
		 */
		set_freepointer(s, p, NULL);
		ret = 0;
	}

	return ret;
}
 1526
 1527/*
 1528 * Checks if the slab state looks sane. Assumes the struct slab pointer
 1529 * was either obtained in a way that ensures it's valid, or validated
 1530 * by validate_slab_ptr()
 1531 */
 1532static int check_slab(struct kmem_cache *s, struct slab *slab)
 1533{
 1534	int maxobj;
 1535
 1536	maxobj = order_objects(slab_order(slab), s->size);
 1537	if (slab->objects > maxobj) {
 1538		slab_err(s, slab, "objects %u > max %u",
 1539			slab->objects, maxobj);
 1540		return 0;
 1541	}
 1542	if (slab->inuse > slab->objects) {
 1543		slab_err(s, slab, "inuse %u > max %u",
 1544			slab->inuse, slab->objects);
 1545		return 0;
 1546	}
 1547	if (slab->frozen) {
 1548		slab_err(s, slab, "Slab disabled since SLUB metadata consistency check failed");
 1549		return 0;
 1550	}
 1551
 1552	/* Slab_pad_check fixes things up after itself */
 1553	slab_pad_check(s, slab);
 1554	return 1;
 1555}
 1556
/*
 * Determine if a certain object in a slab is on the freelist. Must hold the
 * slab lock to guarantee that the chains are in a consistent state.
 *
 * The whole freelist is walked and validated along the way. An invalid
 * pointer or a cycle is reported and the list is truncated or cleared; the
 * slab's objects and inuse counters are corrected if they disagree with
 * what was found.
 *
 * Returns true if @search was found on the freelist. If the walk completes
 * without finding it, returns true only when @search is NULL. Returns false
 * after the freelist had to be cleared.
 */
static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
{
	int nr = 0;
	void *fp;
	void *object = NULL;
	int max_objects;

	fp = slab->freelist;
	/* Allow one step more than slab->objects so that a cycle is noticed. */
	while (fp && nr <= slab->objects) {
		if (fp == search)
			return true;
		if (!check_valid_pointer(s, slab, fp)) {
			if (object) {
				/* Bad link inside the chain: cut it after @object. */
				object_err(s, slab, object,
					"Freechain corrupt");
				set_freepointer(s, object, NULL);
				break;
			} else {
				/* The list head itself is bad: drop the whole list. */
				slab_err(s, slab, "Freepointer corrupt");
				slab->freelist = NULL;
				slab->inuse = slab->objects;
				slab_fix(s, "Freelist cleared");
				return false;
			}
		}
		object = fp;
		fp = get_freepointer(s, object);
		nr++;
	}

	/* More free objects than the slab holds means the list loops. */
	if (nr > slab->objects) {
		slab_err(s, slab, "Freelist cycle detected");
		slab->freelist = NULL;
		slab->inuse = slab->objects;
		slab_fix(s, "Freelist cleared");
		return false;
	}

	max_objects = order_objects(slab_order(slab), s->size);
	if (max_objects > MAX_OBJS_PER_PAGE)
		max_objects = MAX_OBJS_PER_PAGE;

	if (slab->objects != max_objects) {
		slab_err(s, slab, "Wrong number of objects. Found %d but should be %d",
			 slab->objects, max_objects);
		slab->objects = max_objects;
		slab_fix(s, "Number of objects adjusted");
	}
	/* nr is the number of free objects counted during the walk. */
	if (slab->inuse != slab->objects - nr) {
		slab_err(s, slab, "Wrong object count. Counter is %d but counted were %d",
			 slab->inuse, slab->objects - nr);
		slab->inuse = slab->objects - nr;
		slab_fix(s, "Object count adjusted");
	}
	return search == NULL;
}
 1617
 1618static void trace(struct kmem_cache *s, struct slab *slab, void *object,
 1619								int alloc)
 1620{
 1621	if (s->flags & SLAB_TRACE) {
 1622		pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
 1623			s->name,
 1624			alloc ? "alloc" : "free",
 1625			object, slab->inuse,
 1626			slab->freelist);
 1627
 1628		if (!alloc)
 1629			print_section(KERN_INFO, "Object ", (void *)object,
 1630					s->object_size);
 1631
 1632		dump_stack();
 1633	}
 1634}
 1635
 1636/*
 1637 * Tracking of fully allocated slabs for debugging purposes.
 1638 */
 1639static void add_full(struct kmem_cache *s,
 1640	struct kmem_cache_node *n, struct slab *slab)
 1641{
 1642	if (!(s->flags & SLAB_STORE_USER))
 1643		return;
 1644
 1645	lockdep_assert_held(&n->list_lock);
 1646	list_add(&slab->slab_list, &n->full);
 1647}
 1648
 1649static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab)
 1650{
 1651	if (!(s->flags & SLAB_STORE_USER))
 1652		return;
 1653
 1654	lockdep_assert_held(&n->list_lock);
 1655	list_del(&slab->slab_list);
 1656}
 1657
 1658static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
 1659{
 1660	return atomic_long_read(&n->nr_slabs);
 1661}
 1662
 1663static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
 1664{
 1665	struct kmem_cache_node *n = get_node(s, node);
 1666
 1667	atomic_long_inc(&n->nr_slabs);
 1668	atomic_long_add(objects, &n->total_objects);
 1669}
 1670static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
 1671{
 1672	struct kmem_cache_node *n = get_node(s, node);
 1673
 1674	atomic_long_dec(&n->nr_slabs);
 1675	atomic_long_sub(objects, &n->total_objects);
 1676}
 1677
 1678/* Object debug checks for alloc/free paths */
 1679static void setup_object_debug(struct kmem_cache *s, void *object)
 1680{
 1681	if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
 1682		return;
 1683
 1684	init_object(s, object, SLUB_RED_INACTIVE);
 1685	init_tracking(s, object);
 1686}
 1687
 1688static
 1689void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr)
 1690{
 1691	if (!kmem_cache_debug_flags(s, SLAB_POISON))
 1692		return;
 1693
 1694	metadata_access_enable();
 1695	memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab));
 1696	metadata_access_disable();
 1697}
 1698
 1699static inline int alloc_consistency_checks(struct kmem_cache *s,
 1700					struct slab *slab, void *object)
 1701{
 1702	if (!check_slab(s, slab))
 1703		return 0;
 1704
 1705	if (!check_valid_pointer(s, slab, object)) {
 1706		object_err(s, slab, object, "Freelist Pointer check fails");
 1707		return 0;
 1708	}
 1709
 1710	if (!check_object(s, slab, object, SLUB_RED_INACTIVE))
 1711		return 0;
 1712
 1713	return 1;
 1714}
 1715
 1716static noinline bool alloc_debug_processing(struct kmem_cache *s,
 1717			struct slab *slab, void *object, int orig_size)
 1718{
 1719	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
 1720		if (!alloc_consistency_checks(s, slab, object))
 1721			goto bad;
 1722	}
 1723
 1724	/* Success. Perform special debug activities for allocs */
 1725	trace(s, slab, object, 1);
 1726	set_orig_size(s, object, orig_size);
 1727	init_object(s, object, SLUB_RED_ACTIVE);
 1728	return true;
 1729
 1730bad:
 1731	/*
 1732	 * Let's do the best we can to avoid issues in the future. Marking all
 1733	 * objects as used avoids touching the remaining objects.
 1734	 */
 1735	slab_fix(s, "Marking all objects used");
 1736	slab->inuse = slab->objects;
 1737	slab->freelist = NULL;
 1738	slab->frozen = 1; /* mark consistency-failed slab as frozen */
 1739
 1740	return false;
 1741}
 1742
 1743static inline int free_consistency_checks(struct kmem_cache *s,
 1744		struct slab *slab, void *object, unsigned long addr)
 1745{
 1746	if (!check_valid_pointer(s, slab, object)) {
 1747		slab_err(s, slab, "Invalid object pointer 0x%p", object);
 1748		return 0;
 1749	}
 1750
 1751	if (on_freelist(s, slab, object)) {
 1752		object_err(s, slab, object, "Object already free");
 1753		return 0;
 1754	}
 1755
 1756	if (!check_object(s, slab, object, SLUB_RED_ACTIVE))
 1757		return 0;
 1758
 1759	if (unlikely(s != slab->slab_cache)) {
 1760		if (!slab->slab_cache) {
 1761			slab_err(NULL, slab, "No slab cache for object 0x%p",
 1762				 object);
 1763		} else {
 1764			object_err(s, slab, object,
 1765				   "page slab pointer corrupt.");
 1766		}
 1767		return 0;
 1768	}
 1769	return 1;
 1770}
 1771
/*
 * Parse a block of slab_debug options. Blocks are delimited by ';'
 *
 * @str:    start of block
 * @flags:  returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
 * @slabs:  return start of list of slabs, or NULL when there's no list
 * @init:   assume this is initial parsing and not per-kmem-create parsing
 *
 * A block has the form <option letters>[,<slab list>]. Option letters are
 * matched case-insensitively. Unknown letters are skipped, with a message
 * only when @init is true. The 'o' option sets disable_higher_order_debug,
 * also only when @init is true.
 *
 * returns the start of next block if there's any, or NULL
 */
static const char *
parse_slub_debug_flags(const char *str, slab_flags_t *flags, const char **slabs, bool init)
{
	bool higher_order_disable = false;

	/* Skip any completely empty blocks */
	while (*str && *str == ';')
		str++;

	if (*str == ',') {
		/*
		 * No options but restriction on slabs. This means full
		 * debugging for slabs matching a pattern.
		 */
		*flags = DEBUG_DEFAULT_FLAGS;
		goto check_slabs;
	}
	*flags = 0;

	/* Determine which debug features should be switched on */
	for (; *str && *str != ',' && *str != ';'; str++) {
		switch (tolower(*str)) {
		case '-':
			/* Discards every flag collected so far in this block. */
			*flags = 0;
			break;
		case 'f':
			*flags |= SLAB_CONSISTENCY_CHECKS;
			break;
		case 'z':
			*flags |= SLAB_RED_ZONE;
			break;
		case 'p':
			*flags |= SLAB_POISON;
			break;
		case 'u':
			*flags |= SLAB_STORE_USER;
			break;
		case 't':
			*flags |= SLAB_TRACE;
			break;
		case 'a':
			*flags |= SLAB_FAILSLAB;
			break;
		case 'o':
			/*
			 * Avoid enabling debugging on caches if its minimum
			 * order would increase as a result.
			 */
			higher_order_disable = true;
			break;
		default:
			if (init)
				pr_err("slab_debug option '%c' unknown. skipped\n", *str);
		}
	}
check_slabs:
	/* A ',' right after the options introduces the slab list. */
	if (*str == ',')
		*slabs = ++str;
	else
		*slabs = NULL;

	/* Skip over the slab list */
	while (*str && *str != ';')
		str++;

	/* Skip any completely empty blocks */
	while (*str && *str == ';')
		str++;

	if (init && higher_order_disable)
		disable_higher_order_debug = 1;

	if (*str)
		return str;
	else
		return NULL;
}
 1859
/*
 * Handler for the slab_debug / slub_debug boot parameters.
 *
 * Parses every ';'-separated block of @str. Blocks without a slab list set
 * the global debug flags; if any block has a slab list, the whole option
 * string is remembered in slub_debug_string so that kmem_cache_flags() can
 * match cache names against it later. An empty or missing @str switches on
 * DEBUG_DEFAULT_FLAGS globally.
 *
 * Also requests early stack depot initialization whenever SLAB_STORE_USER
 * is selected, and enables or disables the slub_debug_enabled static branch
 * according to the result.
 *
 * Always returns 0.
 */
static int __init setup_slub_debug(const char *str, const struct kernel_param *kp)
{
	slab_flags_t flags;
	slab_flags_t global_flags;
	const char *saved_str;
	const char *slab_list;
	bool global_slub_debug_changed = false;
	bool slab_list_specified = false;

	global_flags = DEBUG_DEFAULT_FLAGS;
	if (!str || !*str)
		/*
		 * No options specified. Switch on full debugging.
		 */
		goto out;

	saved_str = str;
	while (str) {
		str = parse_slub_debug_flags(str, &flags, &slab_list, true);

		if (!slab_list) {
			/* Block applies globally; a later such block overrides it. */
			global_flags = flags;
			global_slub_debug_changed = true;
		} else {
			slab_list_specified = true;
			if (flags & SLAB_STORE_USER)
				stack_depot_request_early_init();
		}
	}

	/*
	 * For backwards compatibility, a single list of flags with list of
	 * slabs means debugging is only changed for those slabs, so the global
	 * slab_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
	 * on CONFIG_SLUB_DEBUG_ON). We can extend that to multiple lists as
	 * long as there is no option specifying flags without a slab list.
	 */
	if (slab_list_specified) {
		if (!global_slub_debug_changed)
			global_flags = slub_debug;
		slub_debug_string = saved_str;
	}
out:
	slub_debug = global_flags;
	if (slub_debug & SLAB_STORE_USER)
		stack_depot_request_early_init();
	if (slub_debug != 0 || slub_debug_string)
		static_branch_enable(&slub_debug_enabled);
	else
		static_branch_disable(&slub_debug_enabled);
	if ((static_branch_unlikely(&init_on_alloc) ||
	     static_branch_unlikely(&init_on_free)) &&
	    (slub_debug & SLAB_POISON))
		pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
	return 0;
}
 1916
/*
 * Both the "slab_debug" and the "slub_debug" parameter names are routed to
 * setup_slub_debug().
 * NOTE(review): KERNEL_PARAM_OPS_FL_NOARG presumably permits giving the
 * parameter without a value, which setup_slub_debug() handles as "no
 * options specified" - confirm against the moduleparam documentation.
 */
static const struct kernel_param_ops param_ops_slab_debug __initconst = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,
	.set = setup_slub_debug,
};
__core_param_cb(slab_debug, &param_ops_slab_debug, NULL, 0);
__core_param_cb(slub_debug, &param_ops_slab_debug, NULL, 0);
 1923
/*
 * kmem_cache_flags - apply debugging options to the cache
 * @flags:		flags to set
 * @name:		name of the cache
 *
 * Debug option(s) are applied to @flags. In addition to the debug
 * option(s), if a slab name (or multiple) is specified i.e.
 * slab_debug=<Debug-Options>,<slab name1>,<slab name2> ...
 * then only the select slabs will receive the debug option(s).
 *
 * Return: @flags unchanged if SLAB_NO_USER_FLAGS is set; otherwise @flags
 * combined with the options of the first block whose slab list matches
 * @name, or with the global debug flags when no block matches.
 */
slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
{
	const char *iter;
	size_t len;
	const char *next_block;
	slab_flags_t block_flags;
	slab_flags_t slub_debug_local = slub_debug;

	if (flags & SLAB_NO_USER_FLAGS)
		return flags;

	/*
	 * If the slab cache is for debugging (e.g. kmemleak) then
	 * don't store user (stack trace) information by default,
	 * but let the user enable it via the command line below.
	 */
	if (flags & SLAB_NOLEAKTRACE)
		slub_debug_local &= ~SLAB_STORE_USER;

	len = strlen(name);
	next_block = slub_debug_string;
	/* Go through all blocks of debug options, see if any matches our slab's name */
	while (next_block) {
		next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
		if (!iter)
			continue;
		/* Found a block that has a slab list, search it */
		while (*iter) {
			const char *end, *glob;
			size_t cmplen;

			/* The current entry ends at the next ',' or at the end of the string... */
			end = strchrnul(iter, ',');
			/* ...but must not reach into the following block. */
			if (next_block && next_block < end)
				end = next_block - 1;

			/*
			 * With a '*' in the entry only the part before it is
			 * compared. Otherwise compare over the longer of the
			 * two lengths, so that the name and the entry have to
			 * match in full.
			 */
			glob = strnchr(iter, end - iter, '*');
			if (glob)
				cmplen = glob - iter;
			else
				cmplen = max_t(size_t, len, (end - iter));

			if (!strncmp(name, iter, cmplen)) {
				flags |= block_flags;
				return flags;
			}

			if (!*end || *end == ';')
				break;
			iter = end + 1;
		}
	}

	return flags | slub_debug_local;
}
 1988#else /* !CONFIG_SLUB_DEBUG */
 1989static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
 1990static inline
 1991void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
 1992
 1993static inline bool alloc_debug_processing(struct kmem_cache *s,
 1994	struct slab *slab, void *object, int orig_size) { return true; }
 1995
 1996static inline bool free_debug_processing(struct kmem_cache *s,
 1997	struct slab *slab, void *head, void *tail, int *bulk_cnt,
 1998	unsigned long addr, depot_stack_handle_t handle) { return true; }
 1999
 2000static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
 2001static inline int check_object(struct kmem_cache *s, struct slab *slab,
 2002			void *object, u8 val) { return 1; }
 2003static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; }
 2004static inline void set_track(struct kmem_cache *s, void *object,
 2005			     enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {}
 2006static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
 2007					struct slab *slab) {}
 2008static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
 2009					struct slab *slab) {}
/*
 * !CONFIG_SLUB_DEBUG variant: no debug options exist, so the flags passed in
 * are returned unchanged and @name is not looked at.
 */
slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
{
	return flags;
}
/* With CONFIG_SLUB_DEBUG not set, slub_debug is the constant 0. */
#define slub_debug 0

/* Likewise the constant 0 when CONFIG_SLUB_DEBUG is not set. */
#define disable_higher_order_debug 0
 2017
 2018static inline unsigned long node_nr_slabs(struct kmem_cache_node *n)
 2019							{ return 0; }
 2020static inline void inc_slabs_node(struct kmem_cache *s, int node,
 2021							int objects) {}
 2022static inline void dec_slabs_node(struct kmem_cache *s, int node,
 2023							int objects) {}
/* Stub for !CONFIG_SLUB_DEBUG: performs no check and always returns false. */
static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
			       void **freelist, void *nextfree)
{
	return false;
}
 2029#endif /* CONFIG_SLUB_DEBUG */
 2030
/*
 * The allocated objcg pointers array is not accounted directly.
 * Moreover, it should not come from DMA buffer and is not readily
 * reclaimable. So those GFP bits should be masked off.
 *
 * The mask also contains __GFP_NOFAIL. Users apply it as
 * "gfp &= ~OBJCGS_CLEAR_MASK" before doing their own allocation.
 */
#define OBJCGS_CLEAR_MASK	(__GFP_DMA | __GFP_RECLAIMABLE | \
				__GFP_ACCOUNT | __GFP_NOFAIL)
 2038
 2039#ifdef CONFIG_SLAB_OBJ_EXT
 2040
 2041#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
 2042
 2043static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
 2044{
 2045	struct slabobj_ext *slab_exts;
 2046	struct slab *obj_exts_slab;
 2047
 2048	obj_exts_slab = virt_to_slab(obj_exts);
 2049	slab_exts = slab_obj_exts(obj_exts_slab);
 2050	if (slab_exts) {
 2051		unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
 2052						 obj_exts_slab, obj_exts);
 2053
 2054		if (unlikely(is_codetag_empty(&slab_exts[offs].ref)))
 2055			return;
 2056
 2057		/* codetag should be NULL here */
 2058		WARN_ON(slab_exts[offs].ref.ct);
 2059		set_codetag_empty(&slab_exts[offs].ref);
 2060	}
 2061}
 2062
 2063static inline bool mark_failed_objexts_alloc(struct slab *slab)
 2064{
 2065	return cmpxchg(&slab->obj_exts, 0, OBJEXTS_ALLOC_FAIL) == 0;
 2066}
 2067
 2068static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
 2069			struct slabobj_ext *vec, unsigned int objects)
 2070{
 2071	/*
 2072	 * If vector previously failed to allocate then we have live
 2073	 * objects with no tag reference. Mark all references in this
 2074	 * vector as empty to avoid warnings later on.
 2075	 */
 2076	if (obj_exts == OBJEXTS_ALLOC_FAIL) {
 2077		unsigned int i;
 2078
 2079		for (i = 0; i < objects; i++)
 2080			set_codetag_empty(&vec[i].ref);
 2081	}
 2082}
 2083
 2084#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
 2085
 2086static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {}
 2087static inline bool mark_failed_objexts_alloc(struct slab *slab) { return false; }
 2088static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
 2089			struct slabobj_ext *vec, unsigned int objects) {}
 2090
 2091#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
 2092
/* Reset slab->obj_exts to 0, i.e. no extension vector and no flag bits. */
static inline void init_slab_obj_exts(struct slab *slab)
{
	slab->obj_exts = 0;
}
 2097
 2098int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
 2099		        gfp_t gfp, bool new_slab)
 2100{
 2101	bool allow_spin = gfpflags_allow_spinning(gfp);
 2102	unsigned int objects = objs_per_slab(s, slab);
 2103	unsigned long new_exts;
 2104	unsigned long old_exts;
 2105	struct slabobj_ext *vec;
 2106
 2107	gfp &= ~OBJCGS_CLEAR_MASK;
 2108	/* Prevent recursive extension vector allocation */
 2109	gfp |= __GFP_NO_OBJ_EXT;
 2110
 2111	/*
 2112	 * Note that allow_spin may be false during early boot and its
 2113	 * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting
 2114	 * architectures with cmpxchg16b, early obj_exts will be missing for
 2115	 * very early allocations on those.
 2116	 */
 2117	if (unlikely(!allow_spin)) {
 2118		size_t sz = objects * sizeof(struct slabobj_ext);
 2119
 2120		vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT,
 2121				     slab_nid(slab));
 2122	} else {
 2123		vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
 2124				   slab_nid(slab));
 2125	}
 2126	if (!vec) {
 2127		/*
 2128		 * Try to mark vectors which failed to allocate.
 2129		 * If this operation fails, there may be a racing process
 2130		 * that has already completed the allocation.
 2131		 */
 2132		if (!mark_failed_objexts_alloc(slab) &&
 2133		    slab_obj_exts(slab))
 2134			return 0;
 2135
 2136		return -ENOMEM;
 2137	}
 2138
 2139	new_exts = (unsigned long)vec;
 2140	if (unlikely(!allow_spin))
 2141		new_exts |= OBJEXTS_NOSPIN_ALLOC;
 2142#ifdef CONFIG_MEMCG
 2143	new_exts |= MEMCG_DATA_OBJEXTS;
 2144#endif
 2145retry:
 2146	old_exts = READ_ONCE(slab->obj_exts);
 2147	handle_failed_objexts_alloc(old_exts, vec, objects);
 2148	if (new_slab) {
 2149		/*
 2150		 * If the slab is brand new and nobody can yet access its
 2151		 * obj_exts, no synchronization is required and obj_exts can
 2152		 * be simply assigned.
 2153		 */
 2154		slab->obj_exts = new_exts;
 2155	} else if (old_exts & ~OBJEXTS_FLAGS_MASK) {
 2156		/*
 2157		 * If the slab is already in use, somebody can allocate and
 2158		 * assign slabobj_exts in parallel. In this case the existing
 2159		 * objcg vector should be reused.
 2160		 */
 2161		mark_objexts_empty(vec);
 2162		if (unlikely(!allow_spin))
 2163			kfree_nolock(vec);
 2164		else
 2165			kfree(vec);
 2166		return 0;
 2167	} else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) {
 2168		/* Retry if a racing thread changed slab->obj_exts from under us. */
 2169		goto retry;
 2170	}
 2171
 2172	if (allow_spin)
 2173		kmemleak_not_leak(vec);
 2174	return 0;
 2175}
 2176
 2177static inline void free_slab_obj_exts(struct slab *slab)
 2178{
 2179	struct slabobj_ext *obj_exts;
 2180
 2181	obj_exts = slab_obj_exts(slab);
 2182	if (!obj_exts) {
 2183		/*
 2184		 * If obj_exts allocation failed, slab->obj_exts is set to
 2185		 * OBJEXTS_ALLOC_FAIL. In this case, we end up here and should
 2186		 * clear the flag.
 2187		 */
 2188		slab->obj_exts = 0;
 2189		return;
 2190	}
 2191
 2192	/*
 2193	 * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its
 2194	 * corresponding extension will be NULL. alloc_tag_sub() will throw a
 2195	 * warning if slab has extensions but the extension of an object is
 2196	 * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that
 2197	 * the extension for obj_exts is expected to be NULL.
 2198	 */
 2199	mark_objexts_empty(obj_exts);
 2200	if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC))
 2201		kfree_nolock(obj_exts);
 2202	else
 2203		kfree(obj_exts);
 2204	slab->obj_exts = 0;
 2205}
 2206
 2207#else /* CONFIG_SLAB_OBJ_EXT */
 2208
 2209static inline void init_slab_obj_exts(struct slab *slab)
 2210{
 2211}
 2212
 2213static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
 2214			       gfp_t gfp, bool new_slab)
 2215{
 2216	return 0;
 2217}
 2218
 2219static inline void free_slab_obj_exts(struct slab *slab)
 2220{
 2221}
 2222
 2223#endif /* CONFIG_SLAB_OBJ_EXT */
 2224
 2225#ifdef CONFIG_MEM_ALLOC_PROFILING
 2226
 2227static inline struct slabobj_ext *
 2228prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
 2229{
 2230	struct slab *slab;
 2231
 2232	slab = virt_to_slab(p);
 2233	if (!slab_obj_exts(slab) &&
 2234	    alloc_slab_obj_exts(slab, s, flags, false)) {
 2235		pr_warn_once("%s, %s: Failed to create slab extension vector!\n",
 2236			     __func__, s->name);
 2237		return NULL;
 2238	}
 2239
 2240	return slab_obj_exts(slab) + obj_to_index(s, slab, p);
 2241}
 2242
 2243/* Should be called only if mem_alloc_profiling_enabled() */
 2244static noinline void
 2245__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
 2246{
 2247	struct slabobj_ext *obj_exts;
 2248
 2249	if (!object)
 2250		return;
 2251
 2252	if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
 2253		return;
 2254
 2255	if (flags & __GFP_NO_OBJ_EXT)
 2256		return;
 2257
 2258	obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
 2259	/*
 2260	 * Currently obj_exts is used only for allocation profiling.
 2261	 * If other users appear then mem_alloc_profiling_enabled()
 2262	 * check should be added before alloc_tag_add().
 2263	 */
 2264	if (likely(obj_exts))
 2265		alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
 2266	else
 2267		alloc_tag_set_inaccurate(current->alloc_tag);
 2268}
 2269
 2270static inline void
 2271alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
 2272{
 2273	if (mem_alloc_profiling_enabled())
 2274		__alloc_tagging_slab_alloc_hook(s, object, flags);
 2275}
 2276
 2277/* Should be called only if mem_alloc_profiling_enabled() */
 2278static noinline void
 2279__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
 2280			       int objects)
 2281{
 2282	struct slabobj_ext *obj_exts;
 2283	int i;
 2284
 2285	/* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
 2286	if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
 2287		return;
 2288
 2289	obj_exts = slab_obj_exts(slab);
 2290	if (!obj_exts)
 2291		return;
 2292
 2293	for (i = 0; i < objects; i++) {
 2294		unsigned int off = obj_to_index(s, slab, p[i]);
 2295
 2296		alloc_tag_sub(&obj_exts[off].ref, s->size);
 2297	}
 2298}
 2299
 2300static inline void
 2301alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
 2302			     int objects)
 2303{
 2304	if (mem_alloc_profiling_enabled())
 2305		__alloc_tagging_slab_free_hook(s, slab, p, objects);
 2306}
 2307
 2308#else /* CONFIG_MEM_ALLOC_PROFILING */
 2309
 2310static inline void
 2311alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
 2312{
 2313}
 2314
 2315static inline void
 2316alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
 2317			     int objects)
 2318{
 2319}
 2320
 2321#endif /* CONFIG_MEM_ALLOC_PROFILING */
 2322
 2323
 2324#ifdef CONFIG_MEMCG
 2325
 2326static void memcg_alloc_abort_single(struct kmem_cache *s, void *object);
 2327
 2328static __fastpath_inline
 2329bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
 2330				gfp_t flags, size_t size, void **p)
 2331{
 2332	if (likely(!memcg_kmem_online()))
 2333		return true;
 2334
 2335	if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)))
 2336		return true;
 2337
 2338	if (likely(__memcg_slab_post_alloc_hook(s, lru, flags, size, p)))
 2339		return true;
 2340
 2341	if (likely(size == 1)) {
 2342		memcg_alloc_abort_single(s, *p);
 2343		*p = NULL;
 2344	} else {
 2345		kmem_cache_free_bulk(s, size, p);
 2346	}
 2347
 2348	return false;
 2349}
 2350
 2351static __fastpath_inline
 2352void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
 2353			  int objects)
 2354{
 2355	struct slabobj_ext *obj_exts;
 2356
 2357	if (!memcg_kmem_online())
 2358		return;
 2359
 2360	obj_exts = slab_obj_exts(slab);
 2361	if (likely(!obj_exts))
 2362		return;
 2363
 2364	__memcg_slab_free_hook(s, slab, p, objects, obj_exts);
 2365}
 2366
 2367static __fastpath_inline
 2368bool memcg_slab_post_charge(void *p, gfp_t flags)
 2369{
 2370	struct slabobj_ext *slab_exts;
 2371	struct kmem_cache *s;
 2372	struct page *page;
 2373	struct slab *slab;
 2374	unsigned long off;
 2375
 2376	page = virt_to_page(p);
 2377	if (PageLargeKmalloc(page)) {
 2378		unsigned int order;
 2379		int size;
 2380
 2381		if (PageMemcgKmem(page))
 2382			return true;
 2383
 2384		order = large_kmalloc_order(page);
 2385		if (__memcg_kmem_charge_page(page, flags, order))
 2386			return false;
 2387
 2388		/*
 2389		 * This page has already been accounted in the global stats but
 2390		 * not in the memcg stats. So, subtract from the global and use
 2391		 * the interface which adds to both global and memcg stats.
 2392		 */
 2393		size = PAGE_SIZE << order;
 2394		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, -size);
 2395		mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, size);
 2396		return true;
 2397	}
 2398
 2399	slab = page_slab(page);
 2400	s = slab->slab_cache;
 2401
 2402	/*
 2403	 * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency
 2404	 * of slab_obj_exts being allocated from the same slab and thus the slab
 2405	 * becoming effectively unfreeable.
 2406	 */
 2407	if (is_kmalloc_normal(s))
 2408		return true;
 2409
 2410	/* Ignore already charged objects. */
 2411	slab_exts = slab_obj_exts(slab);
 2412	if (slab_exts) {
 2413		off = obj_to_index(s, slab, p);
 2414		if (unlikely(slab_exts[off].objcg))
 2415			return true;
 2416	}
 2417
 2418	return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p);
 2419}
 2420
 2421#else /* CONFIG_MEMCG */
 2422static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s,
 2423					      struct list_lru *lru,
 2424					      gfp_t flags, size_t size,
 2425					      void **p)
 2426{
 2427	return true;
 2428}
 2429
 2430static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
 2431					void **p, int objects)
 2432{
 2433}
 2434
 2435static inline bool memcg_slab_post_charge(void *p, gfp_t flags)
 2436{
 2437	return true;
 2438}
 2439#endif /* CONFIG_MEMCG */
 2440
 2441#ifdef CONFIG_SLUB_RCU_DEBUG
/* Forward declaration; slab_free_hook() passes this callback to call_rcu(). */
static void slab_free_after_rcu_debug(struct rcu_head *rcu_head);

/*
 * Bookkeeping for a free that slab_free_hook() postpones through call_rcu()
 * when CONFIG_SLUB_RCU_DEBUG is enabled.
 */
struct rcu_delayed_free {
	struct rcu_head head;	/* handed to call_rcu() */
	void *object;		/* the object whose free is postponed */
};
 2448#endif
 2449
 2450/*
 2451 * Hooks for other subsystems that check memory allocations. In a typical
 2452 * production configuration these hooks all should produce no code at all.
 2453 *
 2454 * Returns true if freeing of the object can proceed, false if its reuse
 2455 * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned
 2456 * to KFENCE.
 2457 */
 2458static __always_inline
 2459bool slab_free_hook(struct kmem_cache *s, void *x, bool init,
 2460		    bool after_rcu_delay)
 2461{
 2462	/* Are the object contents still accessible? */
 2463	bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay;
 2464
 2465	kmemleak_free_recursive(x, s->flags);
 2466	kmsan_slab_free(s, x);
 2467
 2468	debug_check_no_locks_freed(x, s->object_size);
 2469
 2470	if (!(s->flags & SLAB_DEBUG_OBJECTS))
 2471		debug_check_no_obj_freed(x, s->object_size);
 2472
 2473	/* Use KCSAN to help debug racy use-after-free. */
 2474	if (!still_accessible)
 2475		__kcsan_check_access(x, s->object_size,
 2476				     KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
 2477
 2478	if (kfence_free(x))
 2479		return false;
 2480
 2481	/*
 2482	 * Give KASAN a chance to notice an invalid free operation before we
 2483	 * modify the object.
 2484	 */
 2485	if (kasan_slab_pre_free(s, x))
 2486		return false;
 2487
 2488#ifdef CONFIG_SLUB_RCU_DEBUG
 2489	if (still_accessible) {
 2490		struct rcu_delayed_free *delayed_free;
 2491
 2492		delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT);
 2493		if (delayed_free) {
 2494			/*
 2495			 * Let KASAN track our call stack as a "related work
 2496			 * creation", just like if the object had been freed
 2497			 * normally via kfree_rcu().
 2498			 * We have to do this manually because the rcu_head is
 2499			 * not located inside the object.
 2500			 */
 2501			kasan_record_aux_stack(x);
 2502
 2503			delayed_free->object = x;
 2504			call_rcu(&delayed_free->head, slab_free_after_rcu_debug);
 2505			return false;
 2506		}
 2507	}
 2508#endif /* CONFIG_SLUB_RCU_DEBUG */
 2509
 2510	/*
 2511	 * As memory initialization might be integrated into KASAN,
 2512	 * kasan_slab_free and initialization memset's must be
 2513	 * kept together to avoid discrepancies in behavior.
 2514	 *
 2515	 * The initialization memset's clear the object and the metadata,
 2516	 * but don't touch the SLAB redzone.
 2517	 *
 2518	 * The object's freepointer is also avoided if stored outside the
 2519	 * object.
 2520	 */
 2521	if (unlikely(init)) {
 2522		int rsize;
 2523		unsigned int inuse, orig_size;
 2524
 2525		inuse = get_info_end(s);
 2526		orig_size = get_orig_size(s, x);
 2527		if (!kasan_has_integrated_init())
 2528			memset(kasan_reset_tag(x), 0, orig_size);
 2529		rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0;
 2530		memset((char *)kasan_reset_tag(x) + inuse, 0,
 2531		       s->size - inuse - rsize);
 2532		/*
 2533		 * Restore orig_size, otherwise kmalloc redzone overwritten
 2534		 * would be reported
 2535		 */
 2536		set_orig_size(s, x, orig_size);
 2537
 2538	}
 2539	/* KASAN might put x into memory quarantine, delaying its reuse. */
 2540	return !kasan_slab_free(s, x, init, still_accessible, false);
 2541}
 2542
 2543static __fastpath_inline
 2544bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail,
 2545			     int *cnt)
 2546{
 2547
 2548	void *object;
 2549	void *next = *head;
 2550	void *old_tail = *tail;
 2551	bool init;
 2552
 2553	if (is_kfence_address(next)) {
 2554		slab_free_hook(s, next, false, false);
 2555		return false;
 2556	}
 2557
 2558	/* Head and tail of the reconstructed freelist */
 2559	*head = NULL;
 2560	*tail = NULL;
 2561
 2562	init = slab_want_init_on_free(s);
 2563
 2564	do {
 2565		object = next;
 2566		next = get_freepointer(s, object);
 2567
 2568		/* If object's reuse doesn't have to be delayed */
 2569		if (likely(slab_free_hook(s, object, init, false))) {
 2570			/* Move object to the new freelist */
 2571			set_freepointer(s, object, *head);
 2572			*head = object;
 2573			if (!*tail)
 2574				*tail = object;
 2575		} else {
 2576			/*
 2577			 * Adjust the reconstructed freelist depth
 2578			 * accordingly if object's reuse is delayed.
 2579			 */
 2580			--(*cnt);
 2581		}
 2582	} while (object != old_tail);
 2583
 2584	return *head != NULL;
 2585}
 2586
 2587static void *setup_object(struct kmem_cache *s, void *object)
 2588{
 2589	setup_object_debug(s, object);
 2590	object = kasan_init_slab_obj(s, object);
 2591	if (unlikely(s->ctor)) {
 2592		kasan_unpoison_new_object(s, object);
 2593		s->ctor(object);
 2594		kasan_poison_new_object(s, object);
 2595	}
 2596	return object;
 2597}
 2598
 2599static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
 2600{
 2601	struct slab_sheaf *sheaf;
 2602	size_t sheaf_size;
 2603
 2604	if (gfp & __GFP_NO_OBJ_EXT)
 2605		return NULL;
 2606
 2607	gfp &= ~OBJCGS_CLEAR_MASK;
 2608
 2609	/*
 2610	 * Prevent recursion to the same cache, or a deep stack of kmallocs of
 2611	 * varying sizes (sheaf capacity might differ for each kmalloc size
 2612	 * bucket)
 2613	 */
 2614	if (s->flags & SLAB_KMALLOC)
 2615		gfp |= __GFP_NO_OBJ_EXT;
 2616
 2617	sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity);
 2618	sheaf = kzalloc(sheaf_size, gfp);
 2619
 2620	if (unlikely(!sheaf))
 2621		return NULL;
 2622
 2623	sheaf->cache = s;
 2624
 2625	stat(s, SHEAF_ALLOC);
 2626
 2627	return sheaf;
 2628}
 2629
/*
 * Free the sheaf structure itself and count the event as SHEAF_FREE.
 * Objects stored in the sheaf are not touched here.
 */
static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
{
	kfree(sheaf);

	stat(s, SHEAF_FREE);
}
 2636
 2637static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
 2638				   size_t size, void **p);
 2639
 2640
 2641static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
 2642			 gfp_t gfp)
 2643{
 2644	int to_fill = s->sheaf_capacity - sheaf->size;
 2645	int filled;
 2646
 2647	if (!to_fill)
 2648		return 0;
 2649
 2650	filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
 2651					 &sheaf->objects[sheaf->size]);
 2652
 2653	sheaf->size += filled;
 2654
 2655	stat_add(s, SHEAF_REFILL, filled);
 2656
 2657	if (filled < to_fill)
 2658		return -ENOMEM;
 2659
 2660	return 0;
 2661}
 2662
 2663
 2664static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp)
 2665{
 2666	struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp);
 2667
 2668	if (!sheaf)
 2669		return NULL;
 2670
 2671	if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC)) {
 2672		free_empty_sheaf(s, sheaf);
 2673		return NULL;
 2674	}
 2675
 2676	return sheaf;
 2677}
 2678
/*
 * Maximum number of objects freed during a single flush of main pcs sheaf.
 * Translates directly to an on-stack array size (an array of that many
 * pointers in sheaf_flush_main()).
 */
#define PCS_BATCH_MAX	32U
 2684
 2685static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
 2686
 2687/*
 2688 * Free all objects from the main sheaf. In order to perform
 2689 * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where
 2690 * object pointers are moved to a on-stack array under the lock. To bound the
 2691 * stack usage, limit each batch to PCS_BATCH_MAX.
 2692 *
 2693 * returns true if at least partially flushed
 2694 */
 2695static bool sheaf_flush_main(struct kmem_cache *s)
 2696{
 2697	struct slub_percpu_sheaves *pcs;
 2698	unsigned int batch, remaining;
 2699	void *objects[PCS_BATCH_MAX];
 2700	struct slab_sheaf *sheaf;
 2701	bool ret = false;
 2702
 2703next_batch:
 2704	if (!local_trylock(&s->cpu_sheaves->lock))
 2705		return ret;
 2706
 2707	pcs = this_cpu_ptr(s->cpu_sheaves);
 2708	sheaf = pcs->main;
 2709
 2710	batch = min(PCS_BATCH_MAX, sheaf->size);
 2711
 2712	sheaf->size -= batch;
 2713	memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *));
 2714
 2715	remaining = sheaf->size;
 2716
 2717	local_unlock(&s->cpu_sheaves->lock);
 2718
 2719	__kmem_cache_free_bulk(s, batch, &objects[0]);
 2720
 2721	stat_add(s, SHEAF_FLUSH, batch);
 2722
 2723	ret = true;
 2724
 2725	if (remaining)
 2726		goto next_batch;
 2727
 2728	return ret;
 2729}
 2730
 2731/*
 2732 * Free all objects from a sheaf that's unused, i.e. not linked to any
 2733 * cpu_sheaves, so we need no locking and batching. The locking is also not
 2734 * necessary when flushing cpu's sheaves (both spare and main) during cpu
 2735 * hotremove as the cpu is not executing anymore.
 2736 */
 2737static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf)
 2738{
 2739	if (!sheaf->size)
 2740		return;
 2741
 2742	stat_add(s, SHEAF_FLUSH, sheaf->size);
 2743
 2744	__kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]);
 2745
 2746	sheaf->size = 0;
 2747}
 2748
 2749static bool __rcu_free_sheaf_prepare(struct kmem_cache *s,
 2750				     struct slab_sheaf *sheaf)
 2751{
 2752	bool init = slab_want_init_on_free(s);
 2753	void **p = &sheaf->objects[0];
 2754	unsigned int i = 0;
 2755	bool pfmemalloc = false;
 2756
 2757	while (i < sheaf->size) {
 2758		struct slab *slab = virt_to_slab(p[i]);
 2759
 2760		memcg_slab_free_hook(s, slab, p + i, 1);
 2761		alloc_tagging_slab_free_hook(s, slab, p + i, 1);
 2762
 2763		if (unlikely(!slab_free_hook(s, p[i], init, true))) {
 2764			p[i] = p[--sheaf->size];
 2765			continue;
 2766		}
 2767
 2768		if (slab_test_pfmemalloc(slab))
 2769			pfmemalloc = true;
 2770
 2771		i++;
 2772	}
 2773
 2774	return pfmemalloc;
 2775}
 2776
 2777static void rcu_free_sheaf_nobarn(struct rcu_head *head)
 2778{
 2779	struct slab_sheaf *sheaf;
 2780	struct kmem_cache *s;
 2781
 2782	sheaf = container_of(head, struct slab_sheaf, rcu_head);
 2783	s = sheaf->cache;
 2784
 2785	__rcu_free_sheaf_prepare(s, sheaf);
 2786
 2787	sheaf_flush_unused(s, sheaf);
 2788
 2789	free_empty_sheaf(s, sheaf);
 2790}
 2791
 2792/*
 2793 * Caller needs to make sure migration is disabled in order to fully flush
 2794 * single cpu's sheaves
 2795 *
 2796 * must not be called from an irq
 2797 *
 2798 * flushing operations are rare so let's keep it simple and flush to slabs
 2799 * directly, skipping the barn
 2800 */
 2801static void pcs_flush_all(struct kmem_cache *s)
 2802{
 2803	struct slub_percpu_sheaves *pcs;
 2804	struct slab_sheaf *spare, *rcu_free;
 2805
 2806	local_lock(&s->cpu_sheaves->lock);
 2807	pcs = this_cpu_ptr(s->cpu_sheaves);
 2808
 2809	spare = pcs->spare;
 2810	pcs->spare = NULL;
 2811
 2812	rcu_free = pcs->rcu_free;
 2813	pcs->rcu_free = NULL;
 2814
 2815	local_unlock(&s->cpu_sheaves->lock);
 2816
 2817	if (spare) {
 2818		sheaf_flush_unused(s, spare);
 2819		free_empty_sheaf(s, spare);
 2820	}
 2821
 2822	if (rcu_free)
 2823		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
 2824
 2825	sheaf_flush_main(s);
 2826}
 2827
 2828static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
 2829{
 2830	struct slub_percpu_sheaves *pcs;
 2831
 2832	pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 2833
 2834	/* The cpu is not executing anymore so we don't need pcs->lock */
 2835	sheaf_flush_unused(s, pcs->main);
 2836	if (pcs->spare) {
 2837		sheaf_flush_unused(s, pcs->spare);
 2838		free_empty_sheaf(s, pcs->spare);
 2839		pcs->spare = NULL;
 2840	}
 2841
 2842	if (pcs->rcu_free) {
 2843		call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn);
 2844		pcs->rcu_free = NULL;
 2845	}
 2846}
 2847
 2848static void pcs_destroy(struct kmem_cache *s)
 2849{
 2850	int cpu;
 2851
 2852	for_each_possible_cpu(cpu) {
 2853		struct slub_percpu_sheaves *pcs;
 2854
 2855		pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 2856
 2857		/* can happen when unwinding failed create */
 2858		if (!pcs->main)
 2859			continue;
 2860
 2861		/*
 2862		 * We have already passed __kmem_cache_shutdown() so everything
 2863		 * was flushed and there should be no objects allocated from
 2864		 * slabs, otherwise kmem_cache_destroy() would have aborted.
 2865		 * Therefore something would have to be really wrong if the
 2866		 * warnings here trigger, and we should rather leave objects and
 2867		 * sheaves to leak in that case.
 2868		 */
 2869
 2870		WARN_ON(pcs->spare);
 2871		WARN_ON(pcs->rcu_free);
 2872
 2873		if (!WARN_ON(pcs->main->size)) {
 2874			free_empty_sheaf(s, pcs->main);
 2875			pcs->main = NULL;
 2876		}
 2877	}
 2878
 2879	free_percpu(s->cpu_sheaves);
 2880	s->cpu_sheaves = NULL;
 2881}
 2882
 2883static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
 2884{
 2885	struct slab_sheaf *empty = NULL;
 2886	unsigned long flags;
 2887
 2888	if (!data_race(barn->nr_empty))
 2889		return NULL;
 2890
 2891	spin_lock_irqsave(&barn->lock, flags);
 2892
 2893	if (likely(barn->nr_empty)) {
 2894		empty = list_first_entry(&barn->sheaves_empty,
 2895					 struct slab_sheaf, barn_list);
 2896		list_del(&empty->barn_list);
 2897		barn->nr_empty--;
 2898	}
 2899
 2900	spin_unlock_irqrestore(&barn->lock, flags);
 2901
 2902	return empty;
 2903}
 2904
 2905/*
 2906 * The following two functions are used mainly in cases where we have to undo an
 2907 * intended action due to a race or cpu migration. Thus they do not check the
 2908 * empty or full sheaf limits for simplicity.
 2909 */
 2910
 2911static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
 2912{
 2913	unsigned long flags;
 2914
 2915	spin_lock_irqsave(&barn->lock, flags);
 2916
 2917	list_add(&sheaf->barn_list, &barn->sheaves_empty);
 2918	barn->nr_empty++;
 2919
 2920	spin_unlock_irqrestore(&barn->lock, flags);
 2921}
 2922
 2923static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf)
 2924{
 2925	unsigned long flags;
 2926
 2927	spin_lock_irqsave(&barn->lock, flags);
 2928
 2929	list_add(&sheaf->barn_list, &barn->sheaves_full);
 2930	barn->nr_full++;
 2931
 2932	spin_unlock_irqrestore(&barn->lock, flags);
 2933}
 2934
 2935static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn)
 2936{
 2937	struct slab_sheaf *sheaf = NULL;
 2938	unsigned long flags;
 2939
 2940	if (!data_race(barn->nr_full) && !data_race(barn->nr_empty))
 2941		return NULL;
 2942
 2943	spin_lock_irqsave(&barn->lock, flags);
 2944
 2945	if (barn->nr_full) {
 2946		sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
 2947					barn_list);
 2948		list_del(&sheaf->barn_list);
 2949		barn->nr_full--;
 2950	} else if (barn->nr_empty) {
 2951		sheaf = list_first_entry(&barn->sheaves_empty,
 2952					 struct slab_sheaf, barn_list);
 2953		list_del(&sheaf->barn_list);
 2954		barn->nr_empty--;
 2955	}
 2956
 2957	spin_unlock_irqrestore(&barn->lock, flags);
 2958
 2959	return sheaf;
 2960}
 2961
 2962/*
 2963 * If a full sheaf is available, return it and put the supplied empty one to
 2964 * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't
 2965 * change.
 2966 */
 2967static struct slab_sheaf *
 2968barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
 2969{
 2970	struct slab_sheaf *full = NULL;
 2971	unsigned long flags;
 2972
 2973	if (!data_race(barn->nr_full))
 2974		return NULL;
 2975
 2976	spin_lock_irqsave(&barn->lock, flags);
 2977
 2978	if (likely(barn->nr_full)) {
 2979		full = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
 2980					barn_list);
 2981		list_del(&full->barn_list);
 2982		list_add(&empty->barn_list, &barn->sheaves_empty);
 2983		barn->nr_full--;
 2984		barn->nr_empty++;
 2985	}
 2986
 2987	spin_unlock_irqrestore(&barn->lock, flags);
 2988
 2989	return full;
 2990}
 2991
 2992/*
 2993 * If an empty sheaf is available, return it and put the supplied full one to
 2994 * barn. But if there are too many full sheaves, reject this with -E2BIG.
 2995 */
 2996static struct slab_sheaf *
 2997barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
 2998{
 2999	struct slab_sheaf *empty;
 3000	unsigned long flags;
 3001
 3002	/* we don't repeat this check under barn->lock as it's not critical */
 3003	if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES)
 3004		return ERR_PTR(-E2BIG);
 3005	if (!data_race(barn->nr_empty))
 3006		return ERR_PTR(-ENOMEM);
 3007
 3008	spin_lock_irqsave(&barn->lock, flags);
 3009
 3010	if (likely(barn->nr_empty)) {
 3011		empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf,
 3012					 barn_list);
 3013		list_del(&empty->barn_list);
 3014		list_add(&full->barn_list, &barn->sheaves_full);
 3015		barn->nr_empty--;
 3016		barn->nr_full++;
 3017	} else {
 3018		empty = ERR_PTR(-ENOMEM);
 3019	}
 3020
 3021	spin_unlock_irqrestore(&barn->lock, flags);
 3022
 3023	return empty;
 3024}
 3025
 3026static void barn_init(struct node_barn *barn)
 3027{
 3028	spin_lock_init(&barn->lock);
 3029	INIT_LIST_HEAD(&barn->sheaves_full);
 3030	INIT_LIST_HEAD(&barn->sheaves_empty);
 3031	barn->nr_full = 0;
 3032	barn->nr_empty = 0;
 3033}
 3034
 3035static void barn_shrink(struct kmem_cache *s, struct node_barn *barn)
 3036{
 3037	LIST_HEAD(empty_list);
 3038	LIST_HEAD(full_list);
 3039	struct slab_sheaf *sheaf, *sheaf2;
 3040	unsigned long flags;
 3041
 3042	spin_lock_irqsave(&barn->lock, flags);
 3043
 3044	list_splice_init(&barn->sheaves_full, &full_list);
 3045	barn->nr_full = 0;
 3046	list_splice_init(&barn->sheaves_empty, &empty_list);
 3047	barn->nr_empty = 0;
 3048
 3049	spin_unlock_irqrestore(&barn->lock, flags);
 3050
 3051	list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) {
 3052		sheaf_flush_unused(s, sheaf);
 3053		free_empty_sheaf(s, sheaf);
 3054	}
 3055
 3056	list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list)
 3057		free_empty_sheaf(s, sheaf);
 3058}
 3059
 3060/*
 3061 * Slab allocation and freeing
 3062 */
 3063static inline struct slab *alloc_slab_page(gfp_t flags, int node,
 3064					   struct kmem_cache_order_objects oo,
 3065					   bool allow_spin)
 3066{
 3067	struct page *page;
 3068	struct slab *slab;
 3069	unsigned int order = oo_order(oo);
 3070
 3071	if (unlikely(!allow_spin))
 3072		page = alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
 3073								  node, order);
 3074	else if (node == NUMA_NO_NODE)
 3075		page = alloc_frozen_pages(flags, order);
 3076	else
 3077		page = __alloc_frozen_pages(flags, order, node, NULL);
 3078
 3079	if (!page)
 3080		return NULL;
 3081
 3082	__SetPageSlab(page);
 3083	slab = page_slab(page);
 3084	if (page_is_pfmemalloc(page))
 3085		slab_set_pfmemalloc(slab);
 3086
 3087	return slab;
 3088}
 3089
 3090#ifdef CONFIG_SLAB_FREELIST_RANDOM
 3091/* Pre-initialize the random sequence cache */
 3092static int init_cache_random_seq(struct kmem_cache *s)
 3093{
 3094	unsigned int count = oo_objects(s->oo);
 3095	int err;
 3096
 3097	/* Bailout if already initialised */
 3098	if (s->random_seq)
 3099		return 0;
 3100
 3101	err = cache_random_seq_create(s, count, GFP_KERNEL);
 3102	if (err) {
 3103		pr_err("SLUB: Unable to initialize free list for %s\n",
 3104			s->name);
 3105		return err;
 3106	}
 3107
 3108	/* Transform to an offset on the set of pages */
 3109	if (s->random_seq) {
 3110		unsigned int i;
 3111
 3112		for (i = 0; i < count; i++)
 3113			s->random_seq[i] *= s->size;
 3114	}
 3115	return 0;
 3116}
 3117
 3118/* Initialize each random sequence freelist per cache */
 3119static void __init init_freelist_randomization(void)
 3120{
 3121	struct kmem_cache *s;
 3122
 3123	mutex_lock(&slab_mutex);
 3124
 3125	list_for_each_entry(s, &slab_caches, list)
 3126		init_cache_random_seq(s);
 3127
 3128	mutex_unlock(&slab_mutex);
 3129}
 3130
 3131/* Get the next entry on the pre-computed freelist randomized */
 3132static void *next_freelist_entry(struct kmem_cache *s,
 3133				unsigned long *pos, void *start,
 3134				unsigned long page_limit,
 3135				unsigned long freelist_count)
 3136{
 3137	unsigned int idx;
 3138
 3139	/*
 3140	 * If the target page allocation failed, the number of objects on the
 3141	 * page might be smaller than the usual size defined by the cache.
 3142	 */
 3143	do {
 3144		idx = s->random_seq[*pos];
 3145		*pos += 1;
 3146		if (*pos >= freelist_count)
 3147			*pos = 0;
 3148	} while (unlikely(idx >= page_limit));
 3149
 3150	return (char *)start + idx;
 3151}
 3152
 3153/* Shuffle the single linked freelist based on a random pre-computed sequence */
 3154static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
 3155{
 3156	void *start;
 3157	void *cur;
 3158	void *next;
 3159	unsigned long idx, pos, page_limit, freelist_count;
 3160
 3161	if (slab->objects < 2 || !s->random_seq)
 3162		return false;
 3163
 3164	freelist_count = oo_objects(s->oo);
 3165	pos = get_random_u32_below(freelist_count);
 3166
 3167	page_limit = slab->objects * s->size;
 3168	start = fixup_red_left(s, slab_address(slab));
 3169
 3170	/* First entry is used as the base of the freelist */
 3171	cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count);
 3172	cur = setup_object(s, cur);
 3173	slab->freelist = cur;
 3174
 3175	for (idx = 1; idx < slab->objects; idx++) {
 3176		next = next_freelist_entry(s, &pos, start, page_limit,
 3177			freelist_count);
 3178		next = setup_object(s, next);
 3179		set_freepointer(s, cur, next);
 3180		cur = next;
 3181	}
 3182	set_freepointer(s, cur, NULL);
 3183
 3184	return true;
 3185}
 3186#else
 3187static inline int init_cache_random_seq(struct kmem_cache *s)
 3188{
 3189	return 0;
 3190}
 3191static inline void init_freelist_randomization(void) { }
 3192static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
 3193{
 3194	return false;
 3195}
 3196#endif /* CONFIG_SLAB_FREELIST_RANDOM */
 3197
 3198static __always_inline void account_slab(struct slab *slab, int order,
 3199					 struct kmem_cache *s, gfp_t gfp)
 3200{
 3201	if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
 3202		alloc_slab_obj_exts(slab, s, gfp, true);
 3203
 3204	mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
 3205			    PAGE_SIZE << order);
 3206}
 3207
 3208static __always_inline void unaccount_slab(struct slab *slab, int order,
 3209					   struct kmem_cache *s)
 3210{
 3211	/*
 3212	 * The slab object extensions should now be freed regardless of
 3213	 * whether mem_alloc_profiling_enabled() or not because profiling
 3214	 * might have been disabled after slab->obj_exts got allocated.
 3215	 */
 3216	free_slab_obj_exts(slab);
 3217
 3218	mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
 3219			    -(PAGE_SIZE << order));
 3220}
 3221
 3222static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 3223{
 3224	bool allow_spin = gfpflags_allow_spinning(flags);
 3225	struct slab *slab;
 3226	struct kmem_cache_order_objects oo = s->oo;
 3227	gfp_t alloc_gfp;
 3228	void *start, *p, *next;
 3229	int idx;
 3230	bool shuffle;
 3231
 3232	flags &= gfp_allowed_mask;
 3233
 3234	flags |= s->allocflags;
 3235
 3236	/*
 3237	 * Let the initial higher-order allocation fail under memory pressure
 3238	 * so we fall-back to the minimum order allocation.
 3239	 */
 3240	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
 3241	if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
 3242		alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
 3243
 3244	/*
 3245	 * __GFP_RECLAIM could be cleared on the first allocation attempt,
 3246	 * so pass allow_spin flag directly.
 3247	 */
 3248	slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
 3249	if (unlikely(!slab)) {
 3250		oo = s->min;
 3251		alloc_gfp = flags;
 3252		/*
 3253		 * Allocation may have failed due to fragmentation.
 3254		 * Try a lower order alloc if possible
 3255		 */
 3256		slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin);
 3257		if (unlikely(!slab))
 3258			return NULL;
 3259		stat(s, ORDER_FALLBACK);
 3260	}
 3261
 3262	slab->objects = oo_objects(oo);
 3263	slab->inuse = 0;
 3264	slab->frozen = 0;
 3265	init_slab_obj_exts(slab);
 3266
 3267	account_slab(slab, oo_order(oo), s, flags);
 3268
 3269	slab->slab_cache = s;
 3270
 3271	kasan_poison_slab(slab);
 3272
 3273	start = slab_address(slab);
 3274
 3275	setup_slab_debug(s, slab, start);
 3276
 3277	shuffle = shuffle_freelist(s, slab);
 3278
 3279	if (!shuffle) {
 3280		start = fixup_red_left(s, start);
 3281		start = setup_object(s, start);
 3282		slab->freelist = start;
 3283		for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
 3284			next = p + s->size;
 3285			next = setup_object(s, next);
 3286			set_freepointer(s, p, next);
 3287			p = next;
 3288		}
 3289		set_freepointer(s, p, NULL);
 3290	}
 3291
 3292	return slab;
 3293}
 3294
 3295static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 3296{
 3297	if (unlikely(flags & GFP_SLAB_BUG_MASK))
 3298		flags = kmalloc_fix_flags(flags);
 3299
 3300	WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
 3301
 3302	return allocate_slab(s,
 3303		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
 3304}
 3305
 3306static void __free_slab(struct kmem_cache *s, struct slab *slab)
 3307{
 3308	struct page *page = slab_page(slab);
 3309	int order = compound_order(page);
 3310	int pages = 1 << order;
 3311
 3312	__slab_clear_pfmemalloc(slab);
 3313	page->mapping = NULL;
 3314	__ClearPageSlab(page);
 3315	mm_account_reclaimed_pages(pages);
 3316	unaccount_slab(slab, order, s);
 3317	free_frozen_pages(page, order);
 3318}
 3319
 3320static void rcu_free_slab(struct rcu_head *h)
 3321{
 3322	struct slab *slab = container_of(h, struct slab, rcu_head);
 3323
 3324	__free_slab(slab->slab_cache, slab);
 3325}
 3326
 3327static void free_slab(struct kmem_cache *s, struct slab *slab)
 3328{
 3329	if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
 3330		void *p;
 3331
 3332		slab_pad_check(s, slab);
 3333		for_each_object(p, s, slab_address(slab), slab->objects)
 3334			check_object(s, slab, p, SLUB_RED_INACTIVE);
 3335	}
 3336
 3337	if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU))
 3338		call_rcu(&slab->rcu_head, rcu_free_slab);
 3339	else
 3340		__free_slab(s, slab);
 3341}
 3342
 3343static void discard_slab(struct kmem_cache *s, struct slab *slab)
 3344{
 3345	dec_slabs_node(s, slab_nid(slab), slab->objects);
 3346	free_slab(s, slab);
 3347}
 3348
 3349static inline bool slab_test_node_partial(const struct slab *slab)
 3350{
 3351	return test_bit(SL_partial, &slab->flags.f);
 3352}
 3353
 3354static inline void slab_set_node_partial(struct slab *slab)
 3355{
 3356	set_bit(SL_partial, &slab->flags.f);
 3357}
 3358
 3359static inline void slab_clear_node_partial(struct slab *slab)
 3360{
 3361	clear_bit(SL_partial, &slab->flags.f);
 3362}
 3363
 3364/*
 3365 * Management of partially allocated slabs.
 3366 */
 3367static inline void
 3368__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
 3369{
 3370	n->nr_partial++;
 3371	if (tail == DEACTIVATE_TO_TAIL)
 3372		list_add_tail(&slab->slab_list, &n->partial);
 3373	else
 3374		list_add(&slab->slab_list, &n->partial);
 3375	slab_set_node_partial(slab);
 3376}
 3377
 3378static inline void add_partial(struct kmem_cache_node *n,
 3379				struct slab *slab, int tail)
 3380{
 3381	lockdep_assert_held(&n->list_lock);
 3382	__add_partial(n, slab, tail);
 3383}
 3384
 3385static inline void remove_partial(struct kmem_cache_node *n,
 3386					struct slab *slab)
 3387{
 3388	lockdep_assert_held(&n->list_lock);
 3389	list_del(&slab->slab_list);
 3390	slab_clear_node_partial(slab);
 3391	n->nr_partial--;
 3392}
 3393
 3394/*
 3395 * Called only for kmem_cache_debug() caches instead of remove_partial(), with a
 3396 * slab from the n->partial list. Remove only a single object from the slab, do
 3397 * the alloc_debug_processing() checks and leave the slab on the list, or move
 3398 * it to full list if it was the last free object.
 3399 */
 3400static void *alloc_single_from_partial(struct kmem_cache *s,
 3401		struct kmem_cache_node *n, struct slab *slab, int orig_size)
 3402{
 3403	void *object;
 3404
 3405	lockdep_assert_held(&n->list_lock);
 3406
 3407#ifdef CONFIG_SLUB_DEBUG
 3408	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
 3409		if (!validate_slab_ptr(slab)) {
 3410			slab_err(s, slab, "Not a valid slab page");
 3411			return NULL;
 3412		}
 3413	}
 3414#endif
 3415
 3416	object = slab->freelist;
 3417	slab->freelist = get_freepointer(s, object);
 3418	slab->inuse++;
 3419
 3420	if (!alloc_debug_processing(s, slab, object, orig_size)) {
 3421		remove_partial(n, slab);
 3422		return NULL;
 3423	}
 3424
 3425	if (slab->inuse == slab->objects) {
 3426		remove_partial(n, slab);
 3427		add_full(s, n, slab);
 3428	}
 3429
 3430	return object;
 3431}
 3432
 3433static void defer_deactivate_slab(struct slab *slab, void *flush_freelist);
 3434
 3435/*
 3436 * Called only for kmem_cache_debug() caches to allocate from a freshly
 3437 * allocated slab. Allocate a single object instead of whole freelist
 3438 * and put the slab to the partial (or full) list.
 3439 */
 3440static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab,
 3441					int orig_size, gfp_t gfpflags)
 3442{
 3443	bool allow_spin = gfpflags_allow_spinning(gfpflags);
 3444	int nid = slab_nid(slab);
 3445	struct kmem_cache_node *n = get_node(s, nid);
 3446	unsigned long flags;
 3447	void *object;
 3448
 3449	if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) {
 3450		/* Unlucky, discard newly allocated slab */
 3451		defer_deactivate_slab(slab, NULL);
 3452		return NULL;
 3453	}
 3454
 3455	object = slab->freelist;
 3456	slab->freelist = get_freepointer(s, object);
 3457	slab->inuse = 1;
 3458
 3459	if (!alloc_debug_processing(s, slab, object, orig_size)) {
 3460		/*
 3461		 * It's not really expected that this would fail on a
 3462		 * freshly allocated slab, but a concurrent memory
 3463		 * corruption in theory could cause that.
 3464		 * Leak memory of allocated slab.
 3465		 */
 3466		if (!allow_spin)
 3467			spin_unlock_irqrestore(&n->list_lock, flags);
 3468		return NULL;
 3469	}
 3470
 3471	if (allow_spin)
 3472		spin_lock_irqsave(&n->list_lock, flags);
 3473
 3474	if (slab->inuse == slab->objects)
 3475		add_full(s, n, slab);
 3476	else
 3477		add_partial(n, slab, DEACTIVATE_TO_HEAD);
 3478
 3479	inc_slabs_node(s, nid, slab->objects);
 3480	spin_unlock_irqrestore(&n->list_lock, flags);
 3481
 3482	return object;
 3483}
 3484
 3485#ifdef CONFIG_SLUB_CPU_PARTIAL
 3486static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
 3487#else
 3488static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
 3489				   int drain) { }
 3490#endif
 3491static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
 3492
 3493/*
 3494 * Try to allocate a partial slab from a specific node.
 3495 */
 3496static struct slab *get_partial_node(struct kmem_cache *s,
 3497				     struct kmem_cache_node *n,
 3498				     struct partial_context *pc)
 3499{
 3500	struct slab *slab, *slab2, *partial = NULL;
 3501	unsigned long flags;
 3502	unsigned int partial_slabs = 0;
 3503
 3504	/*
 3505	 * Racy check. If we mistakenly see no partial slabs then we
 3506	 * just allocate an empty slab. If we mistakenly try to get a
 3507	 * partial slab and there is none available then get_partial()
 3508	 * will return NULL.
 3509	 */
 3510	if (!n || !n->nr_partial)
 3511		return NULL;
 3512
 3513	if (gfpflags_allow_spinning(pc->flags))
 3514		spin_lock_irqsave(&n->list_lock, flags);
 3515	else if (!spin_trylock_irqsave(&n->list_lock, flags))
 3516		return NULL;
 3517	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
 3518		if (!pfmemalloc_match(slab, pc->flags))
 3519			continue;
 3520
 3521		if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
 3522			void *object = alloc_single_from_partial(s, n, slab,
 3523							pc->orig_size);
 3524			if (object) {
 3525				partial = slab;
 3526				pc->object = object;
 3527				break;
 3528			}
 3529			continue;
 3530		}
 3531
 3532		remove_partial(n, slab);
 3533
 3534		if (!partial) {
 3535			partial = slab;
 3536			stat(s, ALLOC_FROM_PARTIAL);
 3537
 3538			if ((slub_get_cpu_partial(s) == 0)) {
 3539				break;
 3540			}
 3541		} else {
 3542			put_cpu_partial(s, slab, 0);
 3543			stat(s, CPU_PARTIAL_NODE);
 3544
 3545			if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
 3546				break;
 3547			}
 3548		}
 3549	}
 3550	spin_unlock_irqrestore(&n->list_lock, flags);
 3551	return partial;
 3552}
 3553
 3554/*
 3555 * Get a slab from somewhere. Search in increasing NUMA distances.
 3556 */
 3557static struct slab *get_any_partial(struct kmem_cache *s,
 3558				    struct partial_context *pc)
 3559{
 3560#ifdef CONFIG_NUMA
 3561	struct zonelist *zonelist;
 3562	struct zoneref *z;
 3563	struct zone *zone;
 3564	enum zone_type highest_zoneidx = gfp_zone(pc->flags);
 3565	struct slab *slab;
 3566	unsigned int cpuset_mems_cookie;
 3567
 3568	/*
 3569	 * The defrag ratio allows a configuration of the tradeoffs between
 3570	 * inter node defragmentation and node local allocations. A lower
 3571	 * defrag_ratio increases the tendency to do local allocations
 3572	 * instead of attempting to obtain partial slabs from other nodes.
 3573	 *
 3574	 * If the defrag_ratio is set to 0 then kmalloc() always
 3575	 * returns node local objects. If the ratio is higher then kmalloc()
 3576	 * may return off node objects because partial slabs are obtained
 3577	 * from other nodes and filled up.
 3578	 *
 3579	 * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
 3580	 * (which makes defrag_ratio = 1000) then every (well almost)
 3581	 * allocation will first attempt to defrag slab caches on other nodes.
 3582	 * This means scanning over all nodes to look for partial slabs which
 3583	 * may be expensive if we do it every time we are trying to find a slab
 3584	 * with available objects.
 3585	 */
 3586	if (!s->remote_node_defrag_ratio ||
 3587			get_cycles() % 1024 > s->remote_node_defrag_ratio)
 3588		return NULL;
 3589
 3590	do {
 3591		cpuset_mems_cookie = read_mems_allowed_begin();
 3592		zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
 3593		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
 3594			struct kmem_cache_node *n;
 3595
 3596			n = get_node(s, zone_to_nid(zone));
 3597
 3598			if (n && cpuset_zone_allowed(zone, pc->flags) &&
 3599					n->nr_partial > s->min_partial) {
 3600				slab = get_partial_node(s, n, pc);
 3601				if (slab) {
 3602					/*
 3603					 * Don't check read_mems_allowed_retry()
 3604					 * here - if mems_allowed was updated in
 3605					 * parallel, that was a harmless race
 3606					 * between allocation and the cpuset
 3607					 * update
 3608					 */
 3609					return slab;
 3610				}
 3611			}
 3612		}
 3613	} while (read_mems_allowed_retry(cpuset_mems_cookie));
 3614#endif	/* CONFIG_NUMA */
 3615	return NULL;
 3616}
 3617
 3618/*
 3619 * Get a partial slab, lock it and return it.
 3620 */
 3621static struct slab *get_partial(struct kmem_cache *s, int node,
 3622				struct partial_context *pc)
 3623{
 3624	struct slab *slab;
 3625	int searchnode = node;
 3626
 3627	if (node == NUMA_NO_NODE)
 3628		searchnode = numa_mem_id();
 3629
 3630	slab = get_partial_node(s, get_node(s, searchnode), pc);
 3631	if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
 3632		return slab;
 3633
 3634	return get_any_partial(s, pc);
 3635}
 3636
 3637#ifdef CONFIG_PREEMPTION
 3638/*
 3639 * Calculate the next globally unique transaction for disambiguation
 3640 * during cmpxchg. The transactions start with the cpu number and are then
 3641 * incremented by CONFIG_NR_CPUS.
 3642 */
 3643#define TID_STEP  roundup_pow_of_two(CONFIG_NR_CPUS)
 3644#else
 3645/*
 3646 * No preemption supported therefore also no need to check for
 3647 * different cpus.
 3648 */
 3649#define TID_STEP 1
 3650#endif /* CONFIG_PREEMPTION */
 3651
 3652static inline unsigned long next_tid(unsigned long tid)
 3653{
 3654	return tid + TID_STEP;
 3655}
 3656
 3657#ifdef SLUB_DEBUG_CMPXCHG
 3658static inline unsigned int tid_to_cpu(unsigned long tid)
 3659{
 3660	return tid % TID_STEP;
 3661}
 3662
 3663static inline unsigned long tid_to_event(unsigned long tid)
 3664{
 3665	return tid / TID_STEP;
 3666}
 3667#endif
 3668
 3669static inline unsigned int init_tid(int cpu)
 3670{
 3671	return cpu;
 3672}
 3673
 3674static inline void note_cmpxchg_failure(const char *n,
 3675		const struct kmem_cache *s, unsigned long tid)
 3676{
 3677#ifdef SLUB_DEBUG_CMPXCHG
 3678	unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
 3679
 3680	pr_info("%s %s: cmpxchg redo ", n, s->name);
 3681
 3682	if (IS_ENABLED(CONFIG_PREEMPTION) &&
 3683	    tid_to_cpu(tid) != tid_to_cpu(actual_tid)) {
 3684		pr_warn("due to cpu change %d -> %d\n",
 3685			tid_to_cpu(tid), tid_to_cpu(actual_tid));
 3686	} else if (tid_to_event(tid) != tid_to_event(actual_tid)) {
 3687		pr_warn("due to cpu running other code. Event %ld->%ld\n",
 3688			tid_to_event(tid), tid_to_event(actual_tid));
 3689	} else {
 3690		pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
 3691			actual_tid, tid, next_tid(tid));
 3692	}
 3693#endif
 3694	stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
 3695}
 3696
 3697static void init_kmem_cache_cpus(struct kmem_cache *s)
 3698{
 3699#ifdef CONFIG_PREEMPT_RT
 3700	/*
 3701	 * Register lockdep key for non-boot kmem caches to avoid
 3702	 * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key()
 3703	 */
 3704	bool finegrain_lockdep = !init_section_contains(s, 1);
 3705#else
 3706	/*
 3707	 * Don't bother with different lockdep classes for each
 3708	 * kmem_cache, since we only use local_trylock_irqsave().
 3709	 */
 3710	bool finegrain_lockdep = false;
 3711#endif
 3712	int cpu;
 3713	struct kmem_cache_cpu *c;
 3714
 3715	if (finegrain_lockdep)
 3716		lockdep_register_key(&s->lock_key);
 3717	for_each_possible_cpu(cpu) {
 3718		c = per_cpu_ptr(s->cpu_slab, cpu);
 3719		local_trylock_init(&c->lock);
 3720		if (finegrain_lockdep)
 3721			lockdep_set_class(&c->lock, &s->lock_key);
 3722		c->tid = init_tid(cpu);
 3723	}
 3724}
 3725
 3726/*
 3727 * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist,
 3728 * unfreezes the slabs and puts it on the proper list.
 3729 * Assumes the slab has been already safely taken away from kmem_cache_cpu
 3730 * by the caller.
 3731 */
 3732static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
 3733			    void *freelist)
 3734{
 3735	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
 3736	int free_delta = 0;
 3737	void *nextfree, *freelist_iter, *freelist_tail;
 3738	int tail = DEACTIVATE_TO_HEAD;
 3739	unsigned long flags = 0;
 3740	struct freelist_counters old, new;
 3741
 3742	if (READ_ONCE(slab->freelist)) {
 3743		stat(s, DEACTIVATE_REMOTE_FREES);
 3744		tail = DEACTIVATE_TO_TAIL;
 3745	}
 3746
 3747	/*
 3748	 * Stage one: Count the objects on cpu's freelist as free_delta and
 3749	 * remember the last object in freelist_tail for later splicing.
 3750	 */
 3751	freelist_tail = NULL;
 3752	freelist_iter = freelist;
 3753	while (freelist_iter) {
 3754		nextfree = get_freepointer(s, freelist_iter);
 3755
 3756		/*
 3757		 * If 'nextfree' is invalid, it is possible that the object at
 3758		 * 'freelist_iter' is already corrupted.  So isolate all objects
 3759		 * starting at 'freelist_iter' by skipping them.
 3760		 */
 3761		if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
 3762			break;
 3763
 3764		freelist_tail = freelist_iter;
 3765		free_delta++;
 3766
 3767		freelist_iter = nextfree;
 3768	}
 3769
 3770	/*
 3771	 * Stage two: Unfreeze the slab while splicing the per-cpu
 3772	 * freelist to the head of slab's freelist.
 3773	 */
 3774	do {
 3775		old.freelist = READ_ONCE(slab->freelist);
 3776		old.counters = READ_ONCE(slab->counters);
 3777		VM_BUG_ON(!old.frozen);
 3778
 3779		/* Determine target state of the slab */
 3780		new.counters = old.counters;
 3781		new.frozen = 0;
 3782		if (freelist_tail) {
 3783			new.inuse -= free_delta;
 3784			set_freepointer(s, freelist_tail, old.freelist);
 3785			new.freelist = freelist;
 3786		} else {
 3787			new.freelist = old.freelist;
 3788		}
 3789	} while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab"));
 3790
 3791	/*
 3792	 * Stage three: Manipulate the slab list based on the updated state.
 3793	 */
 3794	if (!new.inuse && n->nr_partial >= s->min_partial) {
 3795		stat(s, DEACTIVATE_EMPTY);
 3796		discard_slab(s, slab);
 3797		stat(s, FREE_SLAB);
 3798	} else if (new.freelist) {
 3799		spin_lock_irqsave(&n->list_lock, flags);
 3800		add_partial(n, slab, tail);
 3801		spin_unlock_irqrestore(&n->list_lock, flags);
 3802		stat(s, tail);
 3803	} else {
 3804		stat(s, DEACTIVATE_FULL);
 3805	}
 3806}
 3807
 3808/*
 3809 * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
 3810 * can be acquired without a deadlock before invoking the function.
 3811 *
 3812 * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
 3813 * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
 3814 * and kmalloc() is not used in an unsupported context.
 3815 *
 3816 * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
 3817 * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
 3818 * lockdep_assert() will catch a bug in case:
 3819 * #1
 3820 * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
 3821 * or
 3822 * #2
 3823 * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
 3824 *
 3825 * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
 3826 * disabled context. The lock will always be acquired and if needed it
 3827 * block and sleep until the lock is available.
 3828 * #1 is possible in !PREEMPT_RT only.
 3829 * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
 3830 * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
 3831 *    tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
 3832 *
 3833 * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
 3834 */
 3835#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
 3836#define local_lock_cpu_slab(s, flags)	\
 3837	local_lock_irqsave(&(s)->cpu_slab->lock, flags)
 3838#else
 3839#define local_lock_cpu_slab(s, flags)					       \
 3840	do {								       \
 3841		bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
 3842		lockdep_assert(__l);					       \
 3843	} while (0)
 3844#endif
 3845
 3846#define local_unlock_cpu_slab(s, flags)	\
 3847	local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)
 3848
 3849#ifdef CONFIG_SLUB_CPU_PARTIAL
 3850static void __put_partials(struct kmem_cache *s, struct slab *partial_slab)
 3851{
 3852	struct kmem_cache_node *n = NULL, *n2 = NULL;
 3853	struct slab *slab, *slab_to_discard = NULL;
 3854	unsigned long flags = 0;
 3855
 3856	while (partial_slab) {
 3857		slab = partial_slab;
 3858		partial_slab = slab->next;
 3859
 3860		n2 = get_node(s, slab_nid(slab));
 3861		if (n != n2) {
 3862			if (n)
 3863				spin_unlock_irqrestore(&n->list_lock, flags);
 3864
 3865			n = n2;
 3866			spin_lock_irqsave(&n->list_lock, flags);
 3867		}
 3868
 3869		if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) {
 3870			slab->next = slab_to_discard;
 3871			slab_to_discard = slab;
 3872		} else {
 3873			add_partial(n, slab, DEACTIVATE_TO_TAIL);
 3874			stat(s, FREE_ADD_PARTIAL);
 3875		}
 3876	}
 3877
 3878	if (n)
 3879		spin_unlock_irqrestore(&n->list_lock, flags);
 3880
 3881	while (slab_to_discard) {
 3882		slab = slab_to_discard;
 3883		slab_to_discard = slab_to_discard->next;
 3884
 3885		stat(s, DEACTIVATE_EMPTY);
 3886		discard_slab(s, slab);
 3887		stat(s, FREE_SLAB);
 3888	}
 3889}
 3890
 3891/*
 3892 * Put all the cpu partial slabs to the node partial list.
 3893 */
 3894static void put_partials(struct kmem_cache *s)
 3895{
 3896	struct slab *partial_slab;
 3897	unsigned long flags;
 3898
 3899	local_lock_irqsave(&s->cpu_slab->lock, flags);
 3900	partial_slab = this_cpu_read(s->cpu_slab->partial);
 3901	this_cpu_write(s->cpu_slab->partial, NULL);
 3902	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 3903
 3904	if (partial_slab)
 3905		__put_partials(s, partial_slab);
 3906}
 3907
 3908static void put_partials_cpu(struct kmem_cache *s,
 3909			     struct kmem_cache_cpu *c)
 3910{
 3911	struct slab *partial_slab;
 3912
 3913	partial_slab = slub_percpu_partial(c);
 3914	c->partial = NULL;
 3915
 3916	if (partial_slab)
 3917		__put_partials(s, partial_slab);
 3918}
 3919
 3920/*
 3921 * Put a slab into a partial slab slot if available.
 3922 *
 3923 * If we did not find a slot then simply move all the partials to the
 3924 * per node partial list.
 3925 */
 3926static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
 3927{
 3928	struct slab *oldslab;
 3929	struct slab *slab_to_put = NULL;
 3930	unsigned long flags;
 3931	int slabs = 0;
 3932
 3933	local_lock_cpu_slab(s, flags);
 3934
 3935	oldslab = this_cpu_read(s->cpu_slab->partial);
 3936
 3937	if (oldslab) {
 3938		if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
 3939			/*
 3940			 * Partial array is full. Move the existing set to the
 3941			 * per node partial list. Postpone the actual unfreezing
 3942			 * outside of the critical section.
 3943			 */
 3944			slab_to_put = oldslab;
 3945			oldslab = NULL;
 3946		} else {
 3947			slabs = oldslab->slabs;
 3948		}
 3949	}
 3950
 3951	slabs++;
 3952
 3953	slab->slabs = slabs;
 3954	slab->next = oldslab;
 3955
 3956	this_cpu_write(s->cpu_slab->partial, slab);
 3957
 3958	local_unlock_cpu_slab(s, flags);
 3959
 3960	if (slab_to_put) {
 3961		__put_partials(s, slab_to_put);
 3962		stat(s, CPU_PARTIAL_DRAIN);
 3963	}
 3964}
 3965
 3966#else	/* CONFIG_SLUB_CPU_PARTIAL */
 3967
 3968static inline void put_partials(struct kmem_cache *s) { }
 3969static inline void put_partials_cpu(struct kmem_cache *s,
 3970				    struct kmem_cache_cpu *c) { }
 3971
 3972#endif	/* CONFIG_SLUB_CPU_PARTIAL */
 3973
 3974static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 3975{
 3976	unsigned long flags;
 3977	struct slab *slab;
 3978	void *freelist;
 3979
 3980	local_lock_irqsave(&s->cpu_slab->lock, flags);
 3981
 3982	slab = c->slab;
 3983	freelist = c->freelist;
 3984
 3985	c->slab = NULL;
 3986	c->freelist = NULL;
 3987	c->tid = next_tid(c->tid);
 3988
 3989	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 3990
 3991	if (slab) {
 3992		deactivate_slab(s, slab, freelist);
 3993		stat(s, CPUSLAB_FLUSH);
 3994	}
 3995}
 3996
 3997static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
 3998{
 3999	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
 4000	void *freelist = c->freelist;
 4001	struct slab *slab = c->slab;
 4002
 4003	c->slab = NULL;
 4004	c->freelist = NULL;
 4005	c->tid = next_tid(c->tid);
 4006
 4007	if (slab) {
 4008		deactivate_slab(s, slab, freelist);
 4009		stat(s, CPUSLAB_FLUSH);
 4010	}
 4011
 4012	put_partials_cpu(s, c);
 4013}
 4014
 4015static inline void flush_this_cpu_slab(struct kmem_cache *s)
 4016{
 4017	struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
 4018
 4019	if (c->slab)
 4020		flush_slab(s, c);
 4021
 4022	put_partials(s);
 4023}
 4024
 4025static bool has_cpu_slab(int cpu, struct kmem_cache *s)
 4026{
 4027	struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
 4028
 4029	return c->slab || slub_percpu_partial(c);
 4030}
 4031
 4032static bool has_pcs_used(int cpu, struct kmem_cache *s)
 4033{
 4034	struct slub_percpu_sheaves *pcs;
 4035
 4036	if (!s->cpu_sheaves)
 4037		return false;
 4038
 4039	pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 4040
 4041	return (pcs->spare || pcs->rcu_free || pcs->main->size);
 4042}
 4043
 4044/*
 4045 * Flush cpu slab.
 4046 *
 4047 * Called from CPU work handler with migration disabled.
 4048 */
 4049static void flush_cpu_slab(struct work_struct *w)
 4050{
 4051	struct kmem_cache *s;
 4052	struct slub_flush_work *sfw;
 4053
 4054	sfw = container_of(w, struct slub_flush_work, work);
 4055
 4056	s = sfw->s;
 4057
 4058	if (s->cpu_sheaves)
 4059		pcs_flush_all(s);
 4060
 4061	flush_this_cpu_slab(s);
 4062}
 4063
 4064static void flush_all_cpus_locked(struct kmem_cache *s)
 4065{
 4066	struct slub_flush_work *sfw;
 4067	unsigned int cpu;
 4068
 4069	lockdep_assert_cpus_held();
 4070	mutex_lock(&flush_lock);
 4071
 4072	for_each_online_cpu(cpu) {
 4073		sfw = &per_cpu(slub_flush, cpu);
 4074		if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) {
 4075			sfw->skip = true;
 4076			continue;
 4077		}
 4078		INIT_WORK(&sfw->work, flush_cpu_slab);
 4079		sfw->skip = false;
 4080		sfw->s = s;
 4081		queue_work_on(cpu, flushwq, &sfw->work);
 4082	}
 4083
 4084	for_each_online_cpu(cpu) {
 4085		sfw = &per_cpu(slub_flush, cpu);
 4086		if (sfw->skip)
 4087			continue;
 4088		flush_work(&sfw->work);
 4089	}
 4090
 4091	mutex_unlock(&flush_lock);
 4092}
 4093
 4094static void flush_all(struct kmem_cache *s)
 4095{
 4096	cpus_read_lock();
 4097	flush_all_cpus_locked(s);
 4098	cpus_read_unlock();
 4099}
 4100
 4101static void flush_rcu_sheaf(struct work_struct *w)
 4102{
 4103	struct slub_percpu_sheaves *pcs;
 4104	struct slab_sheaf *rcu_free;
 4105	struct slub_flush_work *sfw;
 4106	struct kmem_cache *s;
 4107
 4108	sfw = container_of(w, struct slub_flush_work, work);
 4109	s = sfw->s;
 4110
 4111	local_lock(&s->cpu_sheaves->lock);
 4112	pcs = this_cpu_ptr(s->cpu_sheaves);
 4113
 4114	rcu_free = pcs->rcu_free;
 4115	pcs->rcu_free = NULL;
 4116
 4117	local_unlock(&s->cpu_sheaves->lock);
 4118
 4119	if (rcu_free)
 4120		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
 4121}
 4122
 4123
 4124/* needed for kvfree_rcu_barrier() */
 4125void flush_rcu_sheaves_on_cache(struct kmem_cache *s)
 4126{
 4127	struct slub_flush_work *sfw;
 4128	unsigned int cpu;
 4129
 4130	mutex_lock(&flush_lock);
 4131
 4132	for_each_online_cpu(cpu) {
 4133		sfw = &per_cpu(slub_flush, cpu);
 4134
 4135		/*
 4136		 * we don't check if rcu_free sheaf exists - racing
 4137		 * __kfree_rcu_sheaf() might have just removed it.
 4138		 * by executing flush_rcu_sheaf() on the cpu we make
 4139		 * sure the __kfree_rcu_sheaf() finished its call_rcu()
 4140		 */
 4141
 4142		INIT_WORK(&sfw->work, flush_rcu_sheaf);
 4143		sfw->s = s;
 4144		queue_work_on(cpu, flushwq, &sfw->work);
 4145	}
 4146
 4147	for_each_online_cpu(cpu) {
 4148		sfw = &per_cpu(slub_flush, cpu);
 4149		flush_work(&sfw->work);
 4150	}
 4151
 4152	mutex_unlock(&flush_lock);
 4153}
 4154
 4155void flush_all_rcu_sheaves(void)
 4156{
 4157	struct kmem_cache *s;
 4158
 4159	cpus_read_lock();
 4160	mutex_lock(&slab_mutex);
 4161
 4162	list_for_each_entry(s, &slab_caches, list) {
 4163		if (!s->cpu_sheaves)
 4164			continue;
 4165		flush_rcu_sheaves_on_cache(s);
 4166	}
 4167
 4168	mutex_unlock(&slab_mutex);
 4169	cpus_read_unlock();
 4170
 4171	rcu_barrier();
 4172}
 4173
 4174/*
 4175 * Use the cpu notifier to insure that the cpu slabs are flushed when
 4176 * necessary.
 4177 */
 4178static int slub_cpu_dead(unsigned int cpu)
 4179{
 4180	struct kmem_cache *s;
 4181
 4182	mutex_lock(&slab_mutex);
 4183	list_for_each_entry(s, &slab_caches, list) {
 4184		__flush_cpu_slab(s, cpu);
 4185		if (s->cpu_sheaves)
 4186			__pcs_flush_all_cpu(s, cpu);
 4187	}
 4188	mutex_unlock(&slab_mutex);
 4189	return 0;
 4190}
 4191
 4192/*
 4193 * Check if the objects in a per cpu structure fit numa
 4194 * locality expectations.
 4195 */
 4196static inline int node_match(struct slab *slab, int node)
 4197{
 4198#ifdef CONFIG_NUMA
 4199	if (node != NUMA_NO_NODE && slab_nid(slab) != node)
 4200		return 0;
 4201#endif
 4202	return 1;
 4203}
 4204
 4205#ifdef CONFIG_SLUB_DEBUG
 4206static int count_free(struct slab *slab)
 4207{
 4208	return slab->objects - slab->inuse;
 4209}
 4210
 4211static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
 4212{
 4213	return atomic_long_read(&n->total_objects);
 4214}
 4215
 4216/* Supports checking bulk free of a constructed freelist */
 4217static inline bool free_debug_processing(struct kmem_cache *s,
 4218	struct slab *slab, void *head, void *tail, int *bulk_cnt,
 4219	unsigned long addr, depot_stack_handle_t handle)
 4220{
 4221	bool checks_ok = false;
 4222	void *object = head;
 4223	int cnt = 0;
 4224
 4225	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
 4226		if (!check_slab(s, slab))
 4227			goto out;
 4228	}
 4229
 4230	if (slab->inuse < *bulk_cnt) {
 4231		slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
 4232			 slab->inuse, *bulk_cnt);
 4233		goto out;
 4234	}
 4235
 4236next_object:
 4237
 4238	if (++cnt > *bulk_cnt)
 4239		goto out_cnt;
 4240
 4241	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
 4242		if (!free_consistency_checks(s, slab, object, addr))
 4243			goto out;
 4244	}
 4245
 4246	if (s->flags & SLAB_STORE_USER)
 4247		set_track_update(s, object, TRACK_FREE, addr, handle);
 4248	trace(s, slab, object, 0);
 4249	/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
 4250	init_object(s, object, SLUB_RED_INACTIVE);
 4251
 4252	/* Reached end of constructed freelist yet? */
 4253	if (object != tail) {
 4254		object = get_freepointer(s, object);
 4255		goto next_object;
 4256	}
 4257	checks_ok = true;
 4258
 4259out_cnt:
 4260	if (cnt != *bulk_cnt) {
 4261		slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
 4262			 *bulk_cnt, cnt);
 4263		*bulk_cnt = cnt;
 4264	}
 4265
 4266out:
 4267
 4268	if (!checks_ok)
 4269		slab_fix(s, "Object at 0x%p not freed", object);
 4270
 4271	return checks_ok;
 4272}
 4273#endif /* CONFIG_SLUB_DEBUG */
 4274
 4275#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS)
 4276static unsigned long count_partial(struct kmem_cache_node *n,
 4277					int (*get_count)(struct slab *))
 4278{
 4279	unsigned long flags;
 4280	unsigned long x = 0;
 4281	struct slab *slab;
 4282
 4283	spin_lock_irqsave(&n->list_lock, flags);
 4284	list_for_each_entry(slab, &n->partial, slab_list)
 4285		x += get_count(slab);
 4286	spin_unlock_irqrestore(&n->list_lock, flags);
 4287	return x;
 4288}
 4289#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */
 4290
 4291#ifdef CONFIG_SLUB_DEBUG
 4292#define MAX_PARTIAL_TO_SCAN 10000
 4293
 4294static unsigned long count_partial_free_approx(struct kmem_cache_node *n)
 4295{
 4296	unsigned long flags;
 4297	unsigned long x = 0;
 4298	struct slab *slab;
 4299
 4300	spin_lock_irqsave(&n->list_lock, flags);
 4301	if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) {
 4302		list_for_each_entry(slab, &n->partial, slab_list)
 4303			x += slab->objects - slab->inuse;
 4304	} else {
 4305		/*
 4306		 * For a long list, approximate the total count of objects in
 4307		 * it to meet the limit on the number of slabs to scan.
 4308		 * Scan from both the list's head and tail for better accuracy.
 4309		 */
 4310		unsigned long scanned = 0;
 4311
 4312		list_for_each_entry(slab, &n->partial, slab_list) {
 4313			x += slab->objects - slab->inuse;
 4314			if (++scanned == MAX_PARTIAL_TO_SCAN / 2)
 4315				break;
 4316		}
 4317		list_for_each_entry_reverse(slab, &n->partial, slab_list) {
 4318			x += slab->objects - slab->inuse;
 4319			if (++scanned == MAX_PARTIAL_TO_SCAN)
 4320				break;
 4321		}
 4322		x = mult_frac(x, n->nr_partial, scanned);
 4323		x = min(x, node_nr_objs(n));
 4324	}
 4325	spin_unlock_irqrestore(&n->list_lock, flags);
 4326	return x;
 4327}
 4328
 4329static noinline void
 4330slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
 4331{
 4332	static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
 4333				      DEFAULT_RATELIMIT_BURST);
 4334	int cpu = raw_smp_processor_id();
 4335	int node;
 4336	struct kmem_cache_node *n;
 4337
 4338	if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
 4339		return;
 4340
 4341	pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n",
 4342		cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags);
 4343	pr_warn("  cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
 4344		s->name, s->object_size, s->size, oo_order(s->oo),
 4345		oo_order(s->min));
 4346
 4347	if (oo_order(s->min) > get_order(s->object_size))
 4348		pr_warn("  %s debugging increased min order, use slab_debug=O to disable.\n",
 4349			s->name);
 4350
 4351	for_each_kmem_cache_node(s, node, n) {
 4352		unsigned long nr_slabs;
 4353		unsigned long nr_objs;
 4354		unsigned long nr_free;
 4355
 4356		nr_free  = count_partial_free_approx(n);
 4357		nr_slabs = node_nr_slabs(n);
 4358		nr_objs  = node_nr_objs(n);
 4359
 4360		pr_warn("  node %d: slabs: %ld, objs: %ld, free: %ld\n",
 4361			node, nr_slabs, nr_objs, nr_free);
 4362	}
 4363}
 4364#else /* CONFIG_SLUB_DEBUG */
 4365static inline void
 4366slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { }
 4367#endif
 4368
 4369static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
 4370{
 4371	if (unlikely(slab_test_pfmemalloc(slab)))
 4372		return gfp_pfmemalloc_allowed(gfpflags);
 4373
 4374	return true;
 4375}
 4376
 4377static inline bool
 4378__update_cpu_freelist_fast(struct kmem_cache *s,
 4379			   void *freelist_old, void *freelist_new,
 4380			   unsigned long tid)
 4381{
 4382	struct freelist_tid old = { .freelist = freelist_old, .tid = tid };
 4383	struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) };
 4384
 4385	return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid,
 4386					     &old.freelist_tid, new.freelist_tid);
 4387}
 4388
 4389/*
 4390 * Check the slab->freelist and either transfer the freelist to the
 4391 * per cpu freelist or deactivate the slab.
 4392 *
 4393 * The slab is still frozen if the return value is not NULL.
 4394 *
 4395 * If this function returns NULL then the slab has been unfrozen.
 4396 */
 4397static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
 4398{
 4399	struct freelist_counters old, new;
 4400
 4401	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
 4402
 4403	do {
 4404		old.freelist = slab->freelist;
 4405		old.counters = slab->counters;
 4406
 4407		new.freelist = NULL;
 4408		new.counters = old.counters;
 4409
 4410		new.inuse = old.objects;
 4411		new.frozen = old.freelist != NULL;
 4412
 4413
 4414	} while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist"));
 4415
 4416	return old.freelist;
 4417}
 4418
 4419/*
 4420 * Freeze the partial slab and return the pointer to the freelist.
 4421 */
 4422static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
 4423{
 4424	struct freelist_counters old, new;
 4425
 4426	do {
 4427		old.freelist = slab->freelist;
 4428		old.counters = slab->counters;
 4429
 4430		new.freelist = NULL;
 4431		new.counters = old.counters;
 4432		VM_BUG_ON(new.frozen);
 4433
 4434		new.inuse = old.objects;
 4435		new.frozen = 1;
 4436
 4437	} while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab"));
 4438
 4439	return old.freelist;
 4440}
 4441
  4442	/*
  4443	 * Slow path. The lockless freelist is empty or we need to perform
  4444	 * debugging duties.
  4445	 *
  4446	 * Processing is still very fast if new objects have been freed to the
  4447	 * regular freelist. In that case we simply take over the regular freelist
  4448	 * as the lockless freelist and zap the regular freelist.
  4449	 *
  4450	 * If that is not working then we fall back to the partial lists. We take the
  4451	 * first element of the freelist as the object to allocate now and move the
  4452	 * rest of the freelist to the lockless freelist.
  4453	 *
  4454	 * And if we were unable to get a new slab from the partial slab lists then
  4455	 * we need to allocate a new slab. This is the slowest path since it involves
  4456	 * a call to the page allocator and the setup of a new slab.
  4457	 *
  4458	 * Version of __slab_alloc to use when we know that preemption is
  4459	 * already disabled (which is the case for bulk allocation).
	 *
	 * @node is the requested NUMA node or NUMA_NO_NODE. @addr is the caller
	 * address recorded by set_track() when SLAB_STORE_USER is set. @c is the
	 * kmem_cache_cpu to operate on; it is refetched after new_slab() because
	 * the cpu pointer is put around that call. @orig_size is handed to
	 * get_partial() via pc.orig_size and to alloc_single_from_new_slab().
	 *
	 * Returns the allocated object, or NULL if no new slab could be allocated
	 * or, for !allow_spin requests, if alloc_single_from_new_slab() failed.
  4460	 */
  4461	static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
  4462				  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
  4463	{
  4464		bool allow_spin = gfpflags_allow_spinning(gfpflags);
  4465		void *freelist;
  4466		struct slab *slab;
  4467		unsigned long flags;
  4468		struct partial_context pc;
  4469		bool try_thisnode = true;
  4470
  4471		stat(s, ALLOC_SLOWPATH);
  4472
  4473	reread_slab:
  4474
  4475		slab = READ_ONCE(c->slab);
  4476		if (!slab) {
  4477			/*
  4478			 * if the node is not online or has no normal memory, just
  4479			 * ignore the node constraint
  4480			 */
  4481			if (unlikely(node != NUMA_NO_NODE &&
  4482				     !node_isset(node, slab_nodes)))
  4483				node = NUMA_NO_NODE;
  4484			goto new_slab;
  4485		}
  4486
  4487		if (unlikely(!node_match(slab, node))) {
  4488			/*
  4489			 * same as above but node_match() being false already
  4490			 * implies node != NUMA_NO_NODE.
  4491			 *
  4492			 * We don't strictly honor pfmemalloc and NUMA preferences
  4493			 * when !allow_spin because:
  4494			 *
  4495			 * 1. Most kmalloc() users allocate objects on the local node,
  4496			 *    so kmalloc_nolock() tries not to interfere with them by
  4497			 *    deactivating the cpu slab.
  4498			 *
  4499			 * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause
  4500			 *    unnecessary slab allocations even when n->partial list
  4501			 *    is not empty.
  4502			 */
  4503			if (!node_isset(node, slab_nodes) ||
  4504			    !allow_spin) {
  4505				node = NUMA_NO_NODE;
  4506			} else {
  4507				stat(s, ALLOC_NODE_MISMATCH);
  4508				goto deactivate_slab;
  4509			}
  4510		}
  4511
  4512		/*
  4513		 * By rights, we should be searching for a slab page that was
  4514		 * PFMEMALLOC but right now, we are losing the pfmemalloc
  4515		 * information when the page leaves the per-cpu allocator
  4516		 */
  4517		if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin))
  4518			goto deactivate_slab;
  4519
  4520		/* must check again c->slab in case we got preempted and it changed */
  4521		local_lock_cpu_slab(s, flags);
  4522
  4523		if (unlikely(slab != c->slab)) {
  4524			local_unlock_cpu_slab(s, flags);
  4525			goto reread_slab;
  4526		}
  4527		freelist = c->freelist;
  4528		if (freelist)
  4529			goto load_freelist;
  4530
  4531		freelist = get_freelist(s, slab);
  4532
		/* Nothing left on the slab: detach it from this cpu and get another */
  4533		if (!freelist) {
  4534			c->slab = NULL;
  4535			c->tid = next_tid(c->tid);
  4536			local_unlock_cpu_slab(s, flags);
  4537			stat(s, DEACTIVATE_BYPASS);
  4538			goto new_slab;
  4539		}
  4540
  4541		stat(s, ALLOC_REFILL);
  4542
  4543	load_freelist:
  4544
  4545		lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
  4546
  4547		/*
  4548		 * freelist is pointing to the list of objects to be used.
  4549		 * slab is pointing to the slab from which the objects are obtained.
  4550		 * That slab must be frozen for per cpu allocations to work.
  4551		 */
  4552		VM_BUG_ON(!c->slab->frozen);
		/* Return the first object; the rest becomes the per cpu freelist */
  4553		c->freelist = get_freepointer(s, freelist);
  4554		c->tid = next_tid(c->tid);
  4555		local_unlock_cpu_slab(s, flags);
  4556		return freelist;
  4557
  4558	deactivate_slab:
  4559
  4560		local_lock_cpu_slab(s, flags);
  4561		if (slab != c->slab) {
  4562			local_unlock_cpu_slab(s, flags);
  4563			goto reread_slab;
  4564		}
  4565		freelist = c->freelist;
  4566		c->slab = NULL;
  4567		c->freelist = NULL;
  4568		c->tid = next_tid(c->tid);
  4569		local_unlock_cpu_slab(s, flags);
  4570		deactivate_slab(s, slab, freelist);
  4571
  4572	new_slab:
  4573
  4574	#ifdef CONFIG_SLUB_CPU_PARTIAL
  4575		while (slub_percpu_partial(c)) {
  4576			local_lock_cpu_slab(s, flags);
  4577			if (unlikely(c->slab)) {
  4578				local_unlock_cpu_slab(s, flags);
  4579				goto reread_slab;
  4580			}
  4581			if (unlikely(!slub_percpu_partial(c))) {
  4582				local_unlock_cpu_slab(s, flags);
  4583				/* we were preempted and partial list got empty */
  4584				goto new_objects;
  4585			}
  4586
  4587			slab = slub_percpu_partial(c);
  4588			slub_set_percpu_partial(c, slab);
  4589
  4590			if (likely(node_match(slab, node) &&
  4591				   pfmemalloc_match(slab, gfpflags)) ||
  4592			    !allow_spin) {
  4593				c->slab = slab;
  4594				freelist = get_freelist(s, slab);
  4595				VM_BUG_ON(!freelist);
  4596				stat(s, CPU_PARTIAL_ALLOC);
  4597				goto load_freelist;
  4598			}
  4599
  4600			local_unlock_cpu_slab(s, flags);
  4601
  4602			slab->next = NULL;
  4603			__put_partials(s, slab);
  4604		}
  4605	#endif
  4606
  4607	new_objects:
  4608
  4609		pc.flags = gfpflags;
  4610		/*
  4611		 * When a preferred node is indicated but no __GFP_THISNODE
  4612		 *
  4613		 * 1) try to get a partial slab from target node only by having
  4614		 *    __GFP_THISNODE in pc.flags for get_partial()
  4615		 * 2) if 1) failed, try to allocate a new slab from target node with
  4616		 *    GFP_NOWAIT | __GFP_THISNODE opportunistically
  4617		 * 3) if 2) failed, retry with original gfpflags which will allow
  4618		 *    get_partial() try partial lists of other nodes before potentially
  4619		 *    allocating new page from other nodes
  4620		 */
  4621		if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
  4622			     && try_thisnode)) {
  4623			if (unlikely(!allow_spin))
  4624				/* Do not upgrade gfp to NOWAIT from more restrictive mode */
  4625				pc.flags = gfpflags | __GFP_THISNODE;
  4626			else
  4627				pc.flags = GFP_NOWAIT | __GFP_THISNODE;
  4628		}
  4629
  4630		pc.orig_size = orig_size;
  4631		slab = get_partial(s, node, &pc);
  4632		if (slab) {
  4633			if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
  4634				freelist = pc.object;
  4635				/*
  4636				 * For debug caches here we had to go through
  4637				 * alloc_single_from_partial() so just store the
  4638				 * tracking info and return the object.
  4639				 *
  4640				 * Due to disabled preemption we need to disallow
  4641				 * blocking. The flags are further adjusted by
  4642				 * gfp_nested_mask() in stack_depot itself.
  4643				 */
  4644				if (s->flags & SLAB_STORE_USER)
  4645					set_track(s, freelist, TRACK_ALLOC, addr,
  4646						  gfpflags & ~(__GFP_DIRECT_RECLAIM));
  4647
  4648				return freelist;
  4649			}
  4650
  4651			freelist = freeze_slab(s, slab);
  4652			goto retry_load_slab;
  4653		}
  4654
		/* The cpu pointer is put across new_slab() and c is refetched after it */
  4655		slub_put_cpu_ptr(s->cpu_slab);
  4656		slab = new_slab(s, pc.flags, node);
  4657		c = slub_get_cpu_ptr(s->cpu_slab);
  4658
  4659		if (unlikely(!slab)) {
			/*
			 * The attempt restricted to the target node failed; clear
			 * try_thisnode so the retry uses the original gfpflags.
			 */
  4660			if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
  4661			    && try_thisnode) {
  4662				try_thisnode = false;
  4663				goto new_objects;
  4664			}
  4665			slab_out_of_memory(s, gfpflags, node);
  4666			return NULL;
  4667		}
  4668
  4669		stat(s, ALLOC_SLAB);
  4670
  4671		if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
  4672			freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
  4673
  4674			if (unlikely(!freelist)) {
  4675				/* This could cause an endless loop. Fail instead. */
  4676				if (!allow_spin)
  4677					return NULL;
  4678				goto new_objects;
  4679			}
  4680
  4681			if (s->flags & SLAB_STORE_USER)
  4682				set_track(s, freelist, TRACK_ALLOC, addr,
  4683					  gfpflags & ~(__GFP_DIRECT_RECLAIM));
  4684
  4685			return freelist;
  4686		}
  4687
  4688		/*
  4689		 * No other reference to the slab yet so we can
  4690		 * muck around with it freely without cmpxchg
  4691		 */
  4692		freelist = slab->freelist;
  4693		slab->freelist = NULL;
  4694		slab->inuse = slab->objects;
  4695		slab->frozen = 1;
  4696
  4697		inc_slabs_node(s, slab_nid(slab), slab->objects);
  4698
  4699		if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
  4700			/*
  4701			 * For !pfmemalloc_match() case we don't load freelist so that
  4702			 * we don't make further mismatched allocations easier.
  4703			 */
  4704			deactivate_slab(s, slab, get_freepointer(s, freelist));
  4705			return freelist;
  4706		}
  4707
  4708	retry_load_slab:
  4709
  4710		local_lock_cpu_slab(s, flags);
		/* A cpu slab got installed in the meantime: flush it, then retry */
  4711		if (unlikely(c->slab)) {
  4712			void *flush_freelist = c->freelist;
  4713			struct slab *flush_slab = c->slab;
  4714
  4715			c->slab = NULL;
  4716			c->freelist = NULL;
  4717			c->tid = next_tid(c->tid);
  4718
  4719			local_unlock_cpu_slab(s, flags);
  4720
  4721			if (unlikely(!allow_spin)) {
  4722				/* Reentrant slub cannot take locks, defer */
  4723				defer_deactivate_slab(flush_slab, flush_freelist);
  4724			} else {
  4725				deactivate_slab(s, flush_slab, flush_freelist);
  4726			}
  4727
  4728			stat(s, CPUSLAB_FLUSH);
  4729
  4730			goto retry_load_slab;
  4731		}
  4732		c->slab = slab;
  4733
  4734		goto load_freelist;
  4735	}
  4736	/*
  4737	 * We disallow kprobes in ___slab_alloc() to prevent reentrance
  4738	 *
  4739	 * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
  4740	 * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
  4741	 * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
  4742	 * manipulating c->freelist without lock.
  4743	 *
  4744	 * This does not prevent kprobe in functions called from ___slab_alloc() such as
  4745	 * local_lock_irqsave() itself, and that is fine, we only need to protect the
  4746	 * c->freelist manipulation in ___slab_alloc() itself.
  4747	 */
  4748	NOKPROBE_SYMBOL(___slab_alloc);
 4749
 4750/*
 4751 * A wrapper for ___slab_alloc() for contexts where preemption is not yet
 4752 * disabled. Compensates for possible cpu changes by refetching the per cpu area
 4753 * pointer.
 4754 */
 4755static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 4756			  unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
 4757{
 4758	void *p;
 4759
 4760#ifdef CONFIG_PREEMPT_COUNT
 4761	/*
 4762	 * We may have been preempted and rescheduled on a different
 4763	 * cpu before disabling preemption. Need to reload cpu area
 4764	 * pointer.
 4765	 */
 4766	c = slub_get_cpu_ptr(s->cpu_slab);
 4767#endif
 4768	if (unlikely(!gfpflags_allow_spinning(gfpflags))) {
 4769		if (local_lock_is_locked(&s->cpu_slab->lock)) {
 4770			/*
 4771			 * EBUSY is an internal signal to kmalloc_nolock() to
 4772			 * retry a different bucket. It's not propagated
 4773			 * to the caller.
 4774			 */
 4775			p = ERR_PTR(-EBUSY);
 4776			goto out;
 4777		}
 4778	}
 4779	p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
 4780out:
 4781#ifdef CONFIG_PREEMPT_COUNT
 4782	slub_put_cpu_ptr(s->cpu_slab);
 4783#endif
 4784	return p;
 4785}
 4786
	/*
	 * Allocate one object from @s, trying the lockless per cpu freelist first.
	 *
	 * The request is passed to __slab_alloc() when the lockless fast path
	 * cannot be used, when the per cpu freelist or cpu slab is missing, or when
	 * the cpu slab does not match @node. Otherwise the first object of the per
	 * cpu freelist is taken with __update_cpu_freelist_fast(), and the whole
	 * sequence is redone if that update fails.
	 *
	 * Returns the object taken from the per cpu freelist, or whatever
	 * __slab_alloc() returned.
	 */
  4787	static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
  4788			gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
  4789	{
  4790		struct kmem_cache_cpu *c;
  4791		struct slab *slab;
  4792		unsigned long tid;
  4793		void *object;
  4794
  4795	redo:
  4796		/*
  4797		 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
  4798		 * enabled. We may switch back and forth between cpus while
  4799		 * reading from one cpu area. That does not matter as long
  4800		 * as we end up on the original cpu again when doing the cmpxchg.
  4801		 *
  4802		 * We must guarantee that tid and kmem_cache_cpu are retrieved on the
  4803		 * same cpu. We read first the kmem_cache_cpu pointer and use it to read
  4804		 * the tid. If we are preempted and switched to another cpu between the
  4805		 * two reads, it's OK as the two are still associated with the same cpu
  4806		 * and cmpxchg later will validate the cpu.
  4807		 */
  4808		c = raw_cpu_ptr(s->cpu_slab);
  4809		tid = READ_ONCE(c->tid);
  4810
  4811		/*
  4812		 * Irqless object alloc/free algorithm used here depends on sequence
  4813		 * of fetching cpu_slab's data. tid should be fetched before anything
  4814		 * on c to guarantee that object and slab associated with previous tid
  4815		 * won't be used with current tid. If we fetch tid first, object and
  4816		 * slab could be one associated with next tid and our alloc/free
  4817		 * request will fail. In this case, we will retry. So, no problem.
  4818		 */
  4819		barrier();
  4820
  4821		/*
  4822		 * The transaction ids are globally unique per cpu and per operation on
  4823		 * a per cpu queue. Thus they can guarantee that the cmpxchg_double
  4824		 * occurs on the right processor and that there was no operation on the
  4825		 * linked list in between.
  4826		 */
  4827
  4828		object = c->freelist;
  4829		slab = c->slab;
  4830
  4831	#ifdef CONFIG_NUMA
  4832		if (static_branch_unlikely(&strict_numa) &&
  4833				node == NUMA_NO_NODE) {
  4834
  4835			struct mempolicy *mpol = current->mempolicy;
  4836
  4837			if (mpol) {
  4838				/*
  4839				 * Special BIND rule support. If existing slab
  4840				 * is in permitted set then do not redirect
  4841				 * to a particular node.
  4842				 * Otherwise we apply the memory policy to get
  4843				 * the node we need to allocate on.
  4844				 */
  4845				if (mpol->mode != MPOL_BIND || !slab ||
  4846						!node_isset(slab_nid(slab), mpol->nodes))
  4847
  4848					node = mempolicy_slab_node();
  4849			}
  4850		}
  4851	#endif
  4852
		/* Slow path: no fast path, nothing cached on this cpu, or wrong node */
  4853		if (!USE_LOCKLESS_FAST_PATH() ||
  4854		    unlikely(!object || !slab || !node_match(slab, node))) {
  4855			object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
  4856		} else {
  4857			void *next_object = get_freepointer_safe(s, object);
  4858
  4859			/*
  4860			 * The cmpxchg will only match if there was no additional
  4861			 * operation and if we are on the right processor.
  4862			 *
  4863			 * The cmpxchg does the following atomically (without lock
  4864			 * semantics!)
  4865			 * 1. Relocate first pointer to the current per cpu area.
  4866			 * 2. Verify that tid and freelist have not been changed
  4867			 * 3. If they were not changed replace tid and freelist
  4868			 *
  4869			 * Since this is without lock semantics the protection is only
  4870			 * against code executing on this cpu *not* from access by
  4871			 * other cpus.
  4872			 */
  4873			if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
  4874				note_cmpxchg_failure("slab_alloc", s, tid);
  4875				goto redo;
  4876			}
  4877			prefetch_freepointer(s, next_object);
  4878			stat(s, ALLOC_FASTPATH);
  4879		}
  4880
  4881		return object;
  4882	}
 4883
 4884/*
 4885 * If the object has been wiped upon free, make sure it's fully initialized by
 4886 * zeroing out freelist pointer.
 4887 *
 4888 * Note that we also wipe custom freelist pointers.
 4889 */
 4890static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
 4891						   void *obj)
 4892{
 4893	if (unlikely(slab_want_init_on_free(s)) && obj &&
 4894	    !freeptr_outside_object(s))
 4895		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
 4896			0, sizeof(void *));
 4897}
 4898
 4899static __fastpath_inline
 4900struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
 4901{
 4902	flags &= gfp_allowed_mask;
 4903
 4904	might_alloc(flags);
 4905
 4906	if (unlikely(should_failslab(s, flags)))
 4907		return NULL;
 4908
 4909	return s;
 4910}
 4911
 4912static __fastpath_inline
 4913bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
 4914			  gfp_t flags, size_t size, void **p, bool init,
 4915			  unsigned int orig_size)
 4916{
 4917	unsigned int zero_size = s->object_size;
 4918	bool kasan_init = init;
 4919	size_t i;
 4920	gfp_t init_flags = flags & gfp_allowed_mask;
 4921
 4922	/*
 4923	 * For kmalloc object, the allocated memory size(object_size) is likely
 4924	 * larger than the requested size(orig_size). If redzone check is
 4925	 * enabled for the extra space, don't zero it, as it will be redzoned
 4926	 * soon. The redzone operation for this extra space could be seen as a
 4927	 * replacement of current poisoning under certain debug option, and
 4928	 * won't break other sanity checks.
 4929	 */
 4930	if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) &&
 4931	    (s->flags & SLAB_KMALLOC))
 4932		zero_size = orig_size;
 4933
 4934	/*
 4935	 * When slab_debug is enabled, avoid memory initialization integrated
 4936	 * into KASAN and instead zero out the memory via the memset below with
 4937	 * the proper size. Otherwise, KASAN might overwrite SLUB redzones and
 4938	 * cause false-positive reports. This does not lead to a performance
 4939	 * penalty on production builds, as slab_debug is not intended to be
 4940	 * enabled there.
 4941	 */
 4942	if (__slub_debug_enabled())
 4943		kasan_init = false;
 4944
 4945	/*
 4946	 * As memory initialization might be integrated into KASAN,
 4947	 * kasan_slab_alloc and initialization memset must be
 4948	 * kept together to avoid discrepancies in behavior.
 4949	 *
 4950	 * As p[i] might get tagged, memset and kmemleak hook come after KASAN.
 4951	 */
 4952	for (i = 0; i < size; i++) {
 4953		p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init);
 4954		if (p[i] && init && (!kasan_init ||
 4955				     !kasan_has_integrated_init()))
 4956			memset(p[i], 0, zero_size);
 4957		if (gfpflags_allow_spinning(flags))
 4958			kmemleak_alloc_recursive(p[i], s->object_size, 1,
 4959						 s->flags, init_flags);
 4960		kmsan_slab_alloc(s, p[i], init_flags);
 4961		alloc_tagging_slab_alloc_hook(s, p[i], flags);
 4962	}
 4963
 4964	return memcg_slab_post_alloc_hook(s, lru, flags, size, p);
 4965}
 4966
  4967	/*
  4968	 * Replace the empty main sheaf with a (at least partially) full sheaf.
  4969	 *
  4970	 * Must be called with the cpu_sheaves local lock locked. If successful, returns
  4971	 * the pcs pointer and the local lock locked (possibly on a different cpu than
  4972	 * initially called). If not successful, returns NULL and the local lock
  4973	 * unlocked.
	 *
	 * Sources are tried in this order: a non-empty spare sheaf, a full sheaf
	 * swapped in from the barn and, only when @gfp allows blocking, an empty
	 * sheaf that gets refilled or a newly allocated full sheaf. The last step
	 * runs with the local lock dropped, which is why the pcs pointer is looked
	 * up again afterwards.
  4974	 */
  4975	static struct slub_percpu_sheaves *
  4976	__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp)
  4977	{
  4978		struct slab_sheaf *empty = NULL;
  4979		struct slab_sheaf *full;
  4980		struct node_barn *barn;
  4981		bool can_alloc;
  4982
  4983		lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
  4984
		/* A non-empty spare sheaf simply trades places with the main sheaf */
  4985		if (pcs->spare && pcs->spare->size > 0) {
  4986			swap(pcs->main, pcs->spare);
  4987			return pcs;
  4988		}
  4989
  4990		barn = get_barn(s);
  4991		if (!barn) {
  4992			local_unlock(&s->cpu_sheaves->lock);
  4993			return NULL;
  4994		}
  4995
  4996		full = barn_replace_empty_sheaf(barn, pcs->main);
  4997
  4998		if (full) {
  4999			stat(s, BARN_GET);
  5000			pcs->main = full;
  5001			return pcs;
  5002		}
  5003
  5004		stat(s, BARN_GET_FAIL);
  5005
  5006		can_alloc = gfpflags_allow_blocking(gfp);
  5007
		/* Pick an empty sheaf to refill while the local lock is still held */
  5008		if (can_alloc) {
  5009			if (pcs->spare) {
  5010				empty = pcs->spare;
  5011				pcs->spare = NULL;
  5012			} else {
  5013				empty = barn_get_empty_sheaf(barn);
  5014			}
  5015		}
  5016
  5017		local_unlock(&s->cpu_sheaves->lock);
  5018
  5019		if (!can_alloc)
  5020			return NULL;
  5021
  5022		if (empty) {
			/* A zero return from refill_sheaf() is treated as success here */
  5023			if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC)) {
  5024				full = empty;
  5025			} else {
  5026				/*
  5027				 * we must be very low on memory so don't bother
  5028				 * with the barn
  5029				 */
  5030				free_empty_sheaf(s, empty);
  5031			}
  5032		} else {
  5033			full = alloc_full_sheaf(s, gfp);
  5034		}
  5035
  5036		if (!full)
  5037			return NULL;
  5038
  5039		/*
  5040		 * we can reach here only when gfpflags_allow_blocking
  5041		 * so this must not be an irq
  5042		 */
  5043		local_lock(&s->cpu_sheaves->lock);
  5044		pcs = this_cpu_ptr(s->cpu_sheaves);
  5045
  5046		/*
  5047		 * If we are returning empty sheaf, we either got it from the
  5048		 * barn or had to allocate one. If we are returning a full
  5049		 * sheaf, it's due to racing or being migrated to a different
  5050		 * cpu. Breaching the barn's sheaf limits should be thus rare
  5051		 * enough so just ignore them to simplify the recovery.
  5052		 */
  5053
		/* Main is (still) empty: install the full sheaf in its place */
  5054		if (pcs->main->size == 0) {
  5055			barn_put_empty_sheaf(barn, pcs->main);
  5056			pcs->main = full;
  5057			return pcs;
  5058		}
  5059
		/* Main is not empty: keep the full sheaf as spare if possible */
  5060		if (!pcs->spare) {
  5061			pcs->spare = full;
  5062			return pcs;
  5063		}
  5064
  5065		if (pcs->spare->size == 0) {
  5066			barn_put_empty_sheaf(barn, pcs->spare);
  5067			pcs->spare = full;
  5068			return pcs;
  5069		}
  5070
		/* Neither slot can take the full sheaf: hand it to the barn */
  5071		barn_put_full_sheaf(barn, full);
  5072		stat(s, BARN_PUT);
  5073
  5074		return pcs;
  5075	}
 5076
 5077static __fastpath_inline
 5078void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)
 5079{
 5080	struct slub_percpu_sheaves *pcs;
 5081	bool node_requested;
 5082	void *object;
 5083
 5084#ifdef CONFIG_NUMA
 5085	if (static_branch_unlikely(&strict_numa) &&
 5086			 node == NUMA_NO_NODE) {
 5087
 5088		struct mempolicy *mpol = current->mempolicy;
 5089
 5090		if (mpol) {
 5091			/*
 5092			 * Special BIND rule support. If the local node
 5093			 * is in permitted set then do not redirect
 5094			 * to a particular node.
 5095			 * Otherwise we apply the memory policy to get
 5096			 * the node we need to allocate on.
 5097			 */
 5098			if (mpol->mode != MPOL_BIND ||
 5099					!node_isset(numa_mem_id(), mpol->nodes))
 5100
 5101				node = mempolicy_slab_node();
 5102		}
 5103	}
 5104#endif
 5105
 5106	node_requested = IS_ENABLED(CONFIG_NUMA) && node != NUMA_NO_NODE;
 5107
 5108	/*
 5109	 * We assume the percpu sheaves contain only local objects although it's
 5110	 * not completely guaranteed, so we verify later.
 5111	 */
 5112	if (unlikely(node_requested && node != numa_mem_id()))
 5113		return NULL;
 5114
 5115	if (!local_trylock(&s->cpu_sheaves->lock))
 5116		return NULL;
 5117
 5118	pcs = this_cpu_ptr(s->cpu_sheaves);
 5119
 5120	if (unlikely(pcs->main->size == 0)) {
 5121		pcs = __pcs_replace_empty_main(s, pcs, gfp);
 5122		if (unlikely(!pcs))
 5123			return NULL;
 5124	}
 5125
 5126	object = pcs->main->objects[pcs->main->size - 1];
 5127
 5128	if (unlikely(node_requested)) {
 5129		/*
 5130		 * Verify that the object was from the node we want. This could
 5131		 * be false because of cpu migration during an unlocked part of
 5132		 * the current allocation or previous freeing process.
 5133		 */
 5134		if (page_to_nid(virt_to_page(object)) != node) {
 5135			local_unlock(&s->cpu_sheaves->lock);
 5136			return NULL;
 5137		}
 5138	}
 5139
 5140	pcs->main->size--;
 5141
 5142	local_unlock(&s->cpu_sheaves->lock);
 5143
 5144	stat(s, ALLOC_PCS);
 5145
 5146	return object;
 5147}
 5148
/*
 * Bulk allocation from this cpu's percpu sheaves.
 *
 * Copies up to @size object pointers into the array @p, taking them from the
 * main sheaf in batches and refilling the main sheaf from the spare sheaf or
 * from the barn's full sheaves in between.
 *
 * Returns the number of pointers stored in @p (0 to @size). A short count is
 * returned when the local lock cannot be taken with a trylock, when there is
 * no barn, or when the barn has no full sheaf to hand out. The local lock is
 * not held on return.
 */
static __fastpath_inline
unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
{
	struct slub_percpu_sheaves *pcs;
	struct slab_sheaf *main;
	unsigned int allocated = 0;
	unsigned int batch;

next_batch:
	/* The lock is retaken for every batch; give up with what we have. */
	if (!local_trylock(&s->cpu_sheaves->lock))
		return allocated;

	pcs = this_cpu_ptr(s->cpu_sheaves);

	if (unlikely(pcs->main->size == 0)) {

		struct slab_sheaf *full;
		struct node_barn *barn;

		/* A non-empty spare is the cheapest source of objects. */
		if (pcs->spare && pcs->spare->size > 0) {
			swap(pcs->main, pcs->spare);
			goto do_alloc;
		}

		barn = get_barn(s);
		if (!barn) {
			local_unlock(&s->cpu_sheaves->lock);
			return allocated;
		}

		/* Trade the empty main sheaf for a full one from the barn. */
		full = barn_replace_empty_sheaf(barn, pcs->main);

		if (full) {
			stat(s, BARN_GET);
			pcs->main = full;
			goto do_alloc;
		}

		stat(s, BARN_GET_FAIL);

		local_unlock(&s->cpu_sheaves->lock);

		/*
		 * Once full sheaves in barn are depleted, let the bulk
		 * allocation continue from slab pages, otherwise we would just
		 * be copying arrays of pointers twice.
		 */
		return allocated;
	}

do_alloc:

	main = pcs->main;
	batch = min(size, main->size);

	/* Take the last @batch pointers of the sheaf in one copy. */
	main->size -= batch;
	memcpy(p, main->objects + main->size, batch * sizeof(void *));

	local_unlock(&s->cpu_sheaves->lock);

	stat_add(s, ALLOC_PCS, batch);

	allocated += batch;

	/* Request not satisfied yet: advance the output and try again. */
	if (batch < size) {
		p += batch;
		size -= batch;
		goto next_batch;
	}

	return allocated;
}
 5221
 5222
 5223/*
 5224 * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
 5225 * have the fastpath folded into their functions. So no function call
 5226 * overhead for requests that can be satisfied on the fastpath.
 5227 *
 5228 * The fastpath works by first checking if the lockless freelist can be used.
 5229 * If not then __slab_alloc is called for slow processing.
 5230 *
 5231 * Otherwise we can simply pick the next object from the lockless free list.
 5232 */
 5233static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
 5234		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
 5235{
 5236	void *object;
 5237	bool init = false;
 5238
 5239	s = slab_pre_alloc_hook(s, gfpflags);
 5240	if (unlikely(!s))
 5241		return NULL;
 5242
 5243	object = kfence_alloc(s, orig_size, gfpflags);
 5244	if (unlikely(object))
 5245		goto out;
 5246
 5247	if (s->cpu_sheaves)
 5248		object = alloc_from_pcs(s, gfpflags, node);
 5249
 5250	if (!object)
 5251		object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
 5252
 5253	maybe_wipe_obj_freeptr(s, object);
 5254	init = slab_want_init_on_alloc(gfpflags, s);
 5255
 5256out:
 5257	/*
 5258	 * When init equals 'true', like for kzalloc() family, only
 5259	 * @orig_size bytes might be zeroed instead of s->object_size
 5260	 * In case this fails due to memcg_slab_post_alloc_hook(),
 5261	 * object is set to NULL
 5262	 */
 5263	slab_post_alloc_hook(s, lru, gfpflags, 1, &object, init, orig_size);
 5264
 5265	return object;
 5266}
 5267
 5268void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags)
 5269{
 5270	void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_,
 5271				    s->object_size);
 5272
 5273	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
 5274
 5275	return ret;
 5276}
 5277EXPORT_SYMBOL(kmem_cache_alloc_noprof);
 5278
 5279void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
 5280			   gfp_t gfpflags)
 5281{
 5282	void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_,
 5283				    s->object_size);
 5284
 5285	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
 5286
 5287	return ret;
 5288}
 5289EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof);
 5290
 5291bool kmem_cache_charge(void *objp, gfp_t gfpflags)
 5292{
 5293	if (!memcg_kmem_online())
 5294		return true;
 5295
 5296	return memcg_slab_post_charge(objp, gfpflags);
 5297}
 5298EXPORT_SYMBOL(kmem_cache_charge);
 5299
 5300/**
 5301 * kmem_cache_alloc_node - Allocate an object on the specified node
 5302 * @s: The cache to allocate from.
 5303 * @gfpflags: See kmalloc().
 5304 * @node: node number of the target node.
 5305 *
 5306 * Identical to kmem_cache_alloc but it will allocate memory on the given
 5307 * node, which can improve the performance for cpu bound structures.
 5308 *
 5309 * Fallback to other node is possible if __GFP_THISNODE is not set.
 5310 *
 5311 * Return: pointer to the new object or %NULL in case of error
 5312 */
 5313void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node)
 5314{
 5315	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
 5316
 5317	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node);
 5318
 5319	return ret;
 5320}
 5321EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
 5322
 5323static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
 5324				      struct slab_sheaf *sheaf, gfp_t gfp)
 5325{
 5326	int ret = 0;
 5327
 5328	ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC);
 5329
 5330	if (likely(!ret || !gfp_pfmemalloc_allowed(gfp)))
 5331		return ret;
 5332
 5333	/*
 5334	 * if we are allowed to, refill sheaf with pfmemalloc but then remember
 5335	 * it for when it's returned
 5336	 */
 5337	ret = refill_sheaf(s, sheaf, gfp);
 5338	sheaf->pfmemalloc = true;
 5339
 5340	return ret;
 5341}
 5342
 5343/*
 5344 * returns a sheaf that has at least the requested size
 5345 * when prefilling is needed, do so with given gfp flags
 5346 *
 5347 * return NULL if sheaf allocation or prefilling failed
 5348 */
 5349struct slab_sheaf *
 5350kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
 5351{
 5352	struct slub_percpu_sheaves *pcs;
 5353	struct slab_sheaf *sheaf = NULL;
 5354	struct node_barn *barn;
 5355
 5356	if (unlikely(size > s->sheaf_capacity)) {
 5357
 5358		/*
 5359		 * slab_debug disables cpu sheaves intentionally so all
 5360		 * prefilled sheaves become "oversize" and we give up on
 5361		 * performance for the debugging. Same with SLUB_TINY.
 5362		 * Creating a cache without sheaves and then requesting a
 5363		 * prefilled sheaf is however not expected, so warn.
 5364		 */
 5365		WARN_ON_ONCE(s->sheaf_capacity == 0 &&
 5366			     !IS_ENABLED(CONFIG_SLUB_TINY) &&
 5367			     !(s->flags & SLAB_DEBUG_FLAGS));
 5368
 5369		sheaf = kzalloc(struct_size(sheaf, objects, size), gfp);
 5370		if (!sheaf)
 5371			return NULL;
 5372
 5373		stat(s, SHEAF_PREFILL_OVERSIZE);
 5374		sheaf->cache = s;
 5375		sheaf->capacity = size;
 5376
 5377		/*
 5378		 * we do not need to care about pfmemalloc here because oversize
 5379		 * sheaves area always flushed and freed when returned
 5380		 */
 5381		if (!__kmem_cache_alloc_bulk(s, gfp, size,
 5382					     &sheaf->objects[0])) {
 5383			kfree(sheaf);
 5384			return NULL;
 5385		}
 5386
 5387		sheaf->size = size;
 5388
 5389		return sheaf;
 5390	}
 5391
 5392	local_lock(&s->cpu_sheaves->lock);
 5393	pcs = this_cpu_ptr(s->cpu_sheaves);
 5394
 5395	if (pcs->spare) {
 5396		sheaf = pcs->spare;
 5397		pcs->spare = NULL;
 5398		stat(s, SHEAF_PREFILL_FAST);
 5399	} else {
 5400		barn = get_barn(s);
 5401
 5402		stat(s, SHEAF_PREFILL_SLOW);
 5403		if (barn)
 5404			sheaf = barn_get_full_or_empty_sheaf(barn);
 5405		if (sheaf && sheaf->size)
 5406			stat(s, BARN_GET);
 5407		else
 5408			stat(s, BARN_GET_FAIL);
 5409	}
 5410
 5411	local_unlock(&s->cpu_sheaves->lock);
 5412
 5413
 5414	if (!sheaf)
 5415		sheaf = alloc_empty_sheaf(s, gfp);
 5416
 5417	if (sheaf) {
 5418		sheaf->capacity = s->sheaf_capacity;
 5419		sheaf->pfmemalloc = false;
 5420
 5421		if (sheaf->size < size &&
 5422		    __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) {
 5423			sheaf_flush_unused(s, sheaf);
 5424			free_empty_sheaf(s, sheaf);
 5425			sheaf = NULL;
 5426		}
 5427	}
 5428
 5429	return sheaf;
 5430}
 5431
 5432/*
 5433 * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf()
 5434 *
 5435 * If the sheaf cannot simply become the percpu spare sheaf, but there's space
 5436 * for a full sheaf in the barn, we try to refill the sheaf back to the cache's
 5437 * sheaf_capacity to avoid handling partially full sheaves.
 5438 *
 5439 * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the
 5440 * sheaf is instead flushed and freed.
 5441 */
 5442void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp,
 5443			     struct slab_sheaf *sheaf)
 5444{
 5445	struct slub_percpu_sheaves *pcs;
 5446	struct node_barn *barn;
 5447
 5448	if (unlikely((sheaf->capacity != s->sheaf_capacity)
 5449		     || sheaf->pfmemalloc)) {
 5450		sheaf_flush_unused(s, sheaf);
 5451		kfree(sheaf);
 5452		return;
 5453	}
 5454
 5455	local_lock(&s->cpu_sheaves->lock);
 5456	pcs = this_cpu_ptr(s->cpu_sheaves);
 5457	barn = get_barn(s);
 5458
 5459	if (!pcs->spare) {
 5460		pcs->spare = sheaf;
 5461		sheaf = NULL;
 5462		stat(s, SHEAF_RETURN_FAST);
 5463	}
 5464
 5465	local_unlock(&s->cpu_sheaves->lock);
 5466
 5467	if (!sheaf)
 5468		return;
 5469
 5470	stat(s, SHEAF_RETURN_SLOW);
 5471
 5472	/*
 5473	 * If the barn has too many full sheaves or we fail to refill the sheaf,
 5474	 * simply flush and free it.
 5475	 */
 5476	if (!barn || data_race(barn->nr_full) >= MAX_FULL_SHEAVES ||
 5477	    refill_sheaf(s, sheaf, gfp)) {
 5478		sheaf_flush_unused(s, sheaf);
 5479		free_empty_sheaf(s, sheaf);
 5480		return;
 5481	}
 5482
 5483	barn_put_full_sheaf(barn, sheaf);
 5484	stat(s, BARN_PUT);
 5485}
 5486
 5487/*
 5488 * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least
 5489 * the given size
 5490 *
 5491 * the sheaf might be replaced by a new one when requesting more than
 5492 * s->sheaf_capacity objects if such replacement is necessary, but the refill
 5493 * fails (returning -ENOMEM), the existing sheaf is left intact
 5494 *
 5495 * In practice we always refill to full sheaf's capacity.
 5496 */
 5497int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp,
 5498			    struct slab_sheaf **sheafp, unsigned int size)
 5499{
 5500	struct slab_sheaf *sheaf;
 5501
 5502	/*
 5503	 * TODO: do we want to support *sheaf == NULL to be equivalent of
 5504	 * kmem_cache_prefill_sheaf() ?
 5505	 */
 5506	if (!sheafp || !(*sheafp))
 5507		return -EINVAL;
 5508
 5509	sheaf = *sheafp;
 5510	if (sheaf->size >= size)
 5511		return 0;
 5512
 5513	if (likely(sheaf->capacity >= size)) {
 5514		if (likely(sheaf->capacity == s->sheaf_capacity))
 5515			return __prefill_sheaf_pfmemalloc(s, sheaf, gfp);
 5516
 5517		if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
 5518					     &sheaf->objects[sheaf->size])) {
 5519			return -ENOMEM;
 5520		}
 5521		sheaf->size = sheaf->capacity;
 5522
 5523		return 0;
 5524	}
 5525
 5526	/*
 5527	 * We had a regular sized sheaf and need an oversize one, or we had an
 5528	 * oversize one already but need a larger one now.
 5529	 * This should be a very rare path so let's not complicate it.
 5530	 */
 5531	sheaf = kmem_cache_prefill_sheaf(s, gfp, size);
 5532	if (!sheaf)
 5533		return -ENOMEM;
 5534
 5535	kmem_cache_return_sheaf(s, gfp, *sheafp);
 5536	*sheafp = sheaf;
 5537	return 0;
 5538}
 5539
 5540/*
 5541 * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf()
 5542 *
 5543 * Guaranteed not to fail as many allocations as was the requested size.
 5544 * After the sheaf is emptied, it fails - no fallback to the slab cache itself.
 5545 *
 5546 * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT
 5547 * memcg charging is forced over limit if necessary, to avoid failure.
 5548 *
 5549 * It is possible that the allocation comes from kfence and then the sheaf
 5550 * size is not decreased.
 5551 */
 5552void *
 5553kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp,
 5554				   struct slab_sheaf *sheaf)
 5555{
 5556	void *ret = NULL;
 5557	bool init;
 5558
 5559	if (sheaf->size == 0)
 5560		goto out;
 5561
 5562	ret = kfence_alloc(s, s->object_size, gfp);
 5563
 5564	if (likely(!ret))
 5565		ret = sheaf->objects[--sheaf->size];
 5566
 5567	init = slab_want_init_on_alloc(gfp, s);
 5568
 5569	/* add __GFP_NOFAIL to force successful memcg charging */
 5570	slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size);
 5571out:
 5572	trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE);
 5573
 5574	return ret;
 5575}
 5576
 5577unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf)
 5578{
 5579	return sheaf->size;
 5580}
 5581/*
 5582 * To avoid unnecessary overhead, we pass through large allocation requests
 5583 * directly to the page allocator. We use __GFP_COMP, because we will need to
 5584 * know the allocation order to free the pages properly in kfree.
 5585 */
 5586static void *___kmalloc_large_node(size_t size, gfp_t flags, int node)
 5587{
 5588	struct page *page;
 5589	void *ptr = NULL;
 5590	unsigned int order = get_order(size);
 5591
 5592	if (unlikely(flags & GFP_SLAB_BUG_MASK))
 5593		flags = kmalloc_fix_flags(flags);
 5594
 5595	flags |= __GFP_COMP;
 5596
 5597	if (node == NUMA_NO_NODE)
 5598		page = alloc_frozen_pages_noprof(flags, order);
 5599	else
 5600		page = __alloc_frozen_pages_noprof(flags, order, node, NULL);
 5601
 5602	if (page) {
 5603		ptr = page_address(page);
 5604		mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
 5605				      PAGE_SIZE << order);
 5606		__SetPageLargeKmalloc(page);
 5607	}
 5608
 5609	ptr = kasan_kmalloc_large(ptr, size, flags);
 5610	/* As ptr might get tagged, call kmemleak hook after KASAN. */
 5611	kmemleak_alloc(ptr, size, 1, flags);
 5612	kmsan_kmalloc_large(ptr, size, flags);
 5613
 5614	return ptr;
 5615}
 5616
 5617void *__kmalloc_large_noprof(size_t size, gfp_t flags)
 5618{
 5619	void *ret = ___kmalloc_large_node(size, flags, NUMA_NO_NODE);
 5620
 5621	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
 5622		      flags, NUMA_NO_NODE);
 5623	return ret;
 5624}
 5625EXPORT_SYMBOL(__kmalloc_large_noprof);
 5626
 5627void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
 5628{
 5629	void *ret = ___kmalloc_large_node(size, flags, node);
 5630
 5631	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
 5632		      flags, node);
 5633	return ret;
 5634}
 5635EXPORT_SYMBOL(__kmalloc_large_node_noprof);
 5636
 5637static __always_inline
 5638void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
 5639			unsigned long caller)
 5640{
 5641	struct kmem_cache *s;
 5642	void *ret;
 5643
 5644	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
 5645		ret = __kmalloc_large_node_noprof(size, flags, node);
 5646		trace_kmalloc(caller, ret, size,
 5647			      PAGE_SIZE << get_order(size), flags, node);
 5648		return ret;
 5649	}
 5650
 5651	if (unlikely(!size))
 5652		return ZERO_SIZE_PTR;
 5653
 5654	s = kmalloc_slab(size, b, flags, caller);
 5655
 5656	ret = slab_alloc_node(s, NULL, flags, node, caller, size);
 5657	ret = kasan_kmalloc(s, ret, size, flags);
 5658	trace_kmalloc(caller, ret, size, s->size, flags, node);
 5659	return ret;
 5660}
 5661void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
 5662{
 5663	return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_);
 5664}
 5665EXPORT_SYMBOL(__kmalloc_node_noprof);
 5666
 5667void *__kmalloc_noprof(size_t size, gfp_t flags)
 5668{
 5669	return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_);
 5670}
 5671EXPORT_SYMBOL(__kmalloc_noprof);
 5672
/**
 * kmalloc_nolock - Allocate an object of given size from any context.
 * @size: size to allocate
 * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT
 * allowed.
 * @node: node number of the target node.
 *
 * A @size of zero yields ZERO_SIZE_PTR. Sizes above KMALLOC_MAX_CACHE_SIZE
 * are not supported and yield NULL.
 *
 * Return: pointer to the new object or NULL in case of error.
 * NULL does not mean EBUSY or EAGAIN. It means ENOMEM.
 * There is no reason to call it again and expect !NULL.
 */
void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
{
	gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
	struct kmem_cache *s;
	bool can_retry = true;
	/*
	 * Starts out as -EBUSY so that skipping __slab_alloc_node() below is
	 * handled the same way as __slab_alloc_node() returning -EBUSY.
	 */
	void *ret = ERR_PTR(-EBUSY);

	VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
				      __GFP_NO_OBJ_EXT));

	if (unlikely(!size))
		return ZERO_SIZE_PTR;

	if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
		/* kmalloc_nolock() in PREEMPT_RT is not supported from irq */
		return NULL;
retry:
	/* Rechecked on retry because size is bumped to the next bucket. */
	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
		return NULL;
	s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_);

	if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
		/*
		 * kmalloc_nolock() is not supported on architectures that
		 * don't implement cmpxchg16b, but debug caches don't use
		 * per-cpu slab and per-cpu partial slabs. They rely on
		 * kmem_cache_node->list_lock, so kmalloc_nolock() can
		 * attempt to allocate from debug caches by
		 * spin_trylock_irqsave(&n->list_lock, ...)
		 */
		return NULL;

	/*
	 * Do not call slab_alloc_node(), since trylock mode isn't
	 * compatible with slab_pre_alloc_hook/should_failslab and
	 * kfence_alloc. Hence call __slab_alloc_node() (at most twice)
	 * and slab_post_alloc_hook() directly.
	 *
	 * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
	 * in irq saved region. It assumes that the same cpu will not
	 * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
	 * Therefore use in_nmi() to check whether particular bucket is in
	 * irq protected section.
	 *
	 * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
	 * this cpu was interrupted somewhere inside ___slab_alloc() after
	 * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
	 * In this case fast path with __update_cpu_freelist_fast() is not safe.
	 */
	if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
		ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);

	if (PTR_ERR(ret) == -EBUSY) {
		if (can_retry) {
			/* pick the next kmalloc bucket */
			size = s->object_size + 1;
			/*
			 * Another alternative is to
			 * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
			 * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
			 * to retry from bucket of the same size.
			 */
			can_retry = false;
			goto retry;
		}
		/* Second attempt was busy as well: report plain failure. */
		ret = NULL;
	}

	/*
	 * Note that after a retry, size here is the bumped value
	 * (previous bucket's object_size + 1), not what the caller asked for.
	 */
	maybe_wipe_obj_freeptr(s, ret);
	slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
			     slab_want_init_on_alloc(alloc_gfp, s), size);

	ret = kasan_kmalloc(s, ret, size, alloc_gfp);
	return ret;
}
EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof);
 5760
 5761void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags,
 5762					 int node, unsigned long caller)
 5763{
 5764	return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller);
 5765
 5766}
 5767EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof);
 5768
 5769void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size)
 5770{
 5771	void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE,
 5772					    _RET_IP_, size);
 5773
 5774	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE);
 5775
 5776	ret = kasan_kmalloc(s, ret, size, gfpflags);
 5777	return ret;
 5778}
 5779EXPORT_SYMBOL(__kmalloc_cache_noprof);
 5780
 5781void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags,
 5782				  int node, size_t size)
 5783{
 5784	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
 5785
 5786	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node);
 5787
 5788	ret = kasan_kmalloc(s, ret, size, gfpflags);
 5789	return ret;
 5790}
 5791EXPORT_SYMBOL(__kmalloc_cache_node_noprof);
 5792
/*
 * Free a chain of objects back to @slab with all work done under the node's
 * list_lock. __slab_free() uses this path for CONFIG_SLUB_TINY and for
 * debug caches.
 *
 * @head/@tail delimit the chain being freed, @bulk_cnt is the number of
 * objects the caller believes it contains, and @addr is passed on to
 * free_debug_processing() together with the stack depot handle.
 *
 * If free_debug_processing() rejects the free, the slab's freelist and
 * partial/full list membership are left untouched.
 */
static noinline void free_to_partial_list(
	struct kmem_cache *s, struct slab *slab,
	void *head, void *tail, int bulk_cnt,
	unsigned long addr)
{
	struct kmem_cache_node *n = get_node(s, slab_nid(slab));
	struct slab *slab_free = NULL;
	/* free_debug_processing() takes a pointer and may adjust the count */
	int cnt = bulk_cnt;
	unsigned long flags;
	depot_stack_handle_t handle = 0;

	/*
	 * We cannot use GFP_NOWAIT as there are callsites where waking up
	 * kswapd could deadlock
	 */
	if (s->flags & SLAB_STORE_USER)
		handle = set_track_prepare(__GFP_NOWARN);

	spin_lock_irqsave(&n->list_lock, flags);

	if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) {
		/* NULL prior means the slab had no free objects before */
		void *prior = slab->freelist;

		/* Perform the actual freeing while we still hold the locks */
		slab->inuse -= cnt;
		set_freepointer(s, tail, prior);
		slab->freelist = head;

		/*
		 * If the slab is empty, and node's partial list is full,
		 * it should be discarded anyway no matter it's on full or
		 * partial list.
		 */
		if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
			slab_free = slab;

		if (!prior) {
			/* was on full list */
			remove_full(s, n, slab);
			if (!slab_free) {
				add_partial(n, slab, DEACTIVATE_TO_TAIL);
				stat(s, FREE_ADD_PARTIAL);
			}
		} else if (slab_free) {
			/* was on the partial list and is about to be freed */
			remove_partial(n, slab);
			stat(s, FREE_REMOVE_PARTIAL);
		}
	}

	if (slab_free) {
		/*
		 * Update the counters while still holding n->list_lock to
		 * prevent spurious validation warnings
		 */
		dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
	}

	spin_unlock_irqrestore(&n->list_lock, flags);

	/* The slab itself is released only after dropping the list_lock. */
	if (slab_free) {
		stat(s, FREE_SLAB);
		free_slab(s, slab_free);
	}
}
 5857
/*
 * Slow path handling. This may still be called frequently since objects
 * have a longer lifetime than the cpu slabs in most processing loads.
 *
 * So we still attempt to reduce cache line usage. Just take the slab
 * lock and free the item. If there is no additional partial slab
 * handling required then we can return immediately.
 *
 * @head/@tail delimit the chain of @cnt objects being freed to @slab; @addr
 * is only used on the free_to_partial_list() path.
 */
static void __slab_free(struct kmem_cache *s, struct slab *slab,
			void *head, void *tail, int cnt,
			unsigned long addr)

{
	bool was_frozen, was_full;
	struct freelist_counters old, new;
	/* non-NULL n means the node's list_lock is currently held */
	struct kmem_cache_node *n = NULL;
	unsigned long flags;
	/* only assigned (and only read) when the list_lock was taken */
	bool on_node_partial;

	stat(s, FREE_SLOWPATH);

	if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
		free_to_partial_list(s, slab, head, tail, cnt, addr);
		return;
	}

	/*
	 * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below
	 * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s)
	 * is the only other reason it can be false, and it is already handled
	 * above.
	 */

	do {
		/* Retry: drop the lock taken speculatively in the last round */
		if (unlikely(n)) {
			spin_unlock_irqrestore(&n->list_lock, flags);
			n = NULL;
		}

		old.freelist = slab->freelist;
		old.counters = slab->counters;

		was_full = (old.freelist == NULL);
		/*
		 * NOTE(review): frozen and inuse are read/modified right after
		 * counters is copied, so they appear to overlay counters in
		 * struct freelist_counters - confirm against its definition.
		 */
		was_frozen = old.frozen;

		/* Link the freed chain in front of the current freelist. */
		set_freepointer(s, tail, old.freelist);

		new.freelist = head;
		new.counters = old.counters;
		new.inuse -= cnt;

		/*
		 * Might need to be taken off (due to becoming empty) or added
		 * to (due to not being full anymore) the partial list.
		 * Unless it's frozen.
		 */
		if ((!new.inuse || was_full) && !was_frozen) {
			/*
			 * If slab becomes non-full and we have cpu partial
			 * lists, we put it there unconditionally to avoid
			 * taking the list_lock. Otherwise we need it.
			 */
			if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) {

				n = get_node(s, slab_nid(slab));
				/*
				 * Speculatively acquire the list_lock.
				 * If the cmpxchg does not succeed then we may
				 * drop the list_lock without any processing.
				 *
				 * Otherwise the list_lock will synchronize with
				 * other processors updating the list of slabs.
				 */
				spin_lock_irqsave(&n->list_lock, flags);

				on_node_partial = slab_test_node_partial(slab);
			}
		}

	} while (!slab_update_freelist(s, slab, &old, &new, "__slab_free"));

	if (likely(!n)) {

		if (likely(was_frozen)) {
			/*
			 * The list lock was not taken therefore no list
			 * activity can be necessary.
			 */
			stat(s, FREE_FROZEN);
		} else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) {
			/*
			 * If we started with a full slab then put it onto the
			 * per cpu partial list.
			 */
			put_cpu_partial(s, slab, 1);
			stat(s, CPU_PARTIAL_FREE);
		}

		/*
		 * In other cases we didn't take the list_lock because the slab
		 * was already on the partial list and will remain there.
		 */

		return;
	}

	/* From here on the list_lock is held and must be released. */

	/*
	 * This slab was partially empty but not on the per-node partial list,
	 * in which case we shouldn't manipulate its list, just return.
	 */
	if (!was_full && !on_node_partial) {
		spin_unlock_irqrestore(&n->list_lock, flags);
		return;
	}

	/*
	 * If slab became empty, should we add/keep it on the partial list or we
	 * have enough?
	 */
	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
		goto slab_empty;

	/*
	 * Objects left in the slab. If it was not on the partial list before
	 * then add it. This can only happen when cache has no per cpu partial
	 * list otherwise we would have put it there.
	 */
	if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) {
		add_partial(n, slab, DEACTIVATE_TO_TAIL);
		stat(s, FREE_ADD_PARTIAL);
	}
	spin_unlock_irqrestore(&n->list_lock, flags);
	return;

slab_empty:
	/*
	 * The slab could have a single object and thus go from full to empty in
	 * a single free, but more likely it was on the partial list. Remove it.
	 */
	if (likely(!was_full)) {
		remove_partial(n, slab);
		stat(s, FREE_REMOVE_PARTIAL);
	}

	spin_unlock_irqrestore(&n->list_lock, flags);
	stat(s, FREE_SLAB);
	discard_slab(s, slab);
}
 6006
 6007/*
 6008 * pcs is locked. We should have get rid of the spare sheaf and obtained an
 6009 * empty sheaf, while the main sheaf is full. We want to install the empty sheaf
 6010 * as a main sheaf, and make the current main sheaf a spare sheaf.
 6011 *
 6012 * However due to having relinquished the cpu_sheaves lock when obtaining
 6013 * the empty sheaf, we need to handle some unlikely but possible cases.
 6014 *
 6015 * If we put any sheaf to barn here, it's because we were interrupted or have
 6016 * been migrated to a different cpu, which should be rare enough so just ignore
 6017 * the barn's limits to simplify the handling.
 6018 *
 6019 * An alternative scenario that gets us here is when we fail
 6020 * barn_replace_full_sheaf(), because there's no empty sheaf available in the
 6021 * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the
 6022 * limit on full sheaves was not exceeded, we assume it didn't change and just
 6023 * put the full sheaf there.
 6024 */
 6025static void __pcs_install_empty_sheaf(struct kmem_cache *s,
 6026		struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty,
 6027		struct node_barn *barn)
 6028{
 6029	lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
 6030
 6031	/* This is what we expect to find if nobody interrupted us. */
 6032	if (likely(!pcs->spare)) {
 6033		pcs->spare = pcs->main;
 6034		pcs->main = empty;
 6035		return;
 6036	}
 6037
 6038	/*
 6039	 * Unlikely because if the main sheaf had space, we would have just
 6040	 * freed to it. Get rid of our empty sheaf.
 6041	 */
 6042	if (pcs->main->size < s->sheaf_capacity) {
 6043		barn_put_empty_sheaf(barn, empty);
 6044		return;
 6045	}
 6046
 6047	/* Also unlikely for the same reason */
 6048	if (pcs->spare->size < s->sheaf_capacity) {
 6049		swap(pcs->main, pcs->spare);
 6050		barn_put_empty_sheaf(barn, empty);
 6051		return;
 6052	}
 6053
 6054	/*
 6055	 * We probably failed barn_replace_full_sheaf() due to no empty sheaf
 6056	 * available there, but we allocated one, so finish the job.
 6057	 */
 6058	barn_put_full_sheaf(barn, pcs->main);
 6059	stat(s, BARN_PUT);
 6060	pcs->main = empty;
 6061}
 6062
/*
 * Replace the full main sheaf with a (at least partially) empty sheaf.
 *
 * Must be called with the cpu_sheaves local lock locked. If successful, returns
 * the pcs pointer and the local lock locked (possibly on a different cpu than
 * initially called). If not successful, returns NULL and the local lock
 * unlocked.
 *
 * Sources of an empty sheaf, in the order tried: an empty sheaf from the barn
 * (when there is no spare), a non-full spare, a barn swap of the full main
 * sheaf, the flushed spare (barn full), a freshly allocated sheaf, and as
 * a last resort flushing the main sheaf itself.
 */
static struct slub_percpu_sheaves *
__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
{
	struct slab_sheaf *empty;
	struct node_barn *barn;
	/* true when a failed barn put still has to be counted in stats */
	bool put_fail;

restart:
	lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));

	barn = get_barn(s);
	if (!barn) {
		local_unlock(&s->cpu_sheaves->lock);
		return NULL;
	}

	put_fail = false;

	if (!pcs->spare) {
		/* No spare: the full main sheaf can simply become the spare. */
		empty = barn_get_empty_sheaf(barn);
		if (empty) {
			pcs->spare = pcs->main;
			pcs->main = empty;
			return pcs;
		}
		goto alloc_empty;
	}

	if (pcs->spare->size < s->sheaf_capacity) {
		swap(pcs->main, pcs->spare);
		return pcs;
	}

	/* Both main and spare are full: trade main for an empty sheaf. */
	empty = barn_replace_full_sheaf(barn, pcs->main);

	if (!IS_ERR(empty)) {
		stat(s, BARN_PUT);
		pcs->main = empty;
		return pcs;
	}

	if (PTR_ERR(empty) == -E2BIG) {
		/* Since we got here, spare exists and is full */
		struct slab_sheaf *to_flush = pcs->spare;

		stat(s, BARN_PUT_FAIL);

		/* Detach the spare under the lock, flush it without the lock. */
		pcs->spare = NULL;
		local_unlock(&s->cpu_sheaves->lock);

		sheaf_flush_unused(s, to_flush);
		empty = to_flush;
		goto got_empty;
	}

	/*
	 * We could not replace full sheaf because barn had no empty
	 * sheaves. We can still allocate it and put the full sheaf in
	 * __pcs_install_empty_sheaf(), but if we fail to allocate it,
	 * make sure to count the fail.
	 */
	put_fail = true;

alloc_empty:
	local_unlock(&s->cpu_sheaves->lock);

	empty = alloc_empty_sheaf(s, GFP_NOWAIT);
	if (empty)
		goto got_empty;

	if (put_fail)
		 stat(s, BARN_PUT_FAIL);

	/* Last resort: make room by flushing the main sheaf. */
	if (!sheaf_flush_main(s))
		return NULL;

	if (!local_trylock(&s->cpu_sheaves->lock))
		return NULL;

	pcs = this_cpu_ptr(s->cpu_sheaves);

	/*
	 * we flushed the main sheaf so it should be empty now,
	 * but in case we got preempted or migrated, we need to
	 * check again
	 */
	if (pcs->main->size == s->sheaf_capacity)
		goto restart;

	return pcs;

got_empty:
	/* The lock was dropped on every path leading here; retake it. */
	if (!local_trylock(&s->cpu_sheaves->lock)) {
		barn_put_empty_sheaf(barn, empty);
		return NULL;
	}

	/* pcs is looked up again after retaking the lock. */
	pcs = this_cpu_ptr(s->cpu_sheaves);
	__pcs_install_empty_sheaf(s, pcs, empty, barn);

	return pcs;
}
 6173
 6174/*
 6175 * Free an object to the percpu sheaves.
 6176 * The object is expected to have passed slab_free_hook() already.
 6177 */
 6178static __fastpath_inline
 6179bool free_to_pcs(struct kmem_cache *s, void *object)
 6180{
 6181	struct slub_percpu_sheaves *pcs;
 6182
 6183	if (!local_trylock(&s->cpu_sheaves->lock))
 6184		return false;
 6185
 6186	pcs = this_cpu_ptr(s->cpu_sheaves);
 6187
 6188	if (unlikely(pcs->main->size == s->sheaf_capacity)) {
 6189
 6190		pcs = __pcs_replace_full_main(s, pcs);
 6191		if (unlikely(!pcs))
 6192			return false;
 6193	}
 6194
 6195	pcs->main->objects[pcs->main->size++] = object;
 6196
 6197	local_unlock(&s->cpu_sheaves->lock);
 6198
 6199	stat(s, FREE_PCS);
 6200
 6201	return true;
 6202}
 6203
/*
 * RCU callback for a sheaf queued with call_rcu() by __kfree_rcu_sheaf().
 *
 * After __rcu_free_sheaf_prepare() has processed the sheaf, it is either put
 * into the barn of node sheaf->node as a full sheaf, or its objects are
 * flushed and the then empty sheaf is put into the barn's empty list or
 * freed.
 */
static void rcu_free_sheaf(struct rcu_head *head)
{
	struct kmem_cache_node *n;
	struct slab_sheaf *sheaf;
	/* stays NULL when we flush before the node's barn was looked up */
	struct node_barn *barn = NULL;
	struct kmem_cache *s;

	sheaf = container_of(head, struct slab_sheaf, rcu_head);

	s = sheaf->cache;

	/*
	 * This may remove some objects due to slab_free_hook() returning false,
	 * so that the sheaf might no longer be completely full. But it's easier
	 * to handle it as full (unless it became completely empty), as the code
	 * handles it fine. The only downside is that sheaf will serve fewer
	 * allocations when reused. It only happens due to debugging, which is a
	 * performance hit anyway.
	 *
	 * If it returns true, there was at least one object from pfmemalloc
	 * slab so simply flush everything.
	 */
	if (__rcu_free_sheaf_prepare(s, sheaf))
		goto flush;

	n = get_node(s, sheaf->node);
	if (!n)
		goto flush;

	barn = n->barn;

	/* due to slab_free_hook() */
	if (unlikely(sheaf->size == 0))
		goto empty;

	/*
	 * Checking nr_full/nr_empty outside lock avoids contention in case the
	 * barn is at the respective limit. Due to the race we might go over the
	 * limit but that should be rare and harmless.
	 */

	if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) {
		stat(s, BARN_PUT);
		barn_put_full_sheaf(barn, sheaf);
		return;
	}

flush:
	/* The sheaf could not be kept as a full one: release its objects */
	stat(s, BARN_PUT_FAIL);
	sheaf_flush_unused(s, sheaf);

empty:
	/* Keep the empty sheaf for reuse if a barn is known and has room */
	if (barn && data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) {
		barn_put_empty_sheaf(barn, sheaf);
		return;
	}

	free_empty_sheaf(s, sheaf);
}
 6263
/*
 * Try to queue @obj for RCU-deferred freeing via this CPU's rcu_free sheaf.
 *
 * Returns true if the object was added to the sheaf. When that makes the
 * sheaf reach s->sheaf_capacity, the sheaf is detached from the percpu
 * structure and passed to call_rcu() with rcu_free_sheaf() as the callback.
 *
 * Returns false if the percpu lock could not be taken or no empty sheaf could
 * be obtained; @obj has not been queued in that case.
 */
bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
{
	struct slub_percpu_sheaves *pcs;
	struct slab_sheaf *rcu_sheaf;

	if (!local_trylock(&s->cpu_sheaves->lock))
		goto fail;

	pcs = this_cpu_ptr(s->cpu_sheaves);

	if (unlikely(!pcs->rcu_free)) {

		struct slab_sheaf *empty;
		struct node_barn *barn;

		/* An empty spare sheaf can be repurposed right away */
		if (pcs->spare && pcs->spare->size == 0) {
			pcs->rcu_free = pcs->spare;
			pcs->spare = NULL;
			goto do_free;
		}

		barn = get_barn(s);
		if (!barn) {
			local_unlock(&s->cpu_sheaves->lock);
			goto fail;
		}

		empty = barn_get_empty_sheaf(barn);

		if (empty) {
			pcs->rcu_free = empty;
			goto do_free;
		}

		/* Allocate a new empty sheaf with the percpu lock dropped */
		local_unlock(&s->cpu_sheaves->lock);

		empty = alloc_empty_sheaf(s, GFP_NOWAIT);

		if (!empty)
			goto fail;

		if (!local_trylock(&s->cpu_sheaves->lock)) {
			barn_put_empty_sheaf(barn, empty);
			goto fail;
		}

		/* The lock was dropped, so re-read the percpu pointer */
		pcs = this_cpu_ptr(s->cpu_sheaves);

		/* An rcu_free sheaf may have been installed in the meantime */
		if (unlikely(pcs->rcu_free))
			barn_put_empty_sheaf(barn, empty);
		else
			pcs->rcu_free = empty;
	}

do_free:

	rcu_sheaf = pcs->rcu_free;

	/*
	 * Since we flush immediately when size reaches capacity, we never reach
	 * this with size already at capacity, so no OOB write is possible.
	 */
	rcu_sheaf->objects[rcu_sheaf->size++] = obj;

	/* rcu_sheaf stays non-NULL only if the sheaf just became full */
	if (likely(rcu_sheaf->size < s->sheaf_capacity)) {
		rcu_sheaf = NULL;
	} else {
		pcs->rcu_free = NULL;
		rcu_sheaf->node = numa_mem_id();
	}

	/*
	 * we flush before local_unlock to make sure a racing
	 * flush_all_rcu_sheaves() doesn't miss this sheaf
	 */
	if (rcu_sheaf)
		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);

	local_unlock(&s->cpu_sheaves->lock);

	stat(s, FREE_RCU_SHEAF);
	return true;

fail:
	stat(s, FREE_RCU_SHEAF_FAIL);
	return false;
}
 6351
 6352/*
 6353 * Bulk free objects to the percpu sheaves.
 6354 * Unlike free_to_pcs() this includes the calls to all necessary hooks
 6355 * and the fallback to freeing to slab pages.
 6356 */
 6357static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
 6358{
 6359	struct slub_percpu_sheaves *pcs;
 6360	struct slab_sheaf *main, *empty;
 6361	bool init = slab_want_init_on_free(s);
 6362	unsigned int batch, i = 0;
 6363	struct node_barn *barn;
 6364	void *remote_objects[PCS_BATCH_MAX];
 6365	unsigned int remote_nr = 0;
 6366	int node = numa_mem_id();
 6367
 6368next_remote_batch:
 6369	while (i < size) {
 6370		struct slab *slab = virt_to_slab(p[i]);
 6371
 6372		memcg_slab_free_hook(s, slab, p + i, 1);
 6373		alloc_tagging_slab_free_hook(s, slab, p + i, 1);
 6374
 6375		if (unlikely(!slab_free_hook(s, p[i], init, false))) {
 6376			p[i] = p[--size];
 6377			continue;
 6378		}
 6379
 6380		if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)
 6381			     || slab_test_pfmemalloc(slab))) {
 6382			remote_objects[remote_nr] = p[i];
 6383			p[i] = p[--size];
 6384			if (++remote_nr >= PCS_BATCH_MAX)
 6385				goto flush_remote;
 6386			continue;
 6387		}
 6388
 6389		i++;
 6390	}
 6391
 6392	if (!size)
 6393		goto flush_remote;
 6394
 6395next_batch:
 6396	if (!local_trylock(&s->cpu_sheaves->lock))
 6397		goto fallback;
 6398
 6399	pcs = this_cpu_ptr(s->cpu_sheaves);
 6400
 6401	if (likely(pcs->main->size < s->sheaf_capacity))
 6402		goto do_free;
 6403
 6404	barn = get_barn(s);
 6405	if (!barn)
 6406		goto no_empty;
 6407
 6408	if (!pcs->spare) {
 6409		empty = barn_get_empty_sheaf(barn);
 6410		if (!empty)
 6411			goto no_empty;
 6412
 6413		pcs->spare = pcs->main;
 6414		pcs->main = empty;
 6415		goto do_free;
 6416	}
 6417
 6418	if (pcs->spare->size < s->sheaf_capacity) {
 6419		swap(pcs->main, pcs->spare);
 6420		goto do_free;
 6421	}
 6422
 6423	empty = barn_replace_full_sheaf(barn, pcs->main);
 6424	if (IS_ERR(empty)) {
 6425		stat(s, BARN_PUT_FAIL);
 6426		goto no_empty;
 6427	}
 6428
 6429	stat(s, BARN_PUT);
 6430	pcs->main = empty;
 6431
 6432do_free:
 6433	main = pcs->main;
 6434	batch = min(size, s->sheaf_capacity - main->size);
 6435
 6436	memcpy(main->objects + main->size, p, batch * sizeof(void *));
 6437	main->size += batch;
 6438
 6439	local_unlock(&s->cpu_sheaves->lock);
 6440
 6441	stat_add(s, FREE_PCS, batch);
 6442
 6443	if (batch < size) {
 6444		p += batch;
 6445		size -= batch;
 6446		goto next_batch;
 6447	}
 6448
 6449	if (remote_nr)
 6450		goto flush_remote;
 6451
 6452	return;
 6453
 6454no_empty:
 6455	local_unlock(&s->cpu_sheaves->lock);
 6456
 6457	/*
 6458	 * if we depleted all empty sheaves in the barn or there are too
 6459	 * many full sheaves, free the rest to slab pages
 6460	 */
 6461fallback:
 6462	__kmem_cache_free_bulk(s, size, p);
 6463
 6464flush_remote:
 6465	if (remote_nr) {
 6466		__kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]);
 6467		if (i < size) {
 6468			remote_nr = 0;
 6469			goto next_remote_batch;
 6470		}
 6471	}
 6472}
 6473
/*
 * Per-cpu state for frees that are postponed to irq_work context.
 *
 * @objects: lockless list of objects queued by defer_free()
 * @slabs:   lockless list of slabs queued by defer_deactivate_slab()
 * @work:    irq_work running free_deferred_objects() to drain both lists
 */
struct defer_free {
	struct llist_head objects;
	struct llist_head slabs;
	struct irq_work work;
};

static void free_deferred_objects(struct irq_work *work);

/* One instance per cpu, with empty lists and the work handler pre-set */
static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = {
	.objects = LLIST_HEAD_INIT(objects),
	.slabs = LLIST_HEAD_INIT(slabs),
	.work = IRQ_WORK_INIT(free_deferred_objects),
};
 6487
 6488/*
 6489 * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe
 6490 * to take sleeping spin_locks from __slab_free() and deactivate_slab().
 6491 * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore().
 6492 */
 6493static void free_deferred_objects(struct irq_work *work)
 6494{
 6495	struct defer_free *df = container_of(work, struct defer_free, work);
 6496	struct llist_head *objs = &df->objects;
 6497	struct llist_head *slabs = &df->slabs;
 6498	struct llist_node *llnode, *pos, *t;
 6499
 6500	if (llist_empty(objs) && llist_empty(slabs))
 6501		return;
 6502
 6503	llnode = llist_del_all(objs);
 6504	llist_for_each_safe(pos, t, llnode) {
 6505		struct kmem_cache *s;
 6506		struct slab *slab;
 6507		void *x = pos;
 6508
 6509		slab = virt_to_slab(x);
 6510		s = slab->slab_cache;
 6511
 6512		/* Point 'x' back to the beginning of allocated object */
 6513		x -= s->offset;
 6514
 6515		/*
 6516		 * We used freepointer in 'x' to link 'x' into df->objects.
 6517		 * Clear it to NULL to avoid false positive detection
 6518		 * of "Freepointer corruption".
 6519		 */
 6520		set_freepointer(s, x, NULL);
 6521
 6522		__slab_free(s, slab, x, x, 1, _THIS_IP_);
 6523	}
 6524
 6525	llnode = llist_del_all(slabs);
 6526	llist_for_each_safe(pos, t, llnode) {
 6527		struct slab *slab = container_of(pos, struct slab, llnode);
 6528
 6529		if (slab->frozen)
 6530			deactivate_slab(slab->slab_cache, slab, slab->flush_freelist);
 6531		else
 6532			free_slab(slab->slab_cache, slab);
 6533	}
 6534}
 6535
 6536static void defer_free(struct kmem_cache *s, void *head)
 6537{
 6538	struct defer_free *df;
 6539
 6540	guard(preempt)();
 6541
 6542	head = kasan_reset_tag(head);
 6543
 6544	df = this_cpu_ptr(&defer_free_objects);
 6545	if (llist_add(head + s->offset, &df->objects))
 6546		irq_work_queue(&df->work);
 6547}
 6548
 6549static void defer_deactivate_slab(struct slab *slab, void *flush_freelist)
 6550{
 6551	struct defer_free *df;
 6552
 6553	slab->flush_freelist = flush_freelist;
 6554
 6555	guard(preempt)();
 6556
 6557	df = this_cpu_ptr(&defer_free_objects);
 6558	if (llist_add(&slab->llnode, &df->slabs))
 6559		irq_work_queue(&df->work);
 6560}
 6561
 6562void defer_free_barrier(void)
 6563{
 6564	int cpu;
 6565
 6566	for_each_possible_cpu(cpu)
 6567		irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work);
 6568}
 6569
/*
 * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
 * can perform fastpath freeing without additional function calls.
 *
 * The fastpath is only possible if we are freeing to the current cpu slab
 * of this processor. This is typically the case if we have just allocated
 * the item before.
 *
 * If fastpath is not possible then fall back to __slab_free where we deal
 * with all sorts of special processing.
 *
 * Bulk free of a freelist with several objects (all pointing to the
 * same slab) possible by specifying head and tail ptr, plus objects
 * count (cnt). Bulk free indicated by tail pointer being set.
 *
 * A @cnt of 0 marks a call from kfree_nolock(): spinning is not allowed then
 * and the object is put on the deferred free list whenever the fastpath
 * cannot be used.
 */
static __always_inline void do_slab_free(struct kmem_cache *s,
				struct slab *slab, void *head, void *tail,
				int cnt, unsigned long addr)
{
	/* cnt == 0 signals that it's called from kfree_nolock() */
	bool allow_spin = cnt;
	struct kmem_cache_cpu *c;
	unsigned long tid;
	void **freelist;

redo:
	/*
	 * Determine the current cpu's per cpu slab.
	 * The cpu may change afterward. However that does not matter since
	 * data is retrieved via this pointer. If we are on the same cpu
	 * during the cmpxchg then the free will succeed.
	 */
	c = raw_cpu_ptr(s->cpu_slab);
	tid = READ_ONCE(c->tid);

	/* Same with comment on barrier() in __slab_alloc_node() */
	barrier();

	/* Not the current cpu slab: take the slow path (or defer) */
	if (unlikely(slab != c->slab)) {
		if (unlikely(!allow_spin)) {
			/*
			 * __slab_free() can locklessly cmpxchg16 into a slab,
			 * but then it might need to take spin_lock or local_lock
			 * in put_cpu_partial() for further processing.
			 * Avoid the complexity and simply add to a deferred list.
			 */
			defer_free(s, head);
		} else {
			__slab_free(s, slab, head, tail, cnt, addr);
		}
		return;
	}

	if (unlikely(!allow_spin)) {
		/* The cpu_slab lock is already held here: do not touch it */
		if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) &&
		    local_lock_is_locked(&s->cpu_slab->lock)) {
			defer_free(s, head);
			return;
		}
		cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */
	}

	if (USE_LOCKLESS_FAST_PATH()) {
		freelist = READ_ONCE(c->freelist);

		/* Chain the old cpu freelist behind the objects being freed */
		set_freepointer(s, tail, freelist);

		if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
			note_cmpxchg_failure("slab_free", s, tid);
			goto redo;
		}
	} else {
		__maybe_unused unsigned long flags = 0;

		/* Update the free list under the local lock */
		local_lock_cpu_slab(s, flags);
		c = this_cpu_ptr(s->cpu_slab);
		/* Recheck under the lock; start over if the cpu slab changed */
		if (unlikely(slab != c->slab)) {
			local_unlock_cpu_slab(s, flags);
			goto redo;
		}
		tid = c->tid;
		freelist = c->freelist;

		set_freepointer(s, tail, freelist);
		c->freelist = head;
		c->tid = next_tid(tid);

		local_unlock_cpu_slab(s, flags);
	}
	stat_add(s, FREE_FASTPATH, cnt);
}
 6662
 6663static __fastpath_inline
 6664void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
 6665	       unsigned long addr)
 6666{
 6667	memcg_slab_free_hook(s, slab, &object, 1);
 6668	alloc_tagging_slab_free_hook(s, slab, &object, 1);
 6669
 6670	if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
 6671		return;
 6672
 6673	if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) ||
 6674				     slab_nid(slab) == numa_mem_id())
 6675			   && likely(!slab_test_pfmemalloc(slab))) {
 6676		if (likely(free_to_pcs(s, object)))
 6677			return;
 6678	}
 6679
 6680	do_slab_free(s, slab, object, object, 1, addr);
 6681}
 6682
 6683#ifdef CONFIG_MEMCG
 6684/* Do not inline the rare memcg charging failed path into the allocation path */
 6685static noinline
 6686void memcg_alloc_abort_single(struct kmem_cache *s, void *object)
 6687{
 6688	if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
 6689		do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_);
 6690}
 6691#endif
 6692
 6693static __fastpath_inline
 6694void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head,
 6695		    void *tail, void **p, int cnt, unsigned long addr)
 6696{
 6697	memcg_slab_free_hook(s, slab, p, cnt);
 6698	alloc_tagging_slab_free_hook(s, slab, p, cnt);
 6699	/*
 6700	 * With KASAN enabled slab_free_freelist_hook modifies the freelist
 6701	 * to remove objects, whose reuse must be delayed.
 6702	 */
 6703	if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt)))
 6704		do_slab_free(s, slab, head, tail, cnt, addr);
 6705}
 6706
 6707#ifdef CONFIG_SLUB_RCU_DEBUG
 6708static void slab_free_after_rcu_debug(struct rcu_head *rcu_head)
 6709{
 6710	struct rcu_delayed_free *delayed_free =
 6711			container_of(rcu_head, struct rcu_delayed_free, head);
 6712	void *object = delayed_free->object;
 6713	struct slab *slab = virt_to_slab(object);
 6714	struct kmem_cache *s;
 6715
 6716	kfree(delayed_free);
 6717
 6718	if (WARN_ON(is_kfence_address(object)))
 6719		return;
 6720
 6721	/* find the object and the cache again */
 6722	if (WARN_ON(!slab))
 6723		return;
 6724	s = slab->slab_cache;
 6725	if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU)))
 6726		return;
 6727
 6728	/* resume freeing */
 6729	if (slab_free_hook(s, object, slab_want_init_on_free(s), true))
 6730		do_slab_free(s, slab, object, object, 1, _THIS_IP_);
 6731}
 6732#endif /* CONFIG_SLUB_RCU_DEBUG */
 6733
 6734#ifdef CONFIG_KASAN_GENERIC
 6735void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
 6736{
 6737	do_slab_free(cache, virt_to_slab(x), x, x, 1, addr);
 6738}
 6739#endif
 6740
 6741static inline struct kmem_cache *virt_to_cache(const void *obj)
 6742{
 6743	struct slab *slab;
 6744
 6745	slab = virt_to_slab(obj);
 6746	if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__))
 6747		return NULL;
 6748	return slab->slab_cache;
 6749}
 6750
 6751static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
 6752{
 6753	struct kmem_cache *cachep;
 6754
 6755	if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
 6756	    !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS))
 6757		return s;
 6758
 6759	cachep = virt_to_cache(x);
 6760	if (WARN(cachep && cachep != s,
 6761		 "%s: Wrong slab cache. %s but object is from %s\n",
 6762		 __func__, s->name, cachep->name))
 6763		print_tracking(cachep, x);
 6764	return cachep;
 6765}
 6766
 6767/**
 6768 * kmem_cache_free - Deallocate an object
 6769 * @s: The cache the allocation was from.
 6770 * @x: The previously allocated object.
 6771 *
 6772 * Free an object which was previously allocated from this
 6773 * cache.
 6774 */
 6775void kmem_cache_free(struct kmem_cache *s, void *x)
 6776{
 6777	s = cache_from_obj(s, x);
 6778	if (!s)
 6779		return;
 6780	trace_kmem_cache_free(_RET_IP_, x, s);
 6781	slab_free(s, virt_to_slab(x), x, _RET_IP_);
 6782}
 6783EXPORT_SYMBOL(kmem_cache_free);
 6784
 6785static void free_large_kmalloc(struct page *page, void *object)
 6786{
 6787	unsigned int order = compound_order(page);
 6788
 6789	if (WARN_ON_ONCE(!PageLargeKmalloc(page))) {
 6790		dump_page(page, "Not a kmalloc allocation");
 6791		return;
 6792	}
 6793
 6794	if (WARN_ON_ONCE(order == 0))
 6795		pr_warn_once("object pointer: 0x%p\n", object);
 6796
 6797	kmemleak_free(object);
 6798	kasan_kfree_large(object);
 6799	kmsan_kfree_large(object);
 6800
 6801	mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
 6802			      -(PAGE_SIZE << order));
 6803	__ClearPageLargeKmalloc(page);
 6804	free_frozen_pages(page, order);
 6805}
 6806
 6807/*
 6808 * Given an rcu_head embedded within an object obtained from kvmalloc at an
 6809 * offset < 4k, free the object in question.
 6810 */
 6811void kvfree_rcu_cb(struct rcu_head *head)
 6812{
 6813	void *obj = head;
 6814	struct page *page;
 6815	struct slab *slab;
 6816	struct kmem_cache *s;
 6817	void *slab_addr;
 6818
 6819	if (is_vmalloc_addr(obj)) {
 6820		obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
 6821		vfree(obj);
 6822		return;
 6823	}
 6824
 6825	page = virt_to_page(obj);
 6826	slab = page_slab(page);
 6827	if (!slab) {
 6828		/*
 6829		 * rcu_head offset can be only less than page size so no need to
 6830		 * consider allocation order
 6831		 */
 6832		obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj);
 6833		free_large_kmalloc(page, obj);
 6834		return;
 6835	}
 6836
 6837	s = slab->slab_cache;
 6838	slab_addr = slab_address(slab);
 6839
 6840	if (is_kfence_address(obj)) {
 6841		obj = kfence_object_start(obj);
 6842	} else {
 6843		unsigned int idx = __obj_to_index(s, slab_addr, obj);
 6844
 6845		obj = slab_addr + s->size * idx;
 6846		obj = fixup_red_left(s, obj);
 6847	}
 6848
 6849	slab_free(s, slab, obj, _RET_IP_);
 6850}
 6851
 6852/**
 6853 * kfree - free previously allocated memory
 6854 * @object: pointer returned by kmalloc() or kmem_cache_alloc()
 6855 *
 6856 * If @object is NULL, no operation is performed.
 6857 */
 6858void kfree(const void *object)
 6859{
 6860	struct page *page;
 6861	struct slab *slab;
 6862	struct kmem_cache *s;
 6863	void *x = (void *)object;
 6864
 6865	trace_kfree(_RET_IP_, object);
 6866
 6867	if (unlikely(ZERO_OR_NULL_PTR(object)))
 6868		return;
 6869
 6870	page = virt_to_page(object);
 6871	slab = page_slab(page);
 6872	if (!slab) {
 6873		free_large_kmalloc(page, (void *)object);
 6874		return;
 6875	}
 6876
 6877	s = slab->slab_cache;
 6878	slab_free(s, slab, x, _RET_IP_);
 6879}
 6880EXPORT_SYMBOL(kfree);
 6881
/*
 * Can be called while holding raw_spinlock_t or from IRQ and NMI,
 * but ONLY for objects allocated by kmalloc_nolock().
 * Debug checks (like kmemleak and kfence) were skipped on allocation,
 * hence
 * obj = kmalloc(); kfree_nolock(obj);
 * will miss kmemleak/kfence book keeping and will cause false positives.
 * large_kmalloc is not supported either.
 *
 * If @object is NULL or ZERO_SIZE_PTR, no operation is performed.
 */
void kfree_nolock(const void *object)
{
	struct slab *slab;
	struct kmem_cache *s;
	void *x = (void *)object;

	if (unlikely(ZERO_OR_NULL_PTR(object)))
		return;

	slab = virt_to_slab(object);
	if (unlikely(!slab)) {
		WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()");
		return;
	}

	s = slab->slab_cache;

	memcg_slab_free_hook(s, slab, &x, 1);
	alloc_tagging_slab_free_hook(s, slab, &x, 1);
	/*
	 * Unlike slab_free() do NOT call the following:
	 * kmemleak_free_recursive(x, s->flags);
	 * debug_check_no_locks_freed(x, s->object_size);
	 * debug_check_no_obj_freed(x, s->object_size);
	 * __kcsan_check_access(x, s->object_size, ..);
	 * kfence_free(x);
	 * since they take spinlocks or not safe from any context.
	 */
	kmsan_slab_free(s, x);
	/*
	 * If KASAN finds a kernel bug it will do kasan_report_invalid_free()
	 * which will call raw_spin_lock_irqsave() which is technically
	 * unsafe from NMI, but take chance and report kernel bug.
	 * The sequence of
	 * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI
	 *  -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU
	 * is double buggy and deserves to deadlock.
	 */
	if (kasan_slab_pre_free(s, x))
		return;
	/*
	 * memcg, kasan_slab_pre_free are done for 'x'.
	 * The only thing left is kasan_poison without quarantine,
	 * since kasan quarantine takes locks and not supported from NMI.
	 */
	kasan_slab_free(s, x, false, false, /* skip quarantine */true);
	/* cnt == 0 tells do_slab_free() that it is called from kfree_nolock() */
	do_slab_free(s, slab, x, x, 0, _RET_IP_);
}
 6939EXPORT_SYMBOL_GPL(kfree_nolock);
 6940
 6941static __always_inline __realloc_size(2) void *
 6942__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid)
 6943{
 6944	void *ret;
 6945	size_t ks = 0;
 6946	int orig_size = 0;
 6947	struct kmem_cache *s = NULL;
 6948
 6949	if (unlikely(ZERO_OR_NULL_PTR(p)))
 6950		goto alloc_new;
 6951
 6952	/* Check for double-free. */
 6953	if (!kasan_check_byte(p))
 6954		return NULL;
 6955
 6956	/*
 6957	 * If reallocation is not necessary (e. g. the new size is less
 6958	 * than the current allocated size), the current allocation will be
 6959	 * preserved unless __GFP_THISNODE is set. In the latter case a new
 6960	 * allocation on the requested node will be attempted.
 6961	 */
 6962	if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE &&
 6963		     nid != page_to_nid(virt_to_page(p)))
 6964		goto alloc_new;
 6965
 6966	if (is_kfence_address(p)) {
 6967		ks = orig_size = kfence_ksize(p);
 6968	} else {
 6969		struct page *page = virt_to_page(p);
 6970		struct slab *slab = page_slab(page);
 6971
 6972		if (!slab) {
 6973			/* Big kmalloc object */
 6974			ks = page_size(page);
 6975			WARN_ON(ks <= KMALLOC_MAX_CACHE_SIZE);
 6976			WARN_ON(p != page_address(page));
 6977		} else {
 6978			s = slab->slab_cache;
 6979			orig_size = get_orig_size(s, (void *)p);
 6980			ks = s->object_size;
 6981		}
 6982	}
 6983
 6984	/* If the old object doesn't fit, allocate a bigger one */
 6985	if (new_size > ks)
 6986		goto alloc_new;
 6987
 6988	/* If the old object doesn't satisfy the new alignment, allocate a new one */
 6989	if (!IS_ALIGNED((unsigned long)p, align))
 6990		goto alloc_new;
 6991
 6992	/* Zero out spare memory. */
 6993	if (want_init_on_alloc(flags)) {
 6994		kasan_disable_current();
 6995		if (orig_size && orig_size < new_size)
 6996			memset(kasan_reset_tag(p) + orig_size, 0, new_size - orig_size);
 6997		else
 6998			memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
 6999		kasan_enable_current();
 7000	}
 7001
 7002	/* Setup kmalloc redzone when needed */
 7003	if (s && slub_debug_orig_size(s)) {
 7004		set_orig_size(s, (void *)p, new_size);
 7005		if (s->flags & SLAB_RED_ZONE && new_size < ks)
 7006			memset_no_sanitize_memory(kasan_reset_tag(p) + new_size,
 7007						SLUB_RED_ACTIVE, ks - new_size);
 7008	}
 7009
 7010	p = kasan_krealloc(p, new_size, flags);
 7011	return (void *)p;
 7012
 7013alloc_new:
 7014	ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_);
 7015	if (ret && p) {
 7016		/* Disable KASAN checks as the object's redzone is accessed. */
 7017		kasan_disable_current();
 7018		memcpy(ret, kasan_reset_tag(p), orig_size ?: ks);
 7019		kasan_enable_current();
 7020	}
 7021
 7022	return ret;
 7023}
 7024
 7025/**
 7026 * krealloc_node_align - reallocate memory. The contents will remain unchanged.
 7027 * @p: object to reallocate memory for.
 7028 * @new_size: how many bytes of memory are required.
 7029 * @align: desired alignment.
 7030 * @flags: the type of memory to allocate.
 7031 * @nid: NUMA node or NUMA_NO_NODE
 7032 *
 7033 * If @p is %NULL, krealloc() behaves exactly like kmalloc().  If @new_size
 7034 * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
 7035 *
 7036 * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
 7037 * Documentation/core-api/memory-allocation.rst for more details.
 7038 *
 7039 * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
 7040 * initial memory allocation, every subsequent call to this API for the same
 7041 * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
 7042 * __GFP_ZERO is not fully honored by this API.
 7043 *
 7044 * When slub_debug_orig_size() is off, krealloc() only knows about the bucket
 7045 * size of an allocation (but not the exact size it was allocated with) and
 7046 * hence implements the following semantics for shrinking and growing buffers
 7047 * with __GFP_ZERO::
 7048 *
 7049 *           new             bucket
 7050 *   0       size             size
 7051 *   |--------|----------------|
 7052 *   |  keep  |      zero      |
 7053 *
 7054 * Otherwise, the original allocation size 'orig_size' could be used to
 7055 * precisely clear the requested size, and the new size will also be stored
 7056 * as the new 'orig_size'.
 7057 *
 7058 * In any case, the contents of the object pointed to are preserved up to the
 7059 * lesser of the new and old sizes.
 7060 *
 7061 * Return: pointer to the allocated memory or %NULL in case of error
 7062 */
 7063void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align,
 7064				 gfp_t flags, int nid)
 7065{
 7066	void *ret;
 7067
 7068	if (unlikely(!new_size)) {
 7069		kfree(p);
 7070		return ZERO_SIZE_PTR;
 7071	}
 7072
 7073	ret = __do_krealloc(p, new_size, align, flags, nid);
 7074	if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
 7075		kfree(p);
 7076
 7077	return ret;
 7078}
 7079EXPORT_SYMBOL(krealloc_node_align_noprof);
 7080
 7081static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
 7082{
 7083	/*
 7084	 * We want to attempt a large physically contiguous block first because
 7085	 * it is less likely to fragment multiple larger blocks and therefore
 7086	 * contribute to a long term fragmentation less than vmalloc fallback.
 7087	 * However make sure that larger requests are not too disruptive - i.e.
 7088	 * do not direct reclaim unless physically continuous memory is preferred
 7089	 * (__GFP_RETRY_MAYFAIL mode). We still kick in kswapd/kcompactd to
 7090	 * start working in the background
 7091	 */
 7092	if (size > PAGE_SIZE) {
 7093		flags |= __GFP_NOWARN;
 7094
 7095		if (!(flags & __GFP_RETRY_MAYFAIL))
 7096			flags &= ~__GFP_DIRECT_RECLAIM;
 7097
 7098		/* nofail semantic is implemented by the vmalloc fallback */
 7099		flags &= ~__GFP_NOFAIL;
 7100	}
 7101
 7102	return flags;
 7103}
 7104
 7105/**
 7106 * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
 7107 * failure, fall back to non-contiguous (vmalloc) allocation.
 7108 * @size: size of the request.
 7109 * @b: which set of kmalloc buckets to allocate from.
 7110 * @align: desired alignment.
 7111 * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
 7112 * @node: numa node to allocate from
 7113 *
 7114 * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
 7115 * Documentation/core-api/memory-allocation.rst for more details.
 7116 *
 7117 * Uses kmalloc to get the memory but if the allocation fails then falls back
 7118 * to the vmalloc allocator. Use kvfree for freeing the memory.
 7119 *
 7120 * GFP_NOWAIT and GFP_ATOMIC are supported, the __GFP_NORETRY modifier is not.
 7121 * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
 7122 * preferable to the vmalloc fallback, due to visible performance drawbacks.
 7123 *
 * Return: pointer to the allocated memory or %NULL in case of failure
 7125 */
 7126void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align,
 7127			     gfp_t flags, int node)
 7128{
 7129	bool allow_block;
 7130	void *ret;
 7131
 7132	/*
 7133	 * It doesn't really make sense to fallback to vmalloc for sub page
 7134	 * requests
 7135	 */
 7136	ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b),
 7137				kmalloc_gfp_adjust(flags, size),
 7138				node, _RET_IP_);
 7139	if (ret || size <= PAGE_SIZE)
 7140		return ret;
 7141
 7142	/* Don't even allow crazy sizes */
 7143	if (unlikely(size > INT_MAX)) {
 7144		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
 7145		return NULL;
 7146	}
 7147
 7148	/*
 7149	 * For non-blocking the VM_ALLOW_HUGE_VMAP is not used
 7150	 * because the huge-mapping path in vmalloc contains at
 7151	 * least one might_sleep() call.
 7152	 *
 7153	 * TODO: Revise huge-mapping path to support non-blocking
 7154	 * flags.
 7155	 */
 7156	allow_block = gfpflags_allow_blocking(flags);
 7157
 7158	/*
 7159	 * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
 7160	 * since the callers already cannot assume anything
 7161	 * about the resulting pointer, and cannot play
 7162	 * protection games.
 7163	 */
 7164	return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
 7165			flags, PAGE_KERNEL, allow_block ? VM_ALLOW_HUGE_VMAP:0,
 7166			node, __builtin_return_address(0));
 7167}
 7168EXPORT_SYMBOL(__kvmalloc_node_noprof);
 7169
 7170/**
 7171 * kvfree() - Free memory.
 7172 * @addr: Pointer to allocated memory.
 7173 *
 7174 * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
 7175 * It is slightly more efficient to use kfree() or vfree() if you are certain
 7176 * that you know which one to use.
 7177 *
 7178 * Context: Either preemptible task context or not-NMI interrupt.
 7179 */
 7180void kvfree(const void *addr)
 7181{
 7182	if (is_vmalloc_addr(addr))
 7183		vfree(addr);
 7184	else
 7185		kfree(addr);
 7186}
 7187EXPORT_SYMBOL(kvfree);
 7188
 7189/**
 7190 * kvfree_sensitive - Free a data object containing sensitive information.
 7191 * @addr: address of the data object to be freed.
 7192 * @len: length of the data object.
 7193 *
 7194 * Use the special memzero_explicit() function to clear the content of a
 7195 * kvmalloc'ed object containing sensitive data to make sure that the
 7196 * compiler won't optimize out the data clearing.
 7197 */
 7198void kvfree_sensitive(const void *addr, size_t len)
 7199{
 7200	if (likely(!ZERO_OR_NULL_PTR(addr))) {
 7201		memzero_explicit((void *)addr, len);
 7202		kvfree(addr);
 7203	}
 7204}
 7205EXPORT_SYMBOL(kvfree_sensitive);
 7206
 7207/**
 7208 * kvrealloc_node_align - reallocate memory; contents remain unchanged
 7209 * @p: object to reallocate memory for
 7210 * @size: the size to reallocate
 7211 * @align: desired alignment
 7212 * @flags: the flags for the page level allocator
 7213 * @nid: NUMA node id
 7214 *
 7215 * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
 7216 * and @p is not a %NULL pointer, the object pointed to is freed.
 7217 *
 7218 * Only alignments up to those guaranteed by kmalloc() will be honored. Please see
 7219 * Documentation/core-api/memory-allocation.rst for more details.
 7220 *
 7221 * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
 7222 * initial memory allocation, every subsequent call to this API for the same
 7223 * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
 7224 * __GFP_ZERO is not fully honored by this API.
 7225 *
 7226 * In any case, the contents of the object pointed to are preserved up to the
 7227 * lesser of the new and old sizes.
 7228 *
 7229 * This function must not be called concurrently with itself or kvfree() for the
 7230 * same memory allocation.
 7231 *
 7232 * Return: pointer to the allocated memory or %NULL in case of error
 7233 */
 7234void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align,
 7235				  gfp_t flags, int nid)
 7236{
 7237	void *n;
 7238
 7239	if (is_vmalloc_addr(p))
 7240		return vrealloc_node_align_noprof(p, size, align, flags, nid);
 7241
 7242	n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid);
 7243	if (!n) {
 7244		/* We failed to krealloc(), fall back to kvmalloc(). */
 7245		n = kvmalloc_node_align_noprof(size, align, flags, nid);
 7246		if (!n)
 7247			return NULL;
 7248
 7249		if (p) {
 7250			/* We already know that `p` is not a vmalloc address. */
 7251			kasan_disable_current();
 7252			memcpy(n, kasan_reset_tag(p), ksize(p));
 7253			kasan_enable_current();
 7254
 7255			kfree(p);
 7256		}
 7257	}
 7258
 7259	return n;
 7260}
 7261EXPORT_SYMBOL(kvrealloc_node_align_noprof);
 7262
 7263struct detached_freelist {
 7264	struct slab *slab;
 7265	void *tail;
 7266	void *freelist;
 7267	int cnt;
 7268	struct kmem_cache *s;
 7269};
 7270
 7271/*
 7272 * This function progressively scans the array with free objects (with
 7273 * a limited look ahead) and extract objects belonging to the same
 7274 * slab.  It builds a detached freelist directly within the given
 7275 * slab/objects.  This can happen without any need for
 7276 * synchronization, because the objects are owned by running process.
 7277 * The freelist is build up as a single linked list in the objects.
 7278 * The idea is, that this detached freelist can then be bulk
 7279 * transferred to the real freelist(s), but only requiring a single
 7280 * synchronization primitive.  Look ahead in the array is limited due
 7281 * to performance reasons.
 7282 */
 7283static inline
 7284int build_detached_freelist(struct kmem_cache *s, size_t size,
 7285			    void **p, struct detached_freelist *df)
 7286{
 7287	int lookahead = 3;
 7288	void *object;
 7289	struct page *page;
 7290	struct slab *slab;
 7291	size_t same;
 7292
 7293	object = p[--size];
 7294	page = virt_to_page(object);
 7295	slab = page_slab(page);
 7296	if (!s) {
 7297		/* Handle kalloc'ed objects */
 7298		if (!slab) {
 7299			free_large_kmalloc(page, object);
 7300			df->slab = NULL;
 7301			return size;
 7302		}
 7303		/* Derive kmem_cache from object */
 7304		df->slab = slab;
 7305		df->s = slab->slab_cache;
 7306	} else {
 7307		df->slab = slab;
 7308		df->s = cache_from_obj(s, object); /* Support for memcg */
 7309	}
 7310
 7311	/* Start new detached freelist */
 7312	df->tail = object;
 7313	df->freelist = object;
 7314	df->cnt = 1;
 7315
 7316	if (is_kfence_address(object))
 7317		return size;
 7318
 7319	set_freepointer(df->s, object, NULL);
 7320
 7321	same = size;
 7322	while (size) {
 7323		object = p[--size];
 7324		/* df->slab is always set at this point */
 7325		if (df->slab == virt_to_slab(object)) {
 7326			/* Opportunity build freelist */
 7327			set_freepointer(df->s, object, df->freelist);
 7328			df->freelist = object;
 7329			df->cnt++;
 7330			same--;
 7331			if (size != same)
 7332				swap(p[size], p[same]);
 7333			continue;
 7334		}
 7335
 7336		/* Limit look ahead search */
 7337		if (!--lookahead)
 7338			break;
 7339	}
 7340
 7341	return same;
 7342}
 7343
 7344/*
 7345 * Internal bulk free of objects that were not initialised by the post alloc
 7346 * hooks and thus should not be processed by the free hooks
 7347 */
 7348static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
 7349{
 7350	if (!size)
 7351		return;
 7352
 7353	do {
 7354		struct detached_freelist df;
 7355
 7356		size = build_detached_freelist(s, size, p, &df);
 7357		if (!df.slab)
 7358			continue;
 7359
 7360		if (kfence_free(df.freelist))
 7361			continue;
 7362
 7363		do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt,
 7364			     _RET_IP_);
 7365	} while (likely(size));
 7366}
 7367
 7368/* Note that interrupts must be enabled when calling this function. */
 7369void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
 7370{
 7371	if (!size)
 7372		return;
 7373
 7374	/*
 7375	 * freeing to sheaves is so incompatible with the detached freelist so
 7376	 * once we go that way, we have to do everything differently
 7377	 */
 7378	if (s && s->cpu_sheaves) {
 7379		free_to_pcs_bulk(s, size, p);
 7380		return;
 7381	}
 7382
 7383	do {
 7384		struct detached_freelist df;
 7385
 7386		size = build_detached_freelist(s, size, p, &df);
 7387		if (!df.slab)
 7388			continue;
 7389
 7390		slab_free_bulk(df.s, df.slab, df.freelist, df.tail, &p[size],
 7391			       df.cnt, _RET_IP_);
 7392	} while (likely(size));
 7393}
 7394EXPORT_SYMBOL(kmem_cache_free_bulk);
 7395
 7396static inline
 7397int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 7398			    void **p)
 7399{
 7400	struct kmem_cache_cpu *c;
 7401	unsigned long irqflags;
 7402	int i;
 7403
 7404	/*
 7405	 * Drain objects in the per cpu slab, while disabling local
 7406	 * IRQs, which protects against PREEMPT and interrupts
 7407	 * handlers invoking normal fastpath.
 7408	 */
 7409	c = slub_get_cpu_ptr(s->cpu_slab);
 7410	local_lock_irqsave(&s->cpu_slab->lock, irqflags);
 7411
 7412	for (i = 0; i < size; i++) {
 7413		void *object = c->freelist;
 7414
 7415		if (unlikely(!object)) {
 7416			/*
 7417			 * We may have removed an object from c->freelist using
 7418			 * the fastpath in the previous iteration; in that case,
 7419			 * c->tid has not been bumped yet.
 7420			 * Since ___slab_alloc() may reenable interrupts while
 7421			 * allocating memory, we should bump c->tid now.
 7422			 */
 7423			c->tid = next_tid(c->tid);
 7424
 7425			local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
 7426
 7427			/*
 7428			 * Invoking slow path likely have side-effect
 7429			 * of re-populating per CPU c->freelist
 7430			 */
 7431			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
 7432					    _RET_IP_, c, s->object_size);
 7433			if (unlikely(!p[i]))
 7434				goto error;
 7435
 7436			c = this_cpu_ptr(s->cpu_slab);
 7437			maybe_wipe_obj_freeptr(s, p[i]);
 7438
 7439			local_lock_irqsave(&s->cpu_slab->lock, irqflags);
 7440
 7441			continue; /* goto for-loop */
 7442		}
 7443		c->freelist = get_freepointer(s, object);
 7444		p[i] = object;
 7445		maybe_wipe_obj_freeptr(s, p[i]);
 7446		stat(s, ALLOC_FASTPATH);
 7447	}
 7448	c->tid = next_tid(c->tid);
 7449	local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
 7450	slub_put_cpu_ptr(s->cpu_slab);
 7451
 7452	return i;
 7453
 7454error:
 7455	slub_put_cpu_ptr(s->cpu_slab);
 7456	__kmem_cache_free_bulk(s, i, p);
 7457	return 0;
 7458
 7459}
 7460
 7461/* Note that interrupts must be enabled when calling this function. */
 7462int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
 7463				 void **p)
 7464{
 7465	unsigned int i = 0;
 7466	void *kfence_obj;
 7467
 7468	if (!size)
 7469		return 0;
 7470
 7471	s = slab_pre_alloc_hook(s, flags);
 7472	if (unlikely(!s))
 7473		return 0;
 7474
 7475	/*
 7476	 * to make things simpler, only assume at most once kfence allocated
 7477	 * object per bulk allocation and choose its index randomly
 7478	 */
 7479	kfence_obj = kfence_alloc(s, s->object_size, flags);
 7480
 7481	if (unlikely(kfence_obj)) {
 7482		if (unlikely(size == 1)) {
 7483			p[0] = kfence_obj;
 7484			goto out;
 7485		}
 7486		size--;
 7487	}
 7488
 7489	if (s->cpu_sheaves)
 7490		i = alloc_from_pcs_bulk(s, size, p);
 7491
 7492	if (i < size) {
 7493		/*
 7494		 * If we ran out of memory, don't bother with freeing back to
 7495		 * the percpu sheaves, we have bigger problems.
 7496		 */
 7497		if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) {
 7498			if (i > 0)
 7499				__kmem_cache_free_bulk(s, i, p);
 7500			if (kfence_obj)
 7501				__kfence_free(kfence_obj);
 7502			return 0;
 7503		}
 7504	}
 7505
 7506	if (unlikely(kfence_obj)) {
 7507		int idx = get_random_u32_below(size + 1);
 7508
 7509		if (idx != size)
 7510			p[size] = p[idx];
 7511		p[idx] = kfence_obj;
 7512
 7513		size++;
 7514	}
 7515
 7516out:
 7517	/*
 7518	 * memcg and kmem_cache debug support and memory initialization.
 7519	 * Done outside of the IRQ disabled fastpath loop.
 7520	 */
 7521	if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p,
 7522		    slab_want_init_on_alloc(flags, s), s->object_size))) {
 7523		return 0;
 7524	}
 7525
 7526	return size;
 7527}
 7528EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof);
 7529
 7530/*
 7531 * Object placement in a slab is made very easy because we always start at
 7532 * offset 0. If we tune the size of the object to the alignment then we can
 7533 * get the required alignment by putting one properly sized object after
 7534 * another.
 7535 *
 7536 * Notice that the allocation order determines the sizes of the per cpu
 7537 * caches. Each processor has always one slab available for allocations.
 7538 * Increasing the allocation order reduces the number of times that slabs
 7539 * must be moved on and off the partial lists and is therefore a factor in
 7540 * locking overhead.
 7541 */
 7542
 7543/*
 7544 * Minimum / Maximum order of slab pages. This influences locking overhead
 7545 * and slab fragmentation. A higher order reduces the number of partial slabs
 7546 * and increases the number of allocations possible without having to
 7547 * take the list_lock.
 7548 */
 7549static unsigned int slub_min_order;
 7550static unsigned int slub_max_order =
 7551	IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER;
 7552static unsigned int slub_min_objects;
 7553
 7554/*
 7555 * Calculate the order of allocation given an slab object size.
 7556 *
 7557 * The order of allocation has significant impact on performance and other
 7558 * system components. Generally order 0 allocations should be preferred since
 7559 * order 0 does not cause fragmentation in the page allocator. Larger objects
 7560 * be problematic to put into order 0 slabs because there may be too much
 7561 * unused space left. We go to a higher order if more than 1/16th of the slab
 7562 * would be wasted.
 7563 *
 7564 * In order to reach satisfactory performance we must ensure that a minimum
 7565 * number of objects is in one slab. Otherwise we may generate too much
 7566 * activity on the partial lists which requires taking the list_lock. This is
 7567 * less a concern for large slabs though which are rarely used.
 7568 *
 7569 * slab_max_order specifies the order where we begin to stop considering the
 7570 * number of objects in a slab as critical. If we reach slab_max_order then
 7571 * we try to keep the page order as low as possible. So we accept more waste
 7572 * of space in favor of a small page order.
 7573 *
 7574 * Higher order allocations also allow the placement of more objects in a
 7575 * slab and thereby reduce object handling overhead. If the user has
 7576 * requested a higher minimum order then we start with that one instead of
 7577 * the smallest order which will fit the object.
 7578 */
 7579static inline unsigned int calc_slab_order(unsigned int size,
 7580		unsigned int min_order, unsigned int max_order,
 7581		unsigned int fract_leftover)
 7582{
 7583	unsigned int order;
 7584
 7585	for (order = min_order; order <= max_order; order++) {
 7586
 7587		unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
 7588		unsigned int rem;
 7589
 7590		rem = slab_size % size;
 7591
 7592		if (rem <= slab_size / fract_leftover)
 7593			break;
 7594	}
 7595
 7596	return order;
 7597}
 7598
 7599static inline int calculate_order(unsigned int size)
 7600{
 7601	unsigned int order;
 7602	unsigned int min_objects;
 7603	unsigned int max_objects;
 7604	unsigned int min_order;
 7605
 7606	min_objects = slub_min_objects;
 7607	if (!min_objects) {
 7608		/*
 7609		 * Some architectures will only update present cpus when
 7610		 * onlining them, so don't trust the number if it's just 1. But
 7611		 * we also don't want to use nr_cpu_ids always, as on some other
 7612		 * architectures, there can be many possible cpus, but never
 7613		 * onlined. Here we compromise between trying to avoid too high
 7614		 * order on systems that appear larger than they are, and too
 7615		 * low order on systems that appear smaller than they are.
 7616		 */
 7617		unsigned int nr_cpus = num_present_cpus();
 7618		if (nr_cpus <= 1)
 7619			nr_cpus = nr_cpu_ids;
 7620		min_objects = 4 * (fls(nr_cpus) + 1);
 7621	}
 7622	/* min_objects can't be 0 because get_order(0) is undefined */
 7623	max_objects = max(order_objects(slub_max_order, size), 1U);
 7624	min_objects = min(min_objects, max_objects);
 7625
 7626	min_order = max_t(unsigned int, slub_min_order,
 7627			  get_order(min_objects * size));
 7628	if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
 7629		return get_order(size * MAX_OBJS_PER_PAGE) - 1;
 7630
 7631	/*
 7632	 * Attempt to find best configuration for a slab. This works by first
 7633	 * attempting to generate a layout with the best possible configuration
 7634	 * and backing off gradually.
 7635	 *
 7636	 * We start with accepting at most 1/16 waste and try to find the
 7637	 * smallest order from min_objects-derived/slab_min_order up to
 7638	 * slab_max_order that will satisfy the constraint. Note that increasing
 7639	 * the order can only result in same or less fractional waste, not more.
 7640	 *
 7641	 * If that fails, we increase the acceptable fraction of waste and try
 7642	 * again. The last iteration with fraction of 1/2 would effectively
 7643	 * accept any waste and give us the order determined by min_objects, as
 7644	 * long as at least single object fits within slab_max_order.
 7645	 */
 7646	for (unsigned int fraction = 16; fraction > 1; fraction /= 2) {
 7647		order = calc_slab_order(size, min_order, slub_max_order,
 7648					fraction);
 7649		if (order <= slub_max_order)
 7650			return order;
 7651	}
 7652
 7653	/*
 7654	 * Doh this slab cannot be placed using slab_max_order.
 7655	 */
 7656	order = get_order(size);
 7657	if (order <= MAX_PAGE_ORDER)
 7658		return order;
 7659	return -ENOSYS;
 7660}
 7661
 7662static void
 7663init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn)
 7664{
 7665	n->nr_partial = 0;
 7666	spin_lock_init(&n->list_lock);
 7667	INIT_LIST_HEAD(&n->partial);
 7668#ifdef CONFIG_SLUB_DEBUG
 7669	atomic_long_set(&n->nr_slabs, 0);
 7670	atomic_long_set(&n->total_objects, 0);
 7671	INIT_LIST_HEAD(&n->full);
 7672#endif
 7673	n->barn = barn;
 7674	if (barn)
 7675		barn_init(barn);
 7676}
 7677
 7678static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
 7679{
 7680	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
 7681			NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH *
 7682			sizeof(struct kmem_cache_cpu));
 7683
 7684	/*
 7685	 * Must align to double word boundary for the double cmpxchg
 7686	 * instructions to work; see __pcpu_double_call_return_bool().
 7687	 */
 7688	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
 7689				     2 * sizeof(void *));
 7690
 7691	if (!s->cpu_slab)
 7692		return 0;
 7693
 7694	init_kmem_cache_cpus(s);
 7695
 7696	return 1;
 7697}
 7698
 7699static int init_percpu_sheaves(struct kmem_cache *s)
 7700{
 7701	int cpu;
 7702
 7703	for_each_possible_cpu(cpu) {
 7704		struct slub_percpu_sheaves *pcs;
 7705
 7706		pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 7707
 7708		local_trylock_init(&pcs->lock);
 7709
 7710		pcs->main = alloc_empty_sheaf(s, GFP_KERNEL);
 7711
 7712		if (!pcs->main)
 7713			return -ENOMEM;
 7714	}
 7715
 7716	return 0;
 7717}
 7718
 7719static struct kmem_cache *kmem_cache_node;
 7720
 7721/*
 7722 * No kmalloc_node yet so do it by hand. We know that this is the first
 7723 * slab on the node for this slabcache. There are no concurrent accesses
 7724 * possible.
 7725 *
 7726 * Note that this function only works on the kmem_cache_node
 7727 * when allocating for the kmem_cache_node. This is used for bootstrapping
 7728 * memory on a fresh node that has no slab structures yet.
 7729 */
 7730static void early_kmem_cache_node_alloc(int node)
 7731{
 7732	struct slab *slab;
 7733	struct kmem_cache_node *n;
 7734
 7735	BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
 7736
 7737	slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
 7738
 7739	BUG_ON(!slab);
 7740	if (slab_nid(slab) != node) {
 7741		pr_err("SLUB: Unable to allocate memory from node %d\n", node);
 7742		pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
 7743	}
 7744
 7745	n = slab->freelist;
 7746	BUG_ON(!n);
 7747#ifdef CONFIG_SLUB_DEBUG
 7748	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
 7749#endif
 7750	n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
 7751	slab->freelist = get_freepointer(kmem_cache_node, n);
 7752	slab->inuse = 1;
 7753	kmem_cache_node->node[node] = n;
 7754	init_kmem_cache_node(n, NULL);
 7755	inc_slabs_node(kmem_cache_node, node, slab->objects);
 7756
 7757	/*
 7758	 * No locks need to be taken here as it has just been
 7759	 * initialized and there is no concurrent access.
 7760	 */
 7761	__add_partial(n, slab, DEACTIVATE_TO_HEAD);
 7762}
 7763
 7764static void free_kmem_cache_nodes(struct kmem_cache *s)
 7765{
 7766	int node;
 7767	struct kmem_cache_node *n;
 7768
 7769	for_each_kmem_cache_node(s, node, n) {
 7770		if (n->barn) {
 7771			WARN_ON(n->barn->nr_full);
 7772			WARN_ON(n->barn->nr_empty);
 7773			kfree(n->barn);
 7774			n->barn = NULL;
 7775		}
 7776
 7777		s->node[node] = NULL;
 7778		kmem_cache_free(kmem_cache_node, n);
 7779	}
 7780}
 7781
 7782void __kmem_cache_release(struct kmem_cache *s)
 7783{
 7784	cache_random_seq_destroy(s);
 7785	if (s->cpu_sheaves)
 7786		pcs_destroy(s);
 7787#ifdef CONFIG_PREEMPT_RT
 7788	if (s->cpu_slab)
 7789		lockdep_unregister_key(&s->lock_key);
 7790#endif
 7791	free_percpu(s->cpu_slab);
 7792	free_kmem_cache_nodes(s);
 7793}
 7794
 7795static int init_kmem_cache_nodes(struct kmem_cache *s)
 7796{
 7797	int node;
 7798
 7799	for_each_node_mask(node, slab_nodes) {
 7800		struct kmem_cache_node *n;
 7801		struct node_barn *barn = NULL;
 7802
 7803		if (slab_state == DOWN) {
 7804			early_kmem_cache_node_alloc(node);
 7805			continue;
 7806		}
 7807
 7808		if (s->cpu_sheaves) {
 7809			barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
 7810
 7811			if (!barn)
 7812				return 0;
 7813		}
 7814
 7815		n = kmem_cache_alloc_node(kmem_cache_node,
 7816						GFP_KERNEL, node);
 7817		if (!n) {
 7818			kfree(barn);
 7819			return 0;
 7820		}
 7821
 7822		init_kmem_cache_node(n, barn);
 7823
 7824		s->node[node] = n;
 7825	}
 7826	return 1;
 7827}
 7828
 7829static void set_cpu_partial(struct kmem_cache *s)
 7830{
 7831#ifdef CONFIG_SLUB_CPU_PARTIAL
 7832	unsigned int nr_objects;
 7833
 7834	/*
 7835	 * cpu_partial determined the maximum number of objects kept in the
 7836	 * per cpu partial lists of a processor.
 7837	 *
 7838	 * Per cpu partial lists mainly contain slabs that just have one
 7839	 * object freed. If they are used for allocation then they can be
 7840	 * filled up again with minimal effort. The slab will never hit the
 7841	 * per node partial lists and therefore no locking will be required.
 7842	 *
 7843	 * For backwards compatibility reasons, this is determined as number
 7844	 * of objects, even though we now limit maximum number of pages, see
 7845	 * slub_set_cpu_partial()
 7846	 */
 7847	if (!kmem_cache_has_cpu_partial(s))
 7848		nr_objects = 0;
 7849	else if (s->size >= PAGE_SIZE)
 7850		nr_objects = 6;
 7851	else if (s->size >= 1024)
 7852		nr_objects = 24;
 7853	else if (s->size >= 256)
 7854		nr_objects = 52;
 7855	else
 7856		nr_objects = 120;
 7857
 7858	slub_set_cpu_partial(s, nr_objects);
 7859#endif
 7860}
 7861
 7862/*
 7863 * calculate_sizes() determines the order and the distribution of data within
 7864 * a slab object.
 7865 */
 7866static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
 7867{
 7868	slab_flags_t flags = s->flags;
 7869	unsigned int size = s->object_size;
 7870	unsigned int order;
 7871
 7872	/*
 7873	 * Round up object size to the next word boundary. We can only
 7874	 * place the free pointer at word boundaries and this determines
 7875	 * the possible location of the free pointer.
 7876	 */
 7877	size = ALIGN(size, sizeof(void *));
 7878
 7879#ifdef CONFIG_SLUB_DEBUG
 7880	/*
 7881	 * Determine if we can poison the object itself. If the user of
 7882	 * the slab may touch the object after free or before allocation
 7883	 * then we should never poison the object itself.
 7884	 */
 7885	if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) &&
 7886			!s->ctor)
 7887		s->flags |= __OBJECT_POISON;
 7888	else
 7889		s->flags &= ~__OBJECT_POISON;
 7890
 7891
 7892	/*
 7893	 * If we are Redzoning then check if there is some space between the
 7894	 * end of the object and the free pointer. If not then add an
 7895	 * additional word to have some bytes to store Redzone information.
 7896	 */
 7897	if ((flags & SLAB_RED_ZONE) && size == s->object_size)
 7898		size += sizeof(void *);
 7899#endif
 7900
 7901	/*
 7902	 * With that we have determined the number of bytes in actual use
 7903	 * by the object and redzoning.
 7904	 */
 7905	s->inuse = size;
 7906
 7907	if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) ||
 7908	    (flags & SLAB_POISON) || s->ctor ||
 7909	    ((flags & SLAB_RED_ZONE) &&
 7910	     (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
 7911		/*
 7912		 * Relocate free pointer after the object if it is not
 7913		 * permitted to overwrite the first word of the object on
 7914		 * kmem_cache_free.
 7915		 *
 7916		 * This is the case if we do RCU, have a constructor, are
 7917		 * poisoning the objects, or are redzoning an object smaller
 7918		 * than sizeof(void *) or are redzoning an object with
 7919		 * slub_debug_orig_size() enabled, in which case the right
 7920		 * redzone may be extended.
 7921		 *
 7922		 * The assumption that s->offset >= s->inuse means free
 7923		 * pointer is outside of the object is used in the
 7924		 * freeptr_outside_object() function. If that is no
 7925		 * longer true, the function needs to be modified.
 7926		 */
 7927		s->offset = size;
 7928		size += sizeof(void *);
 7929	} else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) {
 7930		s->offset = args->freeptr_offset;
 7931	} else {
 7932		/*
 7933		 * Store freelist pointer near middle of object to keep
 7934		 * it away from the edges of the object to avoid small
 7935		 * sized over/underflows from neighboring allocations.
 7936		 */
 7937		s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
 7938	}
 7939
 7940#ifdef CONFIG_SLUB_DEBUG
 7941	if (flags & SLAB_STORE_USER) {
 7942		/*
 7943		 * Need to store information about allocs and frees after
 7944		 * the object.
 7945		 */
 7946		size += 2 * sizeof(struct track);
 7947
 7948		/* Save the original kmalloc request size */
 7949		if (flags & SLAB_KMALLOC)
 7950			size += sizeof(unsigned int);
 7951	}
 7952#endif
 7953
 7954	kasan_cache_create(s, &size, &s->flags);
 7955#ifdef CONFIG_SLUB_DEBUG
 7956	if (flags & SLAB_RED_ZONE) {
 7957		/*
 7958		 * Add some empty padding so that we can catch
 7959		 * overwrites from earlier objects rather than let
 7960		 * tracking information or the free pointer be
 7961		 * corrupted if a user writes before the start
 7962		 * of the object.
 7963		 */
 7964		size += sizeof(void *);
 7965
 7966		s->red_left_pad = sizeof(void *);
 7967		s->red_left_pad = ALIGN(s->red_left_pad, s->align);
 7968		size += s->red_left_pad;
 7969	}
 7970#endif
 7971
 7972	/*
 7973	 * SLUB stores one object immediately after another beginning from
 7974	 * offset 0. In order to align the objects we have to simply size
 7975	 * each object to conform to the alignment.
 7976	 */
 7977	size = ALIGN(size, s->align);
 7978	s->size = size;
 7979	s->reciprocal_size = reciprocal_value(size);
 7980	order = calculate_order(size);
 7981
 7982	if ((int)order < 0)
 7983		return 0;
 7984
 7985	s->allocflags = __GFP_COMP;
 7986
 7987	if (s->flags & SLAB_CACHE_DMA)
 7988		s->allocflags |= GFP_DMA;
 7989
 7990	if (s->flags & SLAB_CACHE_DMA32)
 7991		s->allocflags |= GFP_DMA32;
 7992
 7993	if (s->flags & SLAB_RECLAIM_ACCOUNT)
 7994		s->allocflags |= __GFP_RECLAIMABLE;
 7995
 7996	/*
 7997	 * Determine the number of objects per slab
 7998	 */
 7999	s->oo = oo_make(order, size);
 8000	s->min = oo_make(get_order(size), size);
 8001
 8002	return !!oo_objects(s->oo);
 8003}
 8004
 8005static void list_slab_objects(struct kmem_cache *s, struct slab *slab)
 8006{
 8007#ifdef CONFIG_SLUB_DEBUG
 8008	void *addr = slab_address(slab);
 8009	void *p;
 8010
 8011	if (!slab_add_kunit_errors())
 8012		slab_bug(s, "Objects remaining on __kmem_cache_shutdown()");
 8013
 8014	spin_lock(&object_map_lock);
 8015	__fill_map(object_map, s, slab);
 8016
 8017	for_each_object(p, s, addr, slab->objects) {
 8018
 8019		if (!test_bit(__obj_to_index(s, addr, p), object_map)) {
 8020			if (slab_add_kunit_errors())
 8021				continue;
 8022			pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
 8023			print_tracking(s, p);
 8024		}
 8025	}
 8026	spin_unlock(&object_map_lock);
 8027
 8028	__slab_err(slab);
 8029#endif
 8030}
 8031
 8032/*
 8033 * Attempt to free all partial slabs on a node.
 8034 * This is called from __kmem_cache_shutdown(). We must take list_lock
 8035 * because sysfs file might still access partial list after the shutdowning.
 8036 */
 8037static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 8038{
 8039	LIST_HEAD(discard);
 8040	struct slab *slab, *h;
 8041
 8042	BUG_ON(irqs_disabled());
 8043	spin_lock_irq(&n->list_lock);
 8044	list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
 8045		if (!slab->inuse) {
 8046			remove_partial(n, slab);
 8047			list_add(&slab->slab_list, &discard);
 8048		} else {
 8049			list_slab_objects(s, slab);
 8050		}
 8051	}
 8052	spin_unlock_irq(&n->list_lock);
 8053
 8054	list_for_each_entry_safe(slab, h, &discard, slab_list)
 8055		discard_slab(s, slab);
 8056}
 8057
 8058bool __kmem_cache_empty(struct kmem_cache *s)
 8059{
 8060	int node;
 8061	struct kmem_cache_node *n;
 8062
 8063	for_each_kmem_cache_node(s, node, n)
 8064		if (n->nr_partial || node_nr_slabs(n))
 8065			return false;
 8066	return true;
 8067}
 8068
 8069/*
 8070 * Release all resources used by a slab cache.
 8071 */
 8072int __kmem_cache_shutdown(struct kmem_cache *s)
 8073{
 8074	int node;
 8075	struct kmem_cache_node *n;
 8076
 8077	flush_all_cpus_locked(s);
 8078
 8079	/* we might have rcu sheaves in flight */
 8080	if (s->cpu_sheaves)
 8081		rcu_barrier();
 8082
 8083	/* Attempt to free all objects */
 8084	for_each_kmem_cache_node(s, node, n) {
 8085		if (n->barn)
 8086			barn_shrink(s, n->barn);
 8087		free_partial(s, n);
 8088		if (n->nr_partial || node_nr_slabs(n))
 8089			return 1;
 8090	}
 8091	return 0;
 8092}
 8093
 8094#ifdef CONFIG_PRINTK
 8095void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
 8096{
 8097	void *base;
 8098	int __maybe_unused i;
 8099	unsigned int objnr;
 8100	void *objp;
 8101	void *objp0;
 8102	struct kmem_cache *s = slab->slab_cache;
 8103	struct track __maybe_unused *trackp;
 8104
 8105	kpp->kp_ptr = object;
 8106	kpp->kp_slab = slab;
 8107	kpp->kp_slab_cache = s;
 8108	base = slab_address(slab);
 8109	objp0 = kasan_reset_tag(object);
 8110#ifdef CONFIG_SLUB_DEBUG
 8111	objp = restore_red_left(s, objp0);
 8112#else
 8113	objp = objp0;
 8114#endif
 8115	objnr = obj_to_index(s, slab, objp);
 8116	kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
 8117	objp = base + s->size * objnr;
 8118	kpp->kp_objp = objp;
 8119	if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size
 8120			 || (objp - base) % s->size) ||
 8121	    !(s->flags & SLAB_STORE_USER))
 8122		return;
 8123#ifdef CONFIG_SLUB_DEBUG
 8124	objp = fixup_red_left(s, objp);
 8125	trackp = get_track(s, objp, TRACK_ALLOC);
 8126	kpp->kp_ret = (void *)trackp->addr;
 8127#ifdef CONFIG_STACKDEPOT
 8128	{
 8129		depot_stack_handle_t handle;
 8130		unsigned long *entries;
 8131		unsigned int nr_entries;
 8132
 8133		handle = READ_ONCE(trackp->handle);
 8134		if (handle) {
 8135			nr_entries = stack_depot_fetch(handle, &entries);
 8136			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
 8137				kpp->kp_stack[i] = (void *)entries[i];
 8138		}
 8139
 8140		trackp = get_track(s, objp, TRACK_FREE);
 8141		handle = READ_ONCE(trackp->handle);
 8142		if (handle) {
 8143			nr_entries = stack_depot_fetch(handle, &entries);
 8144			for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
 8145				kpp->kp_free_stack[i] = (void *)entries[i];
 8146		}
 8147	}
 8148#endif
 8149#endif
 8150}
 8151#endif
 8152
 8153/********************************************************************
 8154 *		Kmalloc subsystem
 8155 *******************************************************************/
 8156
 8157static int __init setup_slub_min_order(const char *str, const struct kernel_param *kp)
 8158{
 8159	int ret;
 8160
 8161	ret = kstrtouint(str, 0, &slub_min_order);
 8162	if (ret)
 8163		return ret;
 8164
 8165	if (slub_min_order > slub_max_order)
 8166		slub_max_order = slub_min_order;
 8167
 8168	return 0;
 8169}
 8170
 8171static const struct kernel_param_ops param_ops_slab_min_order __initconst = {
 8172	.set = setup_slub_min_order,
 8173};
 8174__core_param_cb(slab_min_order, &param_ops_slab_min_order, &slub_min_order, 0);
 8175__core_param_cb(slub_min_order, &param_ops_slab_min_order, &slub_min_order, 0);
 8176
 8177static int __init setup_slub_max_order(const char *str, const struct kernel_param *kp)
 8178{
 8179	int ret;
 8180
 8181	ret = kstrtouint(str, 0, &slub_max_order);
 8182	if (ret)
 8183		return ret;
 8184
 8185	slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER);
 8186
 8187	if (slub_min_order > slub_max_order)
 8188		slub_min_order = slub_max_order;
 8189
 8190	return 0;
 8191}
 8192
 8193static const struct kernel_param_ops param_ops_slab_max_order __initconst = {
 8194	.set = setup_slub_max_order,
 8195};
 8196__core_param_cb(slab_max_order, &param_ops_slab_max_order, &slub_max_order, 0);
 8197__core_param_cb(slub_max_order, &param_ops_slab_max_order, &slub_max_order, 0);
 8198
 8199core_param(slab_min_objects, slub_min_objects, uint, 0);
 8200core_param(slub_min_objects, slub_min_objects, uint, 0);
 8201
 8202#ifdef CONFIG_NUMA
 8203static int __init setup_slab_strict_numa(const char *str, const struct kernel_param *kp)
 8204{
 8205	if (nr_node_ids > 1) {
 8206		static_branch_enable(&strict_numa);
 8207		pr_info("SLUB: Strict NUMA enabled.\n");
 8208	} else {
 8209		pr_warn("slab_strict_numa parameter set on non NUMA system.\n");
 8210	}
 8211
 8212	return 0;
 8213}
 8214
/*
 * "slab_strict_numa" is registered with KERNEL_PARAM_OPS_FL_NOARG and has
 * no backing variable (NULL is passed as the argument pointer).
 */
static const struct kernel_param_ops param_ops_slab_strict_numa __initconst = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,
	.set = setup_slab_strict_numa,
};
__core_param_cb(slab_strict_numa, &param_ops_slab_strict_numa, NULL, 0);
 8220#endif
 8221
 8222
 8223#ifdef CONFIG_HARDENED_USERCOPY
/*
 * Rejects incorrectly sized objects and objects that are to be copied
 * to/from userspace but do not fall entirely within the containing slab
 * cache's usercopy region.
 *
 * @ptr:     start of the kernel memory being copied
 * @n:       number of bytes being copied
 * @slab:    slab that contains @ptr
 * @to_user: copy direction, passed through to usercopy_abort()
 *
 * Returns nothing. The function simply returns if the check passes;
 * otherwise it calls usercopy_abort() with a description of the problem
 * (and, where known, the cache name).
 */
void __check_heap_object(const void *ptr, unsigned long n,
			 const struct slab *slab, bool to_user)
{
	struct kmem_cache *s;
	unsigned int offset;
	/* Sampled before the KASAN tag is stripped from ptr below. */
	bool is_kfence = is_kfence_address(ptr);

	ptr = kasan_reset_tag(ptr);

	/* Find object and usable object size. */
	s = slab->slab_cache;

	/* Reject impossible pointers. */
	if (ptr < slab_address(slab))
		usercopy_abort("SLUB object not in SLUB page?!", NULL,
			       to_user, 0, n);

	/*
	 * Find offset within object. KFENCE objects are measured from
	 * kfence_object_start(); regular objects are s->size apart, starting
	 * at the slab's base address.
	 */
	if (is_kfence)
		offset = ptr - kfence_object_start(ptr);
	else
		offset = (ptr - slab_address(slab)) % s->size;

	/* Adjust for redzone and reject if within the redzone. */
	if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
		if (offset < s->red_left_pad)
			usercopy_abort("SLUB object in left red zone",
				       s->name, to_user, offset, n);
		offset -= s->red_left_pad;
	}

	/*
	 * Allow address range falling entirely within usercopy region:
	 * the start must lie in [useroffset, useroffset + usersize] and the
	 * length must not exceed what is left of the region from there.
	 */
	if (offset >= s->useroffset &&
	    offset - s->useroffset <= s->usersize &&
	    n <= s->useroffset - offset + s->usersize)
		return;

	usercopy_abort("SLUB object", s->name, to_user, offset, n);
}
 8271#endif /* CONFIG_HARDENED_USERCOPY */
 8272
/*
 * Partial slabs with at most this many free objects are sorted into one
 * promote list per free-object count while shrinking.
 */
#define SHRINK_PROMOTE_MAX 32

/*
 * kmem_cache_shrink discards empty slabs and promotes the slabs filled
 * up most to the head of the partial lists. New allocations will then
 * fill those up and thus they can be removed from the partial lists.
 *
 * The slabs with the least items are placed last. This results in them
 * being allocated from last increasing the chance that the last objects
 * are freed in them.
 *
 * Returns 1 if at least one node still accounts slabs for this cache after
 * the pass (node_nr_slabs() is non-zero), 0 otherwise.
 */
static int __kmem_cache_do_shrink(struct kmem_cache *s)
{
	int node;
	int i;
	struct kmem_cache_node *n;
	struct slab *slab;
	struct slab *t;
	struct list_head discard;
	struct list_head promote[SHRINK_PROMOTE_MAX];
	unsigned long flags;
	int ret = 0;

	for_each_kmem_cache_node(s, node, n) {
		/* The scratch lists are reused, so reset them for every node. */
		INIT_LIST_HEAD(&discard);
		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
			INIT_LIST_HEAD(promote + i);

		/* Done before list_lock is taken. */
		if (n->barn)
			barn_shrink(s, n->barn);

		spin_lock_irqsave(&n->list_lock, flags);

		/*
		 * Build lists of slabs to discard or promote.
		 *
		 * Note that concurrent frees may occur while we hold the
		 * list_lock. slab->inuse here is the upper limit.
		 */
		list_for_each_entry_safe(slab, t, &n->partial, slab_list) {
			int free = slab->objects - slab->inuse;

			/* Do not reread slab->inuse */
			barrier();

			/* We do not keep full slabs on the list */
			BUG_ON(free <= 0);

			if (free == slab->objects) {
				/* Completely empty: unaccount and queue for freeing. */
				list_move(&slab->slab_list, &discard);
				slab_clear_node_partial(slab);
				n->nr_partial--;
				dec_slabs_node(s, node, slab->objects);
			} else if (free <= SHRINK_PROMOTE_MAX)
				/* promote[k] collects slabs with k + 1 free objects. */
				list_move(&slab->slab_list, promote + free - 1);
		}

		/*
		 * Promote the slabs filled up most to the head of the
		 * partial list. Splicing from the highest index down leaves
		 * promote[0] (fewest free objects) at the very front.
		 */
		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
			list_splice(promote + i, &n->partial);

		spin_unlock_irqrestore(&n->list_lock, flags);

		/* Release empty slabs */
		list_for_each_entry_safe(slab, t, &discard, slab_list)
			free_slab(s, slab);

		if (node_nr_slabs(n))
			ret = 1;
	}

	return ret;
}
 8349
 8350int __kmem_cache_shrink(struct kmem_cache *s)
 8351{
 8352	flush_all(s);
 8353	return __kmem_cache_do_shrink(s);
 8354}
 8355
 8356static int slab_mem_going_offline_callback(void)
 8357{
 8358	struct kmem_cache *s;
 8359
 8360	mutex_lock(&slab_mutex);
 8361	list_for_each_entry(s, &slab_caches, list) {
 8362		flush_all_cpus_locked(s);
 8363		__kmem_cache_do_shrink(s);
 8364	}
 8365	mutex_unlock(&slab_mutex);
 8366
 8367	return 0;
 8368}
 8369
 8370static int slab_mem_going_online_callback(int nid)
 8371{
 8372	struct kmem_cache_node *n;
 8373	struct kmem_cache *s;
 8374	int ret = 0;
 8375
 8376	/*
 8377	 * We are bringing a node online. No memory is available yet. We must
 8378	 * allocate a kmem_cache_node structure in order to bring the node
 8379	 * online.
 8380	 */
 8381	mutex_lock(&slab_mutex);
 8382	list_for_each_entry(s, &slab_caches, list) {
 8383		struct node_barn *barn = NULL;
 8384
 8385		/*
 8386		 * The structure may already exist if the node was previously
 8387		 * onlined and offlined.
 8388		 */
 8389		if (get_node(s, nid))
 8390			continue;
 8391
 8392		if (s->cpu_sheaves) {
 8393			barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid);
 8394
 8395			if (!barn) {
 8396				ret = -ENOMEM;
 8397				goto out;
 8398			}
 8399		}
 8400
 8401		/*
 8402		 * XXX: kmem_cache_alloc_node will fallback to other nodes
 8403		 *      since memory is not yet available from the node that
 8404		 *      is brought up.
 8405		 */
 8406		n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
 8407		if (!n) {
 8408			kfree(barn);
 8409			ret = -ENOMEM;
 8410			goto out;
 8411		}
 8412
 8413		init_kmem_cache_node(n, barn);
 8414
 8415		s->node[nid] = n;
 8416	}
 8417	/*
 8418	 * Any cache created after this point will also have kmem_cache_node
 8419	 * initialized for the new node.
 8420	 */
 8421	node_set(nid, slab_nodes);
 8422out:
 8423	mutex_unlock(&slab_mutex);
 8424	return ret;
 8425}
 8426
 8427static int slab_memory_callback(struct notifier_block *self,
 8428				unsigned long action, void *arg)
 8429{
 8430	struct node_notify *nn = arg;
 8431	int nid = nn->nid;
 8432	int ret = 0;
 8433
 8434	switch (action) {
 8435	case NODE_ADDING_FIRST_MEMORY:
 8436		ret = slab_mem_going_online_callback(nid);
 8437		break;
 8438	case NODE_REMOVING_LAST_MEMORY:
 8439		ret = slab_mem_going_offline_callback();
 8440		break;
 8441	}
 8442	if (ret)
 8443		ret = notifier_from_errno(ret);
 8444	else
 8445		ret = NOTIFY_OK;
 8446	return ret;
 8447}
 8448
 8449/********************************************************************
 8450 *			Basic setup of slabs
 8451 *******************************************************************/
 8452
/*
 * Used for early kmem_cache structures that were allocated using
 * the page allocator. Allocate them properly then fix up the pointers
 * that may be pointing to the wrong kmem_cache structure.
 *
 * @static_cache: the early, statically allocated cache descriptor to copy.
 *
 * Returns the newly allocated kmem_cache, which has also been added to
 * slab_caches.
 */

static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
{
	int node;
	/* NOTE(review): the allocation result is used without a NULL check. */
	struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
	struct kmem_cache_node *n;

	/* Only kmem_cache->object_size bytes of the descriptor are copied. */
	memcpy(s, static_cache, kmem_cache->object_size);

	/*
	 * This runs very early, and only the boot processor is supposed to be
	 * up.  Even if it weren't true, IRQs are not up so we couldn't fire
	 * IPIs around.
	 */
	__flush_cpu_slab(s, smp_processor_id());
	/* Repoint every slab on the node lists at the new descriptor. */
	for_each_kmem_cache_node(s, node, n) {
		struct slab *p;

		list_for_each_entry(p, &n->partial, slab_list)
			p->slab_cache = s;

#ifdef CONFIG_SLUB_DEBUG
		list_for_each_entry(p, &n->full, slab_list)
			p->slab_cache = s;
#endif
	}
	list_add(&s->list, &slab_caches);
	return s;
}
 8487
/*
 * Boot-time initialization of the allocator.
 *
 * Sets up the "kmem_cache_node" and "kmem_cache" caches from two static
 * __initdata descriptors, replaces those descriptors with properly
 * allocated ones via bootstrap(), and then creates the kmalloc caches.
 */
void __init kmem_cache_init(void)
{
	static __initdata struct kmem_cache boot_kmem_cache,
		boot_kmem_cache_node;
	int node;

	/* A non-zero debug_guardpage_minorder() forces order-0 slabs. */
	if (debug_guardpage_minorder())
		slub_max_order = 0;

	/* Inform pointer hashing choice about slub debugging state. */
	hash_pointers_finalize(__slub_debug_enabled());

	/* Start out on the static descriptors; bootstrap() swaps them later. */
	kmem_cache_node = &boot_kmem_cache_node;
	kmem_cache = &boot_kmem_cache;

	/*
	 * Initialize the nodemask for which we will allocate per node
	 * structures. Here we don't need taking slab_mutex yet.
	 */
	for_each_node_state(node, N_MEMORY)
		node_set(node, slab_nodes);

	create_boot_cache(kmem_cache_node, "kmem_cache_node",
			sizeof(struct kmem_cache_node),
			SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);

	hotplug_node_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);

	/* Able to allocate the per node structures */
	slab_state = PARTIAL;

	/*
	 * The object size covers struct kmem_cache up to its node[] member
	 * plus one node pointer per possible node id.
	 */
	create_boot_cache(kmem_cache, "kmem_cache",
			offsetof(struct kmem_cache, node) +
				nr_node_ids * sizeof(struct kmem_cache_node *),
			SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);

	/*
	 * kmem_cache is bootstrapped first: bootstrap() allocates from
	 * kmem_cache for both descriptors.
	 */
	kmem_cache = bootstrap(&boot_kmem_cache);
	kmem_cache_node = bootstrap(&boot_kmem_cache_node);

	/* Now we can use the kmem_cache to allocate kmalloc slabs */
	setup_kmalloc_cache_index_table();
	create_kmalloc_caches();

	/* Setup random freelists for each cache */
	init_freelist_randomization();

	cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
				  slub_cpu_dead);

	pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
		cache_line_size(),
		slub_min_order, slub_max_order, slub_min_objects,
		nr_cpu_ids, nr_node_ids);
}
 8542
/*
 * Late initialization: create the "slub_flushwq" workqueue
 * (WQ_MEM_RECLAIM). A failed allocation only triggers a warning; flushwq
 * is left NULL in that case.
 */
void __init kmem_cache_init_late(void)
{
	flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
	WARN_ON(!flushwq);
}
 8548
 8549struct kmem_cache *
 8550__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
 8551		   slab_flags_t flags, void (*ctor)(void *))
 8552{
 8553	struct kmem_cache *s;
 8554
 8555	s = find_mergeable(size, align, flags, name, ctor);
 8556	if (s) {
 8557		if (sysfs_slab_alias(s, name))
 8558			pr_err("SLUB: Unable to add cache alias %s to sysfs\n",
 8559			       name);
 8560
 8561		s->refcount++;
 8562
 8563		/*
 8564		 * Adjust the object sizes so that we clear
 8565		 * the complete object on kzalloc.
 8566		 */
 8567		s->object_size = max(s->object_size, size);
 8568		s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
 8569	}
 8570
 8571	return s;
 8572}
 8573
/*
 * Initialize the already allocated cache descriptor @s.
 *
 * @s:     descriptor to fill in
 * @name:  cache name, stored by pointer in s->name
 * @size:  object size requested by the caller
 * @args:  additional creation arguments (align, ctor, usercopy region,
 *         sheaf capacity)
 * @flags: requested slab flags, filtered through kmem_cache_flags()
 *
 * Returns 0 on success. On failure a negative errno is returned and
 * __kmem_cache_release(s) has been called. Failure paths that do not set
 * err explicitly report the initial -EINVAL.
 */
int do_kmem_cache_create(struct kmem_cache *s, const char *name,
			 unsigned int size, struct kmem_cache_args *args,
			 slab_flags_t flags)
{
	int err = -EINVAL;

	s->name = name;
	s->size = s->object_size = size;

	s->flags = kmem_cache_flags(flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
	s->random = get_random_long();
#endif
	s->align = args->align;
	s->ctor = args->ctor;
#ifdef CONFIG_HARDENED_USERCOPY
	s->useroffset = args->useroffset;
	s->usersize = args->usersize;
#endif

	if (!calculate_sizes(args, s))
		goto out;
	if (disable_higher_order_debug) {
		/*
		 * Disable debugging flags that store metadata if the min slab
		 * order increased.
		 */
		if (get_order(s->size) > get_order(s->object_size)) {
			s->flags &= ~DEBUG_METADATA_FLAGS;
			s->offset = 0;
			/* Recompute the layout without the metadata flags. */
			if (!calculate_sizes(args, s))
				goto out;
		}
	}

#ifdef system_has_freelist_aba
	if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) {
		/* Enable fast mode */
		s->flags |= __CMPXCHG_DOUBLE;
	}
#endif

	/*
	 * The larger the object size is, the more slabs we want on the partial
	 * list to avoid pounding the page allocator excessively.
	 * The result is clamped to [MIN_PARTIAL, MAX_PARTIAL].
	 */
	s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
	s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);

	set_cpu_partial(s);

	/*
	 * Sheaves are only set up when a capacity was requested, the kernel
	 * is not built with CONFIG_SLUB_TINY and no debug flag is set.
	 */
	if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY)
					&& !(s->flags & SLAB_DEBUG_FLAGS)) {
		s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
		if (!s->cpu_sheaves) {
			err = -ENOMEM;
			goto out;
		}
		// TODO: increase capacity to grow slab_sheaf up to next kmalloc size?
		s->sheaf_capacity = args->sheaf_capacity;
	}

#ifdef CONFIG_NUMA
	s->remote_node_defrag_ratio = 1000;
#endif

	/* Initialize the pre-computed randomized freelist if slab is up */
	if (slab_state >= UP) {
		/* err is still -EINVAL here, whatever the helper returned. */
		if (init_cache_random_seq(s))
			goto out;
	}

	if (!init_kmem_cache_nodes(s))
		goto out;

	if (!alloc_kmem_cache_cpus(s))
		goto out;

	if (s->cpu_sheaves) {
		err = init_percpu_sheaves(s);
		if (err)
			goto out;
	}

	/* The cache itself is usable from here on. */
	err = 0;

	/* Mutex is not taken during early boot */
	if (slab_state <= UP)
		goto out;

	/*
	 * Failing to create sysfs files is not critical to SLUB functionality.
	 * If it fails, proceed with cache creation without these files.
	 */
	if (sysfs_slab_add(s))
		pr_err("SLUB: Unable to add cache %s to sysfs\n", s->name);

	if (s->flags & SLAB_STORE_USER)
		debugfs_slab_add(s);

out:
	/* Shared exit: undo partial initialization on any failure. */
	if (err)
		__kmem_cache_release(s);
	return err;
}
 8679
 8680#ifdef SLAB_SUPPORTS_SYSFS
 8681static int count_inuse(struct slab *slab)
 8682{
 8683	return slab->inuse;
 8684}
 8685
 8686static int count_total(struct slab *slab)
 8687{
 8688	return slab->objects;
 8689}
 8690#endif
 8691
 8692#ifdef CONFIG_SLUB_DEBUG
 8693static void validate_slab(struct kmem_cache *s, struct slab *slab,
 8694			  unsigned long *obj_map)
 8695{
 8696	void *p;
 8697	void *addr = slab_address(slab);
 8698
 8699	if (!validate_slab_ptr(slab)) {
 8700		slab_err(s, slab, "Not a valid slab page");
 8701		return;
 8702	}
 8703
 8704	if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
 8705		return;
 8706
 8707	/* Now we know that a valid freelist exists */
 8708	__fill_map(obj_map, s, slab);
 8709	for_each_object(p, s, addr, slab->objects) {
 8710		u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
 8711			 SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
 8712
 8713		if (!check_object(s, slab, p, val))
 8714			break;
 8715	}
 8716}
 8717
 8718static int validate_slab_node(struct kmem_cache *s,
 8719		struct kmem_cache_node *n, unsigned long *obj_map)
 8720{
 8721	unsigned long count = 0;
 8722	struct slab *slab;
 8723	unsigned long flags;
 8724
 8725	spin_lock_irqsave(&n->list_lock, flags);
 8726
 8727	list_for_each_entry(slab, &n->partial, slab_list) {
 8728		validate_slab(s, slab, obj_map);
 8729		count++;
 8730	}
 8731	if (count != n->nr_partial) {
 8732		pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n",
 8733		       s->name, count, n->nr_partial);
 8734		slab_add_kunit_errors();
 8735	}
 8736
 8737	if (!(s->flags & SLAB_STORE_USER))
 8738		goto out;
 8739
 8740	list_for_each_entry(slab, &n->full, slab_list) {
 8741		validate_slab(s, slab, obj_map);
 8742		count++;
 8743	}
 8744	if (count != node_nr_slabs(n)) {
 8745		pr_err("SLUB: %s %ld slabs counted but counter=%ld\n",
 8746		       s->name, count, node_nr_slabs(n));
 8747		slab_add_kunit_errors();
 8748	}
 8749
 8750out:
 8751	spin_unlock_irqrestore(&n->list_lock, flags);
 8752	return count;
 8753}
 8754
 8755long validate_slab_cache(struct kmem_cache *s)
 8756{
 8757	int node;
 8758	unsigned long count = 0;
 8759	struct kmem_cache_node *n;
 8760	unsigned long *obj_map;
 8761
 8762	obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
 8763	if (!obj_map)
 8764		return -ENOMEM;
 8765
 8766	flush_all(s);
 8767	for_each_kmem_cache_node(s, node, n)
 8768		count += validate_slab_node(s, n, obj_map);
 8769
 8770	bitmap_free(obj_map);
 8771
 8772	return count;
 8773}
 8774EXPORT_SYMBOL(validate_slab_cache);
 8775
 8776#ifdef CONFIG_DEBUG_FS
/*
 * Generate lists of code addresses where slabcache objects are allocated
 * and freed.
 */

/*
 * One aggregated entry of a location table. add_location() merges all
 * tracks that share the same (addr, handle, waste) triple into one entry.
 */
struct location {
	depot_stack_handle_t handle;	/* stack depot handle of the track (0 without CONFIG_STACKDEPOT) */
	unsigned long count;		/* number of tracks merged into this entry */
	unsigned long addr;		/* track->addr of the merged tracks */
	unsigned long waste;		/* s->object_size minus the original request size */
	long long sum_time;		/* sum of the track ages, in jiffies */
	long min_time;			/* smallest age seen */
	long max_time;			/* largest age seen */
	long min_pid;			/* smallest pid seen */
	long max_pid;			/* largest pid seen */
	DECLARE_BITMAP(cpus, NR_CPUS);	/* cpus recorded in the merged tracks */
	nodemask_t nodes;		/* nodes of the pages holding the track records */
};
 8795
/* Growable array of struct location, kept sorted by add_location(). */
struct loc_track {
	unsigned long max;	/* capacity of loc[], in entries */
	unsigned long count;	/* number of entries currently in use */
	struct location *loc;	/* page-allocator backed entry array */
	loff_t idx;		/* not used in this part of the file; presumably an iteration position - TODO confirm */
};
 8802
/* NOTE(review): presumably the debugfs directory holding the per-cache entries - set up outside this section. */
static struct dentry *slab_debugfs_root;
 8804
 8805static void free_loc_track(struct loc_track *t)
 8806{
 8807	if (t->max)
 8808		free_pages((unsigned long)t->loc,
 8809			get_order(sizeof(struct location) * t->max));
 8810}
 8811
 8812static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
 8813{
 8814	struct location *l;
 8815	int order;
 8816
 8817	order = get_order(sizeof(struct location) * max);
 8818
 8819	l = (void *)__get_free_pages(flags, order);
 8820	if (!l)
 8821		return 0;
 8822
 8823	if (t->count) {
 8824		memcpy(l, t->loc, sizeof(struct location) * t->count);
 8825		free_loc_track(t);
 8826	}
 8827	t->max = max;
 8828	t->loc = l;
 8829	return 1;
 8830}
 8831
/*
 * Account one track record in the location table @t.
 *
 * The table is kept sorted by (addr, handle, waste). A binary search
 * either finds the matching entry, whose statistics are then updated, or
 * determines the position at which a new entry has to be inserted.
 *
 * @t:         table to update
 * @s:         cache the tracked object belongs to
 * @track:     track record of the object
 * @orig_size: size used to compute the waste (s->object_size - orig_size)
 *
 * Returns 1 on success, 0 if the table was full and could not be grown.
 */
static int add_location(struct loc_track *t, struct kmem_cache *s,
				const struct track *track,
				unsigned int orig_size)
{
	long start, end, pos;
	struct location *l;
	unsigned long caddr, chandle, cwaste;
	unsigned long age = jiffies - track->when;
	depot_stack_handle_t handle = 0;
	unsigned int waste = s->object_size - orig_size;

#ifdef CONFIG_STACKDEPOT
	handle = READ_ONCE(track->handle);
#endif
	/* Search the open interval (start, end). */
	start = -1;
	end = t->count;

	for ( ; ; ) {
		pos = start + (end - start + 1) / 2;

		/*
		 * There is nothing at "end". If we end up there
		 * we need to add something to before end.
		 */
		if (pos == end)
			break;

		l = &t->loc[pos];
		caddr = l->addr;
		chandle = l->handle;
		cwaste = l->waste;
		if ((track->addr == caddr) && (handle == chandle) &&
			(waste == cwaste)) {

			/* Exact match: fold this track into the entry. */
			l->count++;
			if (track->when) {
				l->sum_time += age;
				if (age < l->min_time)
					l->min_time = age;
				if (age > l->max_time)
					l->max_time = age;

				if (track->pid < l->min_pid)
					l->min_pid = track->pid;
				if (track->pid > l->max_pid)
					l->max_pid = track->pid;

				cpumask_set_cpu(track->cpu,
						to_cpumask(l->cpus));
			}
			/* The node is derived from the page holding the track record. */
			node_set(page_to_nid(virt_to_page(track)), l->nodes);
			return 1;
		}

		/* Lexicographic order on (addr, handle, waste). */
		if (track->addr < caddr)
			end = pos;
		else if (track->addr == caddr && handle < chandle)
			end = pos;
		else if (track->addr == caddr && handle == chandle &&
				waste < cwaste)
			end = pos;
		else
			start = pos;
	}

	/*
	 * Not found. Insert new tracking element.
	 * A full table is doubled first; GFP_ATOMIC is used for that.
	 */
	if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
		return 0;

	/* t->loc may have been replaced above, so look the slot up again. */
	l = t->loc + pos;
	if (pos < t->count)
		/* Shift the tail up by one entry to open a gap at pos. */
		memmove(l + 1, l,
			(t->count - pos) * sizeof(struct location));
	t->count++;
	l->count = 1;
	l->addr = track->addr;
	l->sum_time = age;
	l->min_time = age;
	l->max_time = age;
	l->min_pid = track->pid;
	l->max_pid = track->pid;
	l->handle = handle;
	l->waste = waste;
	cpumask_clear(to_cpumask(l->cpus));
	cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
	nodes_clear(l->nodes);
	node_set(page_to_nid(virt_to_page(track)), l->nodes);
	return 1;
}
 8923
 8924static void process_slab(struct loc_track *t, struct kmem_cache *s,
 8925		struct slab *slab, enum track_item alloc,
 8926		unsigned long *obj_map)
 8927{
 8928	void *addr = slab_address(slab);
 8929	bool is_alloc = (alloc == TRACK_ALLOC);
 8930	void *p;
 8931
 8932	__fill_map(obj_map, s, slab);
 8933
 8934	for_each_object(p, s, addr, slab->objects)
 8935		if (!test_bit(__obj_to_index(s, addr, p), obj_map))
 8936			add_location(t, s, get_track(s, p, alloc),
 8937				     is_alloc ? get_orig_size(s, p) :
 8938						s->object_size);
 8939}
 8940#endif  /* CONFIG_DEBUG_FS   */
 8941#endif	/* CONFIG_SLUB_DEBUG */
 8942
 8943#ifdef SLAB_SUPPORTS_SYSFS
/*
 * Selectors for show_slab_objects(). Each enumerator is the bit number of
 * the matching SO_* flag below.
 */
enum slab_stat_type {
	SL_ALL,			/* All slabs */
	SL_PARTIAL,		/* Only partially allocated slabs */
	SL_CPU,			/* Only slabs used for cpu caches */
	SL_OBJECTS,		/* Determine allocated objects not slabs */
	SL_TOTAL		/* Determine object capacity not slabs */
};

/* Bit masks passed in the "flags" argument of show_slab_objects(). */
#define SO_ALL		(1 << SL_ALL)
#define SO_PARTIAL	(1 << SL_PARTIAL)
#define SO_CPU		(1 << SL_CPU)
#define SO_OBJECTS	(1 << SL_OBJECTS)
#define SO_TOTAL	(1 << SL_TOTAL)
 8957
 8958static ssize_t show_slab_objects(struct kmem_cache *s,
 8959				 char *buf, unsigned long flags)
 8960{
 8961	unsigned long total = 0;
 8962	int node;
 8963	int x;
 8964	unsigned long *nodes;
 8965	int len = 0;
 8966
 8967	nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
 8968	if (!nodes)
 8969		return -ENOMEM;
 8970
 8971	if (flags & SO_CPU) {
 8972		int cpu;
 8973
 8974		for_each_possible_cpu(cpu) {
 8975			struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
 8976							       cpu);
 8977			int node;
 8978			struct slab *slab;
 8979
 8980			slab = READ_ONCE(c->slab);
 8981			if (!slab)
 8982				continue;
 8983
 8984			node = slab_nid(slab);
 8985			if (flags & SO_TOTAL)
 8986				x = slab->objects;
 8987			else if (flags & SO_OBJECTS)
 8988				x = slab->inuse;
 8989			else
 8990				x = 1;
 8991
 8992			total += x;
 8993			nodes[node] += x;
 8994
 8995#ifdef CONFIG_SLUB_CPU_PARTIAL
 8996			slab = slub_percpu_partial_read_once(c);
 8997			if (slab) {
 8998				node = slab_nid(slab);
 8999				if (flags & SO_TOTAL)
 9000					WARN_ON_ONCE(1);
 9001				else if (flags & SO_OBJECTS)
 9002					WARN_ON_ONCE(1);
 9003				else
 9004					x = data_race(slab->slabs);
 9005				total += x;
 9006				nodes[node] += x;
 9007			}
 9008#endif
 9009		}
 9010	}
 9011
 9012	/*
 9013	 * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
 9014	 * already held which will conflict with an existing lock order:
 9015	 *
 9016	 * mem_hotplug_lock->slab_mutex->kernfs_mutex
 9017	 *
 9018	 * We don't really need mem_hotplug_lock (to hold off
 9019	 * slab_mem_going_offline_callback) here because slab's memory hot
 9020	 * unplug code doesn't destroy the kmem_cache->node[] data.
 9021	 */
 9022
 9023#ifdef CONFIG_SLUB_DEBUG
 9024	if (flags & SO_ALL) {
 9025		struct kmem_cache_node *n;
 9026
 9027		for_each_kmem_cache_node(s, node, n) {
 9028
 9029			if (flags & SO_TOTAL)
 9030				x = node_nr_objs(n);
 9031			else if (flags & SO_OBJECTS)
 9032				x = node_nr_objs(n) - count_partial(n, count_free);
 9033			else
 9034				x = node_nr_slabs(n);
 9035			total += x;
 9036			nodes[node] += x;
 9037		}
 9038
 9039	} else
 9040#endif
 9041	if (flags & SO_PARTIAL) {
 9042		struct kmem_cache_node *n;
 9043
 9044		for_each_kmem_cache_node(s, node, n) {
 9045			if (flags & SO_TOTAL)
 9046				x = count_partial(n, count_total);
 9047			else if (flags & SO_OBJECTS)
 9048				x = count_partial(n, count_inuse);
 9049			else
 9050				x = n->nr_partial;
 9051			total += x;
 9052			nodes[node] += x;
 9053		}
 9054	}
 9055
 9056	len += sysfs_emit_at(buf, len, "%lu", total);
 9057#ifdef CONFIG_NUMA
 9058	for (node = 0; node < nr_node_ids; node++) {
 9059		if (nodes[node])
 9060			len += sysfs_emit_at(buf, len, " N%d=%lu",
 9061					     node, nodes[node]);
 9062	}
 9063#endif
 9064	len += sysfs_emit_at(buf, len, "\n");
 9065	kfree(nodes);
 9066
 9067	return len;
 9068}
 9069
/* Map an embedded struct attribute / kobject back to its container. */
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)

/*
 * A per-cache sysfs attribute: the generic attribute plus show/store
 * handlers that receive the kmem_cache directly.
 */
struct slab_attribute {
	struct attribute attr;
	ssize_t (*show)(struct kmem_cache *s, char *buf);
	ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
};

/* Define <_name>_attr as a read-only attribute with mode 0400. */
#define SLAB_ATTR_RO(_name) \
	static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)

/* Define <_name>_attr as a read-write attribute with mode 0600. */
#define SLAB_ATTR(_name) \
	static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
 9084
 9085static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
 9086{
 9087	return sysfs_emit(buf, "%u\n", s->size);
 9088}
 9089SLAB_ATTR_RO(slab_size);
 9090
 9091static ssize_t align_show(struct kmem_cache *s, char *buf)
 9092{
 9093	return sysfs_emit(buf, "%u\n", s->align);
 9094}
 9095SLAB_ATTR_RO(align);
 9096
 9097static ssize_t object_size_show(struct kmem_cache *s, char *buf)
 9098{
 9099	return sysfs_emit(buf, "%u\n", s->object_size);
 9100}
 9101SLAB_ATTR_RO(object_size);
 9102
 9103static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
 9104{
 9105	return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
 9106}
 9107SLAB_ATTR_RO(objs_per_slab);
 9108
 9109static ssize_t order_show(struct kmem_cache *s, char *buf)
 9110{
 9111	return sysfs_emit(buf, "%u\n", oo_order(s->oo));
 9112}
 9113SLAB_ATTR_RO(order);
 9114
 9115static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf)
 9116{
 9117	return sysfs_emit(buf, "%u\n", s->sheaf_capacity);
 9118}
 9119SLAB_ATTR_RO(sheaf_capacity);
 9120
 9121static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
 9122{
 9123	return sysfs_emit(buf, "%lu\n", s->min_partial);
 9124}
 9125
 9126static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
 9127				 size_t length)
 9128{
 9129	unsigned long min;
 9130	int err;
 9131
 9132	err = kstrtoul(buf, 10, &min);
 9133	if (err)
 9134		return err;
 9135
 9136	s->min_partial = min;
 9137	return length;
 9138}
 9139SLAB_ATTR(min_partial);
 9140
 9141static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
 9142{
 9143	unsigned int nr_partial = 0;
 9144#ifdef CONFIG_SLUB_CPU_PARTIAL
 9145	nr_partial = s->cpu_partial;
 9146#endif
 9147
 9148	return sysfs_emit(buf, "%u\n", nr_partial);
 9149}
 9150
 9151static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
 9152				 size_t length)
 9153{
 9154	unsigned int objects;
 9155	int err;
 9156
 9157	err = kstrtouint(buf, 10, &objects);
 9158	if (err)
 9159		return err;
 9160	if (objects && !kmem_cache_has_cpu_partial(s))
 9161		return -EINVAL;
 9162
 9163	slub_set_cpu_partial(s, objects);
 9164	flush_all(s);
 9165	return length;
 9166}
 9167SLAB_ATTR(cpu_partial);
 9168
 9169static ssize_t ctor_show(struct kmem_cache *s, char *buf)
 9170{
 9171	if (!s->ctor)
 9172		return 0;
 9173	return sysfs_emit(buf, "%pS\n", s->ctor);
 9174}
 9175SLAB_ATTR_RO(ctor);
 9176
 9177static ssize_t aliases_show(struct kmem_cache *s, char *buf)
 9178{
 9179	return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
 9180}
 9181SLAB_ATTR_RO(aliases);
 9182
 9183static ssize_t partial_show(struct kmem_cache *s, char *buf)
 9184{
 9185	return show_slab_objects(s, buf, SO_PARTIAL);
 9186}
 9187SLAB_ATTR_RO(partial);
 9188
 9189static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
 9190{
 9191	return show_slab_objects(s, buf, SO_CPU);
 9192}
 9193SLAB_ATTR_RO(cpu_slabs);
 9194
 9195static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
 9196{
 9197	return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
 9198}
 9199SLAB_ATTR_RO(objects_partial);
 9200
 9201static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
 9202{
 9203	int objects = 0;
 9204	int slabs = 0;
 9205	int cpu __maybe_unused;
 9206	int len = 0;
 9207
 9208#ifdef CONFIG_SLUB_CPU_PARTIAL
 9209	for_each_online_cpu(cpu) {
 9210		struct slab *slab;
 9211
 9212		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
 9213
 9214		if (slab)
 9215			slabs += data_race(slab->slabs);
 9216	}
 9217#endif
 9218
 9219	/* Approximate half-full slabs, see slub_set_cpu_partial() */
 9220	objects = (slabs * oo_objects(s->oo)) / 2;
 9221	len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);
 9222
 9223#ifdef CONFIG_SLUB_CPU_PARTIAL
 9224	for_each_online_cpu(cpu) {
 9225		struct slab *slab;
 9226
 9227		slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
 9228		if (slab) {
 9229			slabs = data_race(slab->slabs);
 9230			objects = (slabs * oo_objects(s->oo)) / 2;
 9231			len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
 9232					     cpu, objects, slabs);
 9233		}
 9234	}
 9235#endif
 9236	len += sysfs_emit_at(buf, len, "\n");
 9237
 9238	return len;
 9239}
 9240SLAB_ATTR_RO(slabs_cpu_partial);
 9241
 9242static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
 9243{
 9244	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
 9245}
 9246SLAB_ATTR_RO(reclaim_account);
 9247
 9248static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
 9249{
 9250	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
 9251}
 9252SLAB_ATTR_RO(hwcache_align);
 9253
 9254#ifdef CONFIG_ZONE_DMA
 9255static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
 9256{
 9257	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
 9258}
 9259SLAB_ATTR_RO(cache_dma);
 9260#endif
 9261
 9262#ifdef CONFIG_HARDENED_USERCOPY
 9263static ssize_t usersize_show(struct kmem_cache *s, char *buf)
 9264{
 9265	return sysfs_emit(buf, "%u\n", s->usersize);
 9266}
 9267SLAB_ATTR_RO(usersize);
 9268#endif
 9269
 9270static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
 9271{
 9272	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
 9273}
 9274SLAB_ATTR_RO(destroy_by_rcu);
 9275
 9276#ifdef CONFIG_SLUB_DEBUG
 9277static ssize_t slabs_show(struct kmem_cache *s, char *buf)
 9278{
 9279	return show_slab_objects(s, buf, SO_ALL);
 9280}
 9281SLAB_ATTR_RO(slabs);
 9282
 9283static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
 9284{
 9285	return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
 9286}
 9287SLAB_ATTR_RO(total_objects);
 9288
 9289static ssize_t objects_show(struct kmem_cache *s, char *buf)
 9290{
 9291	return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
 9292}
 9293SLAB_ATTR_RO(objects);
 9294
 9295static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
 9296{
 9297	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
 9298}
 9299SLAB_ATTR_RO(sanity_checks);
 9300
 9301static ssize_t trace_show(struct kmem_cache *s, char *buf)
 9302{
 9303	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
 9304}
 9305SLAB_ATTR_RO(trace);
 9306
 9307static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
 9308{
 9309	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
 9310}
 9311
 9312SLAB_ATTR_RO(red_zone);
 9313
 9314static ssize_t poison_show(struct kmem_cache *s, char *buf)
 9315{
 9316	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
 9317}
 9318
 9319SLAB_ATTR_RO(poison);
 9320
 9321static ssize_t store_user_show(struct kmem_cache *s, char *buf)
 9322{
 9323	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
 9324}
 9325
 9326SLAB_ATTR_RO(store_user);
 9327
/* sysfs "validate" (read side): the attribute has no readable content. */
static ssize_t validate_show(struct kmem_cache *s, char *buf)
{
	return 0;
}
 9332
 9333static ssize_t validate_store(struct kmem_cache *s,
 9334			const char *buf, size_t length)
 9335{
 9336	int ret = -EINVAL;
 9337
 9338	if (buf[0] == '1' && kmem_cache_debug(s)) {
 9339		ret = validate_slab_cache(s);
 9340		if (ret >= 0)
 9341			ret = length;
 9342	}
 9343	return ret;
 9344}
 9345SLAB_ATTR(validate);
 9346
 9347#endif /* CONFIG_SLUB_DEBUG */
 9348
 9349#ifdef CONFIG_FAILSLAB
 9350static ssize_t failslab_show(struct kmem_cache *s, char *buf)
 9351{
 9352	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
 9353}
 9354
 9355static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
 9356				size_t length)
 9357{
 9358	if (s->refcount > 1)
 9359		return -EINVAL;
 9360
 9361	if (buf[0] == '1')
 9362		WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB);
 9363	else
 9364		WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB);
 9365
 9366	return length;
 9367}
 9368SLAB_ATTR(failslab);
 9369#endif
 9370
/*
 * Read side of the "shrink" attribute: writes nothing to @buf and
 * always returns 0. The attribute only acts on writes (shrink_store()).
 */
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
{
	return 0;
}
 9375
 9376static ssize_t shrink_store(struct kmem_cache *s,
 9377			const char *buf, size_t length)
 9378{
 9379	if (buf[0] == '1')
 9380		kmem_cache_shrink(s);
 9381	else
 9382		return -EINVAL;
 9383	return length;
 9384}
 9385SLAB_ATTR(shrink);
 9386
 9387#ifdef CONFIG_NUMA
/*
 * Emit remote_node_defrag_ratio divided by 10; the store handler saves
 * the user-supplied value multiplied by 10, so this reports it back in
 * the units the user wrote.
 */
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
	return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
}
 9392
 9393static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
 9394				const char *buf, size_t length)
 9395{
 9396	unsigned int ratio;
 9397	int err;
 9398
 9399	err = kstrtouint(buf, 10, &ratio);
 9400	if (err)
 9401		return err;
 9402	if (ratio > 100)
 9403		return -ERANGE;
 9404
 9405	s->remote_node_defrag_ratio = ratio * 10;
 9406
 9407	return length;
 9408}
 9409SLAB_ATTR(remote_node_defrag_ratio);
 9410#endif
 9411
 9412#ifdef CONFIG_SLUB_STATS
 9413static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
 9414{
 9415	unsigned long sum  = 0;
 9416	int cpu;
 9417	int len = 0;
 9418	int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
 9419
 9420	if (!data)
 9421		return -ENOMEM;
 9422
 9423	for_each_online_cpu(cpu) {
 9424		unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
 9425
 9426		data[cpu] = x;
 9427		sum += x;
 9428	}
 9429
 9430	len += sysfs_emit_at(buf, len, "%lu", sum);
 9431
 9432#ifdef CONFIG_SMP
 9433	for_each_online_cpu(cpu) {
 9434		if (data[cpu])
 9435			len += sysfs_emit_at(buf, len, " C%d=%u",
 9436					     cpu, data[cpu]);
 9437	}
 9438#endif
 9439	kfree(data);
 9440	len += sysfs_emit_at(buf, len, "\n");
 9441
 9442	return len;
 9443}
 9444
 9445static void clear_stat(struct kmem_cache *s, enum stat_item si)
 9446{
 9447	int cpu;
 9448
 9449	for_each_online_cpu(cpu)
 9450		per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
 9451}
 9452
 9453#define STAT_ATTR(si, text) 					\
 9454static ssize_t text##_show(struct kmem_cache *s, char *buf)	\
 9455{								\
 9456	return show_stat(s, buf, si);				\
 9457}								\
 9458static ssize_t text##_store(struct kmem_cache *s,		\
 9459				const char *buf, size_t length)	\
 9460{								\
 9461	if (buf[0] != '0')					\
 9462		return -EINVAL;					\
 9463	clear_stat(s, si);					\
 9464	return length;						\
 9465}								\
 9466SLAB_ATTR(text);						\
 9467
/*
 * One STAT_ATTR() instantiation per stat counter. Each line generates the
 * <name>_show()/<name>_store() handlers for the given stat item together
 * with the <name>_attr object that slab_attrs[] references.
 */
STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf);
STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
STAT_ATTR(FREE_PCS, free_cpu_sheaf);
STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf);
STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail);
STAT_ATTR(FREE_FASTPATH, free_fastpath);
STAT_ATTR(FREE_SLOWPATH, free_slowpath);
STAT_ATTR(FREE_FROZEN, free_frozen);
STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
STAT_ATTR(ALLOC_SLAB, alloc_slab);
STAT_ATTR(ALLOC_REFILL, alloc_refill);
STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
STAT_ATTR(FREE_SLAB, free_slab);
STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
STAT_ATTR(SHEAF_FLUSH, sheaf_flush);
STAT_ATTR(SHEAF_REFILL, sheaf_refill);
STAT_ATTR(SHEAF_ALLOC, sheaf_alloc);
STAT_ATTR(SHEAF_FREE, sheaf_free);
STAT_ATTR(BARN_GET, barn_get);
STAT_ATTR(BARN_GET_FAIL, barn_get_fail);
STAT_ATTR(BARN_PUT, barn_put);
STAT_ATTR(BARN_PUT_FAIL, barn_put_fail);
STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast);
STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow);
STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize);
STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast);
STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow);
 9511#endif	/* CONFIG_SLUB_STATS */
 9512
 9513#ifdef CONFIG_KFENCE
 9514static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf)
 9515{
 9516	return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE));
 9517}
 9518
 9519static ssize_t skip_kfence_store(struct kmem_cache *s,
 9520			const char *buf, size_t length)
 9521{
 9522	int ret = length;
 9523
 9524	if (buf[0] == '0')
 9525		s->flags &= ~SLAB_SKIP_KFENCE;
 9526	else if (buf[0] == '1')
 9527		s->flags |= SLAB_SKIP_KFENCE;
 9528	else
 9529		ret = -EINVAL;
 9530
 9531	return ret;
 9532}
 9533SLAB_ATTR(skip_kfence);
 9534#endif
 9535
/*
 * NULL-terminated list of the attributes exposed for each cache; it is
 * the .attrs member of slab_attr_group. Entries inside an #ifdef are only
 * present when the corresponding config option is enabled, matching the
 * conditional definition of the attribute itself.
 */
static struct attribute *slab_attrs[] = {
	&slab_size_attr.attr,
	&object_size_attr.attr,
	&objs_per_slab_attr.attr,
	&order_attr.attr,
	&sheaf_capacity_attr.attr,
	&min_partial_attr.attr,
	&cpu_partial_attr.attr,
	&objects_partial_attr.attr,
	&partial_attr.attr,
	&cpu_slabs_attr.attr,
	&ctor_attr.attr,
	&aliases_attr.attr,
	&align_attr.attr,
	&hwcache_align_attr.attr,
	&reclaim_account_attr.attr,
	&destroy_by_rcu_attr.attr,
	&shrink_attr.attr,
	&slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
	&total_objects_attr.attr,
	&objects_attr.attr,
	&slabs_attr.attr,
	&sanity_checks_attr.attr,
	&trace_attr.attr,
	&red_zone_attr.attr,
	&poison_attr.attr,
	&store_user_attr.attr,
	&validate_attr.attr,
#endif
#ifdef CONFIG_ZONE_DMA
	&cache_dma_attr.attr,
#endif
#ifdef CONFIG_NUMA
	&remote_node_defrag_ratio_attr.attr,
#endif
#ifdef CONFIG_SLUB_STATS
	/* Statistics counters generated by STAT_ATTR(). */
	&alloc_cpu_sheaf_attr.attr,
	&alloc_fastpath_attr.attr,
	&alloc_slowpath_attr.attr,
	&free_cpu_sheaf_attr.attr,
	&free_rcu_sheaf_attr.attr,
	&free_rcu_sheaf_fail_attr.attr,
	&free_fastpath_attr.attr,
	&free_slowpath_attr.attr,
	&free_frozen_attr.attr,
	&free_add_partial_attr.attr,
	&free_remove_partial_attr.attr,
	&alloc_from_partial_attr.attr,
	&alloc_slab_attr.attr,
	&alloc_refill_attr.attr,
	&alloc_node_mismatch_attr.attr,
	&free_slab_attr.attr,
	&cpuslab_flush_attr.attr,
	&deactivate_full_attr.attr,
	&deactivate_empty_attr.attr,
	&deactivate_to_head_attr.attr,
	&deactivate_to_tail_attr.attr,
	&deactivate_remote_frees_attr.attr,
	&deactivate_bypass_attr.attr,
	&order_fallback_attr.attr,
	&cmpxchg_double_fail_attr.attr,
	&cmpxchg_double_cpu_fail_attr.attr,
	&cpu_partial_alloc_attr.attr,
	&cpu_partial_free_attr.attr,
	&cpu_partial_node_attr.attr,
	&cpu_partial_drain_attr.attr,
	&sheaf_flush_attr.attr,
	&sheaf_refill_attr.attr,
	&sheaf_alloc_attr.attr,
	&sheaf_free_attr.attr,
	&barn_get_attr.attr,
	&barn_get_fail_attr.attr,
	&barn_put_attr.attr,
	&barn_put_fail_attr.attr,
	&sheaf_prefill_fast_attr.attr,
	&sheaf_prefill_slow_attr.attr,
	&sheaf_prefill_oversize_attr.attr,
	&sheaf_return_fast_attr.attr,
	&sheaf_return_slow_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
	&failslab_attr.attr,
#endif
#ifdef CONFIG_HARDENED_USERCOPY
	&usersize_attr.attr,
#endif
#ifdef CONFIG_KFENCE
	&skip_kfence_attr.attr,
#endif

	NULL
};
 9629
/* Attribute group created on each cache's kobject by sysfs_slab_add(). */
static const struct attribute_group slab_attr_group = {
	.attrs = slab_attrs,
};
 9633
 9634static ssize_t slab_attr_show(struct kobject *kobj,
 9635				struct attribute *attr,
 9636				char *buf)
 9637{
 9638	struct slab_attribute *attribute;
 9639	struct kmem_cache *s;
 9640
 9641	attribute = to_slab_attr(attr);
 9642	s = to_slab(kobj);
 9643
 9644	if (!attribute->show)
 9645		return -EIO;
 9646
 9647	return attribute->show(s, buf);
 9648}
 9649
 9650static ssize_t slab_attr_store(struct kobject *kobj,
 9651				struct attribute *attr,
 9652				const char *buf, size_t len)
 9653{
 9654	struct slab_attribute *attribute;
 9655	struct kmem_cache *s;
 9656
 9657	attribute = to_slab_attr(attr);
 9658	s = to_slab(kobj);
 9659
 9660	if (!attribute->store)
 9661		return -EIO;
 9662
 9663	return attribute->store(s, buf, len);
 9664}
 9665
 9666static void kmem_cache_release(struct kobject *k)
 9667{
 9668	slab_kmem_cache_release(to_slab(k));
 9669}
 9670
/* Read/write entry points shared by every slab attribute (see slab_ktype). */
static const struct sysfs_ops slab_sysfs_ops = {
	.show = slab_attr_show,
	.store = slab_attr_store,
};
 9675
/* kobject type passed to kobject_init_and_add() in sysfs_slab_add(). */
static const struct kobj_type slab_ktype = {
	.sysfs_ops = &slab_sysfs_ops,
	.release = kmem_cache_release,
};
 9680
 9681static struct kset *slab_kset;
 9682
/* Return the kset for cache @s; this is always slab_kset and @s is not used. */
static inline struct kset *cache_kset(struct kmem_cache *s)
{
	return slab_kset;
}
 9687
 9688#define ID_STR_LENGTH 32
 9689
 9690/* Create a unique string id for a slab cache:
 9691 *
 9692 * Format	:[flags-]size
 9693 */
 9694static char *create_unique_id(struct kmem_cache *s)
 9695{
 9696	char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
 9697	char *p = name;
 9698
 9699	if (!name)
 9700		return ERR_PTR(-ENOMEM);
 9701
 9702	*p++ = ':';
 9703	/*
 9704	 * First flags affecting slabcache operations. We will only
 9705	 * get here for aliasable slabs so we do not need to support
 9706	 * too many flags. The flags here must cover all flags that
 9707	 * are matched during merging to guarantee that the id is
 9708	 * unique.
 9709	 */
 9710	if (s->flags & SLAB_CACHE_DMA)
 9711		*p++ = 'd';
 9712	if (s->flags & SLAB_CACHE_DMA32)
 9713		*p++ = 'D';
 9714	if (s->flags & SLAB_RECLAIM_ACCOUNT)
 9715		*p++ = 'a';
 9716	if (s->flags & SLAB_CONSISTENCY_CHECKS)
 9717		*p++ = 'F';
 9718	if (s->flags & SLAB_ACCOUNT)
 9719		*p++ = 'A';
 9720	if (p != name + 1)
 9721		*p++ = '-';
 9722	p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size);
 9723
 9724	if (WARN_ON(p > name + ID_STR_LENGTH - 1)) {
 9725		kfree(name);
 9726		return ERR_PTR(-EINVAL);
 9727	}
 9728	kmsan_unpoison_memory(name, p - name);
 9729	return name;
 9730}
 9731
 9732static int sysfs_slab_add(struct kmem_cache *s)
 9733{
 9734	int err;
 9735	const char *name;
 9736	struct kset *kset = cache_kset(s);
 9737	int unmergeable = slab_unmergeable(s);
 9738
 9739	if (!unmergeable && disable_higher_order_debug &&
 9740			(slub_debug & DEBUG_METADATA_FLAGS))
 9741		unmergeable = 1;
 9742
 9743	if (unmergeable) {
 9744		/*
 9745		 * Slabcache can never be merged so we can use the name proper.
 9746		 * This is typically the case for debug situations. In that
 9747		 * case we can catch duplicate names easily.
 9748		 */
 9749		sysfs_remove_link(&slab_kset->kobj, s->name);
 9750		name = s->name;
 9751	} else {
 9752		/*
 9753		 * Create a unique name for the slab as a target
 9754		 * for the symlinks.
 9755		 */
 9756		name = create_unique_id(s);
 9757		if (IS_ERR(name))
 9758			return PTR_ERR(name);
 9759	}
 9760
 9761	s->kobj.kset = kset;
 9762	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
 9763	if (err)
 9764		goto out;
 9765
 9766	err = sysfs_create_group(&s->kobj, &slab_attr_group);
 9767	if (err)
 9768		goto out_del_kobj;
 9769
 9770	if (!unmergeable) {
 9771		/* Setup first alias */
 9772		sysfs_slab_alias(s, s->name);
 9773	}
 9774out:
 9775	if (!unmergeable)
 9776		kfree(name);
 9777	return err;
 9778out_del_kobj:
 9779	kobject_del(&s->kobj);
 9780	goto out;
 9781}
 9782
 9783void sysfs_slab_unlink(struct kmem_cache *s)
 9784{
 9785	if (s->kobj.state_in_sysfs)
 9786		kobject_del(&s->kobj);
 9787}
 9788
/* Drop a reference on the cache's kobject via kobject_put(). */
void sysfs_slab_release(struct kmem_cache *s)
{
	kobject_put(&s->kobj);
}
 9793
/*
 * Need to buffer aliases during bootup until sysfs becomes
 * available lest we lose that information.
 *
 * Entries are pushed by sysfs_slab_alias() while slab_state is not FULL
 * and are replayed and freed by slab_sysfs_init().
 */
struct saved_alias {
	struct kmem_cache *s;		/* cache the alias points to */
	const char *name;		/* alias name; the pointer is stored, not copied */
	struct saved_alias *next;	/* next entry in the singly linked list */
};

/* Head of the list of buffered aliases; new entries are inserted at the front. */
static struct saved_alias *alias_list;
 9805
 9806static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
 9807{
 9808	struct saved_alias *al;
 9809
 9810	if (slab_state == FULL) {
 9811		/*
 9812		 * If we have a leftover link then remove it.
 9813		 */
 9814		sysfs_remove_link(&slab_kset->kobj, name);
 9815		/*
 9816		 * The original cache may have failed to generate sysfs file.
 9817		 * In that case, sysfs_create_link() returns -ENOENT and
 9818		 * symbolic link creation is skipped.
 9819		 */
 9820		return sysfs_create_link(&slab_kset->kobj, &s->kobj, name);
 9821	}
 9822
 9823	al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL);
 9824	if (!al)
 9825		return -ENOMEM;
 9826
 9827	al->s = s;
 9828	al->name = name;
 9829	al->next = alias_list;
 9830	alias_list = al;
 9831	kmsan_unpoison_memory(al, sizeof(*al));
 9832	return 0;
 9833}
 9834
 9835static int __init slab_sysfs_init(void)
 9836{
 9837	struct kmem_cache *s;
 9838	int err;
 9839
 9840	mutex_lock(&slab_mutex);
 9841
 9842	slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
 9843	if (!slab_kset) {
 9844		mutex_unlock(&slab_mutex);
 9845		pr_err("Cannot register slab subsystem.\n");
 9846		return -ENOMEM;
 9847	}
 9848
 9849	slab_state = FULL;
 9850
 9851	list_for_each_entry(s, &slab_caches, list) {
 9852		err = sysfs_slab_add(s);
 9853		if (err)
 9854			pr_err("SLUB: Unable to add boot slab %s to sysfs\n",
 9855			       s->name);
 9856	}
 9857
 9858	while (alias_list) {
 9859		struct saved_alias *al = alias_list;
 9860
 9861		alias_list = alias_list->next;
 9862		err = sysfs_slab_alias(al->s, al->name);
 9863		if (err)
 9864			pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n",
 9865			       al->name);
 9866		kfree(al);
 9867	}
 9868
 9869	mutex_unlock(&slab_mutex);
 9870	return 0;
 9871}
 9872late_initcall(slab_sysfs_init);
 9873#endif /* SLAB_SUPPORTS_SYSFS */
 9874
 9875#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS)
/*
 * seq_file ->show callback for the per-cache trace files.
 *
 * Prints the location record at index t->idx of the loc_track stored in
 * seq->private, provided that index is below t->count. When both the
 * index and t->count are zero, "No data" is printed instead.
 * Always returns 0.
 */
static int slab_debugfs_show(struct seq_file *seq, void *v)
{
	struct loc_track *t = seq->private;
	struct location *l;
	unsigned long idx;

	idx = (unsigned long) t->idx;
	if (idx < t->count) {
		l = &t->loc[idx];

		seq_printf(seq, "%7ld ", l->count);

		/* Symbolised address, or a placeholder when none was recorded. */
		if (l->addr)
			seq_printf(seq, "%pS", (void *)l->addr);
		else
			seq_puts(seq, "<not-available>");

		/* Printed as (count * waste)/waste. */
		if (l->waste)
			seq_printf(seq, " waste=%lu/%lu",
				l->count * l->waste, l->waste);

		/*
		 * min/average/max, with the average computed as
		 * sum_time / count; a single value when sum_time
		 * equals min_time.
		 */
		if (l->sum_time != l->min_time) {
			seq_printf(seq, " age=%ld/%llu/%ld",
				l->min_time, div_u64(l->sum_time, l->count),
				l->max_time);
		} else
			seq_printf(seq, " age=%ld", l->min_time);

		/* A pid range, collapsed to one value when min and max agree. */
		if (l->min_pid != l->max_pid)
			seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid);
		else
			seq_printf(seq, " pid=%ld",
				l->min_pid);

		/* CPU and node lists are skipped with only one CPU/node online. */
		if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)))
			seq_printf(seq, " cpus=%*pbl",
				 cpumask_pr_args(to_cpumask(l->cpus)));

		if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
			seq_printf(seq, " nodes=%*pbl",
				 nodemask_pr_args(&l->nodes));

#ifdef CONFIG_STACKDEPOT
		{
			depot_stack_handle_t handle;
			unsigned long *entries;
			unsigned int nr_entries, j;

			/* Print the saved stack, one indented frame per line. */
			handle = READ_ONCE(l->handle);
			if (handle) {
				nr_entries = stack_depot_fetch(handle, &entries);
				seq_puts(seq, "\n");
				for (j = 0; j < nr_entries; j++)
					seq_printf(seq, "        %pS\n", (void *)entries[j]);
			}
		}
#endif
		seq_puts(seq, "\n");
	}

	if (!idx && !t->count)
		seq_puts(seq, "No data\n");

	return 0;
}
 9941
/* seq_file ->stop callback: intentionally empty, there is nothing to undo. */
static void slab_debugfs_stop(struct seq_file *seq, void *v)
{
}
 9945
 9946static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
 9947{
 9948	struct loc_track *t = seq->private;
 9949
 9950	t->idx = ++(*ppos);
 9951	if (*ppos <= t->count)
 9952		return ppos;
 9953
 9954	return NULL;
 9955}
 9956
 9957static int cmp_loc_by_count(const void *a, const void *b)
 9958{
 9959	struct location *loc1 = (struct location *)a;
 9960	struct location *loc2 = (struct location *)b;
 9961
 9962	return cmp_int(loc2->count, loc1->count);
 9963}
 9964
/*
 * seq_file ->start callback: copy the current position *@ppos into
 * t->idx and return @ppos as the iterator token. No bounds check is done
 * here; slab_debugfs_show() compares t->idx against t->count itself.
 */
static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
{
	struct loc_track *t = seq->private;

	t->idx = *ppos;
	return ppos;
}
 9972
/* seq_file iterator installed by slab_debug_trace_open(). */
static const struct seq_operations slab_debugfs_sops = {
	.start  = slab_debugfs_start,
	.next   = slab_debugfs_next,
	.stop   = slab_debugfs_stop,
	.show   = slab_debugfs_show,
};
 9979
 9980static int slab_debug_trace_open(struct inode *inode, struct file *filep)
 9981{
 9982
 9983	struct kmem_cache_node *n;
 9984	enum track_item alloc;
 9985	int node;
 9986	struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops,
 9987						sizeof(struct loc_track));
 9988	struct kmem_cache *s = file_inode(filep)->i_private;
 9989	unsigned long *obj_map;
 9990
 9991	if (!t)
 9992		return -ENOMEM;
 9993
 9994	obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
 9995	if (!obj_map) {
 9996		seq_release_private(inode, filep);
 9997		return -ENOMEM;
 9998	}
 9999
10000	alloc = debugfs_get_aux_num(filep);
10001
10002	if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
10003		bitmap_free(obj_map);
10004		seq_release_private(inode, filep);
10005		return -ENOMEM;
10006	}
10007
10008	for_each_kmem_cache_node(s, node, n) {
10009		unsigned long flags;
10010		struct slab *slab;
10011
10012		if (!node_nr_slabs(n))
10013			continue;
10014
10015		spin_lock_irqsave(&n->list_lock, flags);
10016		list_for_each_entry(slab, &n->partial, slab_list)
10017			process_slab(t, s, slab, alloc, obj_map);
10018		list_for_each_entry(slab, &n->full, slab_list)
10019			process_slab(t, s, slab, alloc, obj_map);
10020		spin_unlock_irqrestore(&n->list_lock, flags);
10021	}
10022
10023	/* Sort locations by count */
10024	sort(t->loc, t->count, sizeof(struct location),
10025	     cmp_loc_by_count, NULL);
10026
10027	bitmap_free(obj_map);
10028	return 0;
10029}
10030
10031static int slab_debug_trace_release(struct inode *inode, struct file *file)
10032{
10033	struct seq_file *seq = file->private_data;
10034	struct loc_track *t = seq->private;
10035
10036	free_loc_track(t);
10037	return seq_release_private(inode, file);
10038}
10039
/* File operations of the "alloc_traces" and "free_traces" debugfs files. */
static const struct file_operations slab_debugfs_fops = {
	.open    = slab_debug_trace_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = slab_debug_trace_release,
};
10046
10047static void debugfs_slab_add(struct kmem_cache *s)
10048{
10049	struct dentry *slab_cache_dir;
10050
10051	if (unlikely(!slab_debugfs_root))
10052		return;
10053
10054	slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root);
10055
10056	debugfs_create_file_aux_num("alloc_traces", 0400, slab_cache_dir, s,
10057					TRACK_ALLOC, &slab_debugfs_fops);
10058
10059	debugfs_create_file_aux_num("free_traces", 0400, slab_cache_dir, s,
10060					TRACK_FREE, &slab_debugfs_fops);
10061}
10062
/* Look up the entry named after cache @s under slab_debugfs_root and remove it. */
void debugfs_slab_release(struct kmem_cache *s)
{
	debugfs_lookup_and_remove(s->name, slab_debugfs_root);
}
10067
10068static int __init slab_debugfs_init(void)
10069{
10070	struct kmem_cache *s;
10071
10072	slab_debugfs_root = debugfs_create_dir("slab", NULL);
10073
10074	list_for_each_entry(s, &slab_caches, list)
10075		if (s->flags & SLAB_STORE_USER)
10076			debugfs_slab_add(s);
10077
10078	return 0;
10079
10080}
10081__initcall(slab_debugfs_init);
10082#endif
10083/*
10084 * The /proc/slabinfo ABI
10085 */
10086#ifdef CONFIG_SLUB_DEBUG
10087void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
10088{
10089	unsigned long nr_slabs = 0;
10090	unsigned long nr_objs = 0;
10091	unsigned long nr_free = 0;
10092	int node;
10093	struct kmem_cache_node *n;
10094
10095	for_each_kmem_cache_node(s, node, n) {
10096		nr_slabs += node_nr_slabs(n);
10097		nr_objs += node_nr_objs(n);
10098		nr_free += count_partial_free_approx(n);
10099	}
10100
10101	sinfo->active_objs = nr_objs - nr_free;
10102	sinfo->num_objs = nr_objs;
10103	sinfo->active_slabs = nr_slabs;
10104	sinfo->num_slabs = nr_slabs;
10105	sinfo->objects_per_slab = oo_objects(s->oo);
10106	sinfo->cache_order = oo_order(s->oo);
10107}
10108#endif /* CONFIG_SLUB_DEBUG */